diff --git a/.gitignore b/.gitignore index 82951579..8fd8aa8d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.pre-commit-config.yaml .DS_Store # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/bench/qc_simulation/.gitignore b/bench/qc_simulation/.gitignore new file mode 100644 index 00000000..8fce6030 --- /dev/null +++ b/bench/qc_simulation/.gitignore @@ -0,0 +1 @@ +data/ diff --git a/bench/qc_simulation/README.md b/bench/qc_simulation/README.md new file mode 100644 index 00000000..b7616296 --- /dev/null +++ b/bench/qc_simulation/README.md @@ -0,0 +1,67 @@ + +## Examples + +1. generate or download circuits: + +* As tar `./main.py echo github://danlkv:GRCS@/inst/bristlecone/cz_v2/bris_11.tar.gz data/circuits/bris11/\{in_file\}.circ` (need to unzip) +* Using http and [unzip on the fly](./scripts/http_unzip_on_the_fly.sh) +* generate `./main.py generate data/circuits/qaoa/maxcut_regular_N{N}_p{p} --type=qaoa_maxcut --N=8,12,16,24,32,48,64 --p=1,2,3,4,5 --d=3` + +2. preprocess using both of `greedy` and `rgreedy` algorithms: +`./main.py preprocess data/circuits/qaoa/maxcut_regular\* data/preprocess/maxcut/\{in_file\}_oalgo{O}.circ --O=greedy,rgreedy --sim=qtensor +` +3. Simulate: `./main.py simulate ./data/preprocess/maxcut/maxcut_regular\* data/simulations/maxcut/{in_file}_comp_m{M} --sim qtensor -M 25 --backend=cupy --compress=szx` + +### Easily manage simulation and estimation results + +After running preprocess, one can estimate runtime and compare that to actual time to simulate +```bash +# Assume 1GFlop (low-end cpu number) +./main.py estimate preprocess/bris/bris_\*.txt_oalgogreedy.circ estimations/bris/cpu --sim qtensor -M 27 -F 1e9 +./main.py estimate preprocess/bris/bris_\*.txt_oalgorgreedy.circ estimations/bris/cpu --sim qtensor -M 27 -F 1e9 + +rm -r simulations/bris/* +# Simulate Greedy +./main.py simulate preprocess/bris/bris_\*.txt_oalgogreedy.circ simulations/bris --sim qtensor -M 27 +# Simulate RGreedy +./main.py simulate preprocess/bris/bris_\*.txt_oalgorgreedy.circ simulations/bris --sim qtensor -M 27 +cat simulations/bris/*rgreedy* +cat estimations/bris/cpu/*rgreedy* +cat simulations/bris/*greedy* +cat estimations/bris/cpu/*greedy* +``` + +This shows how UNIX utilities are used to filter and present data. In SQL this would be something like +`SELECT * FROM simulations WHERE ordering_algo="greedy"`. + +## Filetypes + +- `.txt` - gate sequence as in GRCS +- `.qasm` - openqasm file +- `.jsonterms` - json file of QAOA terms (`src/circuit_gen/qaoa.py`) + +## Advanced usage + +It is possible to glob over inputs and vectorize over outputs +The globbing is possible over remote files + +``` +main.py process \ + gh://example.com/data/*/*.element \ + results/{X}/{in_file}_y{y}.r \ + -X=1,2 --Y=foo,bar +``` + +The parent directory for each out file will be created automatically + + +## Analysis + +Simple simulation analysis script: `analysis/compression_scaling_analysis.py`. 
+Accepts a glob pattern for simulation output files + +Usage: + +``` +python analysis/compression_scaling_analysis.py ./data/simulations/maxcut/file\* +`` diff --git a/bench/qc_simulation/analysis/compression_scaling_analysis.py b/bench/qc_simulation/analysis/compression_scaling_analysis.py new file mode 100644 index 00000000..1e246c2b --- /dev/null +++ b/bench/qc_simulation/analysis/compression_scaling_analysis.py @@ -0,0 +1,46 @@ +import glob +import pandas as pd +import json +import numpy as np +import sys + +def fmt_unit(x, unit): + return str(np.round(x, 2)) + " " + unit + +def main(): + glob_pat = sys.argv[1] + filenames = glob.glob(glob_pat) + filenames = sorted(filenames) + + for file in filenames: + data = json.load(open(file)) + stats = {} + for atr in ["compress", "decompress"]: + items = data["compression"][atr] + if len(items)==0: + continue + df = pd.DataFrame(items) + df["CR"] = df["size_in"]/df["size_out"] + df["T"] = df["size_in"]/df["time"] + stats["mean " + atr+" CR"] = df["CR"].mean() + stats["mean " + atr+" Throughput"] = fmt_unit(df["T"].mean( )/1e9, "GB/s") + stats[atr+" Count"] = len(df) + + _res = data["result"] + stats["result"] = (_res["Re"] , _res["Im"]) + stats["Time"] = fmt_unit(data["time"],'s') + stats["Memory"] = str(data["memory"]/1024/1024) + " MB" + if data.get('nvmemory'): + stats["NVMemory"] = str(data["nvmemory"]/1024/1024) + " MB" + print(file) + _prefix = " " + last = lambda x: x==len(stats.items())-1 + char = lambda i: "⎬ " if not last(i) else "┕ " + print("\n".join([ + _prefix+char(i) + " = ".join(map(str, items)) + for i, items in enumerate(stats.items()) + ])) + + +if __name__=="__main__": + main() diff --git a/bench/qc_simulation/analysis/simple_compression_report.py b/bench/qc_simulation/analysis/simple_compression_report.py new file mode 100644 index 00000000..864574c5 --- /dev/null +++ b/bench/qc_simulation/analysis/simple_compression_report.py @@ -0,0 +1,36 @@ +import pandas as pd +import json +import sys + +def main(): + file = sys.argv[1] + data = json.load(open(file)) + rows = [] + for item in data['compression']['compress']: + k = item.copy() + k['type']='compress' + rows.append(k) + + for item in data['compression']['decompress']: + k = item.copy() + k['type']='decompress' + rows.append(k) + + if len(rows) == 0: + print("Rows:\n", rows) + return + df = pd.DataFrame(rows) + dfc = df[df['type'] == 'compress'] + dfd = df[df['type'] == 'decompress'] + + for d in [dfc, dfd]: + d['Throughput'] = d['size_in'] / d['time'] + d['CR'] = d['size_in'] / d['size_out'] + + print("Compression:") + print(dfc.describe([0.5])) + print("Decompression:") + print(dfd.describe([0.5])) + +if __name__=="__main__": + main() diff --git a/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N256_p1.jsonterms_Otamaki_120_M30 b/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N256_p1.jsonterms_Otamaki_120_M30 new file mode 100644 index 00000000..7485593d Binary files /dev/null and b/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N256_p1.jsonterms_Otamaki_120_M30 differ diff --git a/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N52_p3.jsonterms_Otamaki_3_M30 b/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N52_p3.jsonterms_Otamaki_3_M30 new file mode 100644 index 00000000..71f0e809 Binary files /dev/null and b/bench/qc_simulation/data/preprocess/qaoa_maxcut/3reg_N52_p3.jsonterms_Otamaki_3_M30 differ diff --git a/bench/qc_simulation/data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30 
b/bench/qc_simulation/data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30 new file mode 100644 index 00000000..82da261a Binary files /dev/null and b/bench/qc_simulation/data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30 differ diff --git a/bench/qc_simulation/main.py b/bench/qc_simulation/main.py new file mode 100755 index 00000000..7203b4fe --- /dev/null +++ b/bench/qc_simulation/main.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +import sys +from pathlib import Path +from functools import wraps +import fire +def log(*args): + print(f"[main.py] ", *args, file=sys.stderr, flush=True) + +# -- Utils + +import pandas as pd +import fsspec +import itertools +from dataclasses import dataclass +import io + +@dataclass +class File: + path: Path + f: io.IOBase + +def general_glob(urlpath, **kwargs): + """General glob function to handle local and remote paths.""" + filelist = fsspec.open_files(urlpath, **kwargs) + for file in filelist: + yield file + +def is_sequence(x): + if isinstance(x, str): + return False + try: + iter(x) + return True + except TypeError: + return False + +def dict_vector_iter(**d): + """ + For each value that is a list in dict d, iterate over all possible + combinations of values. + """ + keys = d.keys() + vals = d.values() + vector_keys = [k for k, v in zip(keys, vals) if is_sequence(v)] + vector_vals = [v for v in vals if is_sequence(v)] + for instance in itertools.product(*vector_vals): + p = dict(d) + p.update(zip(vector_keys, instance)) + yield p + +def general_indexed(in_path, out_path, func, fsspec_kwargs={}, **kwargs): + """ + Arguments: + in_path: a glob-like urlpath to pass to fsspec.open_files + out_path: a string to store the output into. Optionally, + can provide formatting arguments + If no formatting arguments provided, will be treated as a directory, + I.E `/{in_file}` + otherwise, will be treated as a file, I.E. `.format(**kwargs)` + For many input files, the {in_file} argument will be provided. + This will be passed as the second argument to the function + func: a function that takes two arguments, the first being the input + file object, and the second being the output file. 
+ fsspec_kwargs: kwargs to pass to fsspec.open_files + """ + # If no formatting arguments provided, treat as directory + if "{" not in out_path: + out_pattern = f"{out_path}/{{in_file}}" + else: + out_pattern = out_path + + def unit(kwargs): + in_file = kwargs.pop("in_file") + in_path = Path(in_file.path) + out_file = out_pattern.format( + in_path=in_path, + in_file=in_path.name, + **kwargs) + out_path = Path(out_file) + # make parent dir + out_path.parent.mkdir(parents=True, exist_ok=True) + with in_file.open() as f: + fl = File(in_path, f) + changed_out = func(fl, out_file, **kwargs) + + log(f"{in_file.path} -> [{func.__name__}] -> {changed_out}") + index_file = Path(changed_out).parent / "index.csv" + update_index(index_file, input=in_file.path, output=changed_out, **kwargs) + return changed_out + + + in_path = in_path.format(**kwargs) + files = iter(general_glob(in_path, **fsspec_kwargs)) + combinations = iter(dict_vector_iter(in_file=files, **kwargs)) + return list(map(unit, combinations)) + +def update_index(index_file, **kwargs): + df = pd.DataFrame(kwargs, index=[0]) + # check if index file exists + if not (file := Path(index_file)).exists(): + # create directories if needed + file.parent.mkdir(parents=True, exist_ok=True) + + print("Creating index file") + df.to_csv(index_file, header=True, index=False) + else: + df_exist = pd.read_csv(index_file, nrows=2) + if isinstance(df_exist, pd.DataFrame): + if df_exist.columns.tolist() != df.columns.tolist(): + raise ValueError("Index file already exists but has different columns") + # append to csv + print(f"Appending to index file {index_file}") + df.to_csv(index_file, mode="a", header=False, index=False) +# -- + +from src.simulators.qtensor import preprocess as qtensor_preprocess +from src.simulators.qtensor import estimate as qtensor_estimate +from src.simulators.qtensor import simulate as qtensor_simulate +from src.simulators.qtensor_energy import simulate as qtensor_simulate_energy +from src.simulators.qtensor_energy import preprocess as qtensor_preprocess_energy +from src.circuit_gen.qaoa import generate_maxcut + +# -- Main +sim_preprocessors = { + 'qtensor': qtensor_preprocess, + 'qtensor_energy': qtensor_preprocess_energy +} + +sim_estimators = { + 'qtensor': qtensor_estimate +} + +sim_simulators = { + 'qtensor': qtensor_simulate, + 'qtensor_energy': qtensor_simulate_energy +} + +circ_generators = { + 'qaoa_maxcut': generate_maxcut +} +class Main: + + def echo(self, in_path, out_dir, **kwargs): + """ + Simple mapper that just echoes stuff + """ + @wraps(self.echo) + def unit(in_file, out_file, **kwargs): + with open(out_file, "wb") as f: + f.write(in_file.f.read()) + return out_file + general_indexed(in_path, out_dir, unit, **kwargs) + + def generate(self, out_dir, type, **kwargs): + @wraps(self.generate) + def unit(in_file, out_file, type, **kwargs): + circ_generators[type](out_file, **kwargs) + return out_file + general_indexed('/dev/null', out_dir, unit, type=type, **kwargs) + + def preprocess(self, in_path, out_dir, sim='qtensor', **kwargs): + @wraps(self.preprocess) + def unit(in_file, out_file, sim, **kwargs): + sim_preprocessors[sim](in_file, out_file, **kwargs) + return out_file + general_indexed(in_path, out_dir, unit, sim=sim, **kwargs) + + def estimate(self, in_path, out_dir, sim='qtensor', **kwargs): + """ + Estimate the parameters of a simulator + """ + @wraps(self.estimate) + def unit(in_file, out_file, sim, **kwargs): + sim_estimators[sim](in_file, out_file, **kwargs) + return out_file + general_indexed(in_path, 
out_dir, unit, sim=sim, **kwargs) + + if estimate.__doc__: + # Modify doc to include info about additional parameters + estimate.__doc__ += f"\n{qtensor_estimate.__doc__.replace('Arguments:', 'Additional:')}" + + def simulate(self, in_path, out_dir, sim='qtensor', **kwargs): + """ + Simulate the quantum circuit + """ + @wraps(self.simulate) + def unit(in_file, out_file, **kwargs): + sim_simulators[sim](in_file, out_file, **kwargs) + return out_file + general_indexed(in_path, out_dir, unit, sim=sim, **kwargs) + + +if __name__ == "__main__": + fire.core.Display = lambda lines, out: print(*lines, file=out) + fire.Fire(Main) diff --git a/bench/qc_simulation/qtensor/test_circuits.py b/bench/qc_simulation/qtensor/test_circuits.py deleted file mode 100644 index 884ca10c..00000000 --- a/bench/qc_simulation/qtensor/test_circuits.py +++ /dev/null @@ -1,20 +0,0 @@ -import qtensor -import numpy as np -import networkx as nx - -def get_qaoa_graph_params(n=10, p=2, d=3, type='random', seed=10): - if type == 'random': - G = nx.random_regular_graph(d, n, seed=seed) - elif type == 'grid2d': - G = nx.grid_2d_graph(n,n) - elif type == 'line': - G = nx.Graph() - G.add_edges_from(zip(range(n-1), range(1, n))) - gamma, beta = [np.pi/5]*p, [np.pi/2]*p - return G, gamma, beta - -def gen_qaoa_maxcut_circuit(n=10, p=2, d=3, type='random', seed=10): - G, gamma, beta = get_qaoa_graph_params(n, p, d, type, seed) - composer = qtensor.QtreeQAOAComposer(graph=G, gamma=gamma, beta=beta) - composer.ansatz_state() - return composer.circuit diff --git a/bench/qc_simulation/requirements.txt b/bench/qc_simulation/requirements.txt new file mode 100644 index 00000000..520efee0 --- /dev/null +++ b/bench/qc_simulation/requirements.txt @@ -0,0 +1,6 @@ +fire +fsspec +pandas +qiskit +aiohttp +cupy diff --git a/bench/qc_simulation/scripts/README.md b/bench/qc_simulation/scripts/README.md new file mode 100644 index 00000000..d452df23 --- /dev/null +++ b/bench/qc_simulation/scripts/README.md @@ -0,0 +1,36 @@ +# Scripts + +These are example and helper scripts + +## Examples + +### Download via http, unpack on the fly + +``` +╰─λ ./scripts/http_unzip_on_the_fly.sh +[main.py] bris_5_24_0.txt -> [echo] -> circuits/bris/bris_5_24_0.txt_dummy1.circ +[main.py] bris_5_24_0.txt -> [echo] -> circuits/bris/bris_5_24_0.txt_dummy2.circ +[main.py] bris_5_28_0.txt -> [echo] -> circuits/bris/bris_5_28_0.txt_dummy1.circ +[main.py] bris_5_28_0.txt -> [echo] -> circuits/bris/bris_5_28_0.txt_dummy2.circ +[main.py] bris_5_32_0.txt -> [echo] -> circuits/bris/bris_5_32_0.txt_dummy1.circ +[main.py] bris_5_32_0.txt -> [echo] -> circuits/bris/bris_5_32_0.txt_dummy2.circ +[main.py] bris_5_36_0.txt -> [echo] -> circuits/bris/bris_5_36_0.txt_dummy1.circ +[main.py] bris_5_36_0.txt -> [echo] -> circuits/bris/bris_5_36_0.txt_dummy2.circ +[main.py] bris_5_40_0.txt -> [echo] -> circuits/bris/bris_5_40_0.txt_dummy1.circ +[main.py] bris_5_40_0.txt -> [echo] -> circuits/bris/bris_5_40_0.txt_dummy2.circ +╰─λ tree circuits/ +circuits/ +└── bris + ├── bris_5_24_0.txt_dummy1.circ + ├── bris_5_24_0.txt_dummy2.circ + ├── bris_5_28_0.txt_dummy1.circ + ├── bris_5_28_0.txt_dummy2.circ + ├── bris_5_32_0.txt_dummy1.circ + ├── bris_5_32_0.txt_dummy2.circ + ├── bris_5_36_0.txt_dummy1.circ + ├── bris_5_36_0.txt_dummy2.circ + ├── bris_5_40_0.txt_dummy1.circ + └── bris_5_40_0.txt_dummy2.circ + +2 directories, 10 file +``` diff --git a/bench/qc_simulation/scripts/download_from_gh.sh b/bench/qc_simulation/scripts/download_from_gh.sh new file mode 100755 index 00000000..2d7e3c51 --- /dev/null 
+++ b/bench/qc_simulation/scripts/download_from_gh.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# +#./main.py echo tar://bris_5/bris*_24_0.txt::github://danlkv:GRCS@/inst/bristlecone/cz_v2/bris_5.tar.gz circuits/bris/\{in_file\}_dummy\{dummy\}.circ --dummy=1,2 +./main.py echo github://danlkv:GRCS@/inst/bristlecone/cz_v2/bris_11.tar.gz data/circuits/bris11/\{in_file\}.circ diff --git a/bench/qc_simulation/scripts/generate_qaoa_maxcut.sh b/bench/qc_simulation/scripts/generate_qaoa_maxcut.sh new file mode 100755 index 00000000..458b5237 --- /dev/null +++ b/bench/qc_simulation/scripts/generate_qaoa_maxcut.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# +./main.py generate data/circuits/qaoa/maxcut_regular_N{N}_p{p} --type=qaoa_maxcut --N=8,12,16,24,32,48,64 --p=1,2,3,4,5 --d=3 diff --git a/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh b/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh new file mode 100755 index 00000000..2c7a379e --- /dev/null +++ b/bench/qc_simulation/scripts/http_unzip_on_the_fly.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# +./main.py echo tar://*0.txt::https://github.com/danlkv/GRCS/raw/master/inst/bristlecone/cz_v2/bris_5.tar.gz data/circuits/bris/\{in_file\}.circ diff --git a/bench/qc_simulation/scripts/large_run.py b/bench/qc_simulation/scripts/large_run.py new file mode 100755 index 00000000..a94a0b87 --- /dev/null +++ b/bench/qc_simulation/scripts/large_run.py @@ -0,0 +1,3 @@ +#!/bin/bash +./main.py simulate ./data/preprocess/sc23/qaoa/3reg_N72_p3.jsonterms_Otamaki_30_M30 ./data/simulations/sc23/large/{in_file}_cM{M}_rE{r2r_threshold}.sim --sim qtensor -M 27 --backend=cupy --compress=szx --r2r_error=5e-4 --r2r_threshold=5e-4 --mpi + diff --git a/bench/qc_simulation/scripts/mpi_debug.sh b/bench/qc_simulation/scripts/mpi_debug.sh new file mode 100755 index 00000000..18684e8a --- /dev/null +++ b/bench/qc_simulation/scripts/mpi_debug.sh @@ -0,0 +1,3 @@ +#!/bin/bash +mpiexec -n 4 ./main.py simulate ./data/preprocess/mpi_debug/qaoa/3reg_N42_p4.jsonterms_Otamaki_8_M29 ./data/simulations/mpi_debug/{in_file}_cM{M}_rE{r2r_threshold}.sim --sim qtensor -M 27 --backend=cupy --compress=szx --r2r_error=5e-5 --r2r_threshold=5e-5 --mpi + diff --git a/bench/qc_simulation/scripts/polaris/entry.sh b/bench/qc_simulation/scripts/polaris/entry.sh new file mode 100755 index 00000000..cce730ae --- /dev/null +++ b/bench/qc_simulation/scripts/polaris/entry.sh @@ -0,0 +1,16 @@ +#!/bin/bash -l +# +echo "[entry.sh] JOB $PBS_JOBID Start. PARAM_P=$PARAM_P RANKS=$RANKS" +module load conda cray-mpich cudatoolkit-standalone +conda activate + +cd $PBS_O_WORKDIR +echo "[entry.sh] Current workdir $PWD" +echo "[entry.sh] Hostname: `hostname`" +echo "[entry.sh] Parameter p: $PARAM_P" +echo "[entry.sh] Ranks: $RANKS" +export CUDA_HOME=/soft/compilers/nvidia/Linux_x86_64/2022/cuda/11.0 +export PARAM_P + +time mpiexec -n $RANKS --ppn 4 ./scripts/large_run.py +echo "[entry.sh] JOB $PBS_JOBID Done." 
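The helper scripts above ultimately call `main.py`, where list-valued options such as `--dummy=1,2` are expanded by the `dict_vector_iter` helper into one output path per combination; that is what produces the `_dummy1`/`_dummy2` files in the scripts README. Below is a minimal standalone sketch of that expansion; the re-implementation and the example values are illustrative only, not part of the benchmark code:

```python
import itertools

# Illustrative re-implementation of main.py's dict_vector_iter():
# every sequence-valued kwarg is swept, scalar kwargs are kept as-is.
def dict_vector_iter(**d):
    is_seq = lambda x: not isinstance(x, str) and hasattr(x, "__iter__")
    vector_keys = [k for k, v in d.items() if is_seq(v)]
    vector_vals = [d[k] for k in vector_keys]
    for combo in itertools.product(*vector_vals):
        params = dict(d)
        params.update(zip(vector_keys, combo))
        yield params

# fire parses `--dummy=1,2` as the tuple (1, 2), so each input file yields
# two output paths, matching the tree shown in scripts/README.md:
for params in dict_vector_iter(in_file="bris_5_24_0.txt", dummy=(1, 2)):
    print("circuits/bris/{in_file}_dummy{dummy}.circ".format(**params))
```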
diff --git a/bench/qc_simulation/scripts/polaris/submit.sh b/bench/qc_simulation/scripts/polaris/submit.sh
new file mode 100755
index 00000000..550cebf4
--- /dev/null
+++ b/bench/qc_simulation/scripts/polaris/submit.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+
+NODES=2
+RANKS=$(( NODES * 4 ))
+QUEUE=debug-scaling
+WALLTIME=40:00
+
+qsub -l select=$NODES:system=polaris:ncpus=32:ngpus=4:gputype=A100,walltime=$WALLTIME,filesystems=home \
+    -q $QUEUE -AQTensor \
+    -v RANKS=$RANKS,PARAM_P=$PARAM_P \
+    -o job_out.output -e job_out.output \
+    ./scripts/polaris/entry.sh
+
+echo -e "===========\nNew job with NODES=$NODES, PARAM_P=$PARAM_P submitted.\n" >> job_out.output
+sleep 1.5
+tail -f job_out.output
+
diff --git a/bench/qc_simulation/scripts/preprocess_qtensor.sh b/bench/qc_simulation/scripts/preprocess_qtensor.sh
new file mode 100755
index 00000000..41a45b7d
--- /dev/null
+++ b/bench/qc_simulation/scripts/preprocess_qtensor.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+#
+./main.py preprocess tar://*0.txt::https://github.com/danlkv/GRCS/raw/master/inst/bristlecone/cz_v2/bris_5.tar.gz data/preprocess/bris/\{in_file\}_oalgo{O}.circ --O=greedy,rgreedy --sim=qtensor
diff --git a/bench/qc_simulation/scripts/simple_test.sh b/bench/qc_simulation/scripts/simple_test.sh
new file mode 100755
index 00000000..ac8409cb
--- /dev/null
+++ b/bench/qc_simulation/scripts/simple_test.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+#
+./main.py echo github://danlkv:GRCS@/inst/bristlecone/cz_v2/bris_9/*0.txt data/circuits/bris/\{in_file\}_dummy\{dummy\}.circ --dummy=1,2
diff --git a/bench/qc_simulation/src/__init__.py b/bench/qc_simulation/src/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/bench/qc_simulation/src/__init__.py
@@ -0,0 +1 @@
+
diff --git a/bench/qc_simulation/src/circuit_gen/__init__.py b/bench/qc_simulation/src/circuit_gen/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/bench/qc_simulation/src/circuit_gen/qaoa.py b/bench/qc_simulation/src/circuit_gen/qaoa.py
new file mode 100644
index 00000000..8dac5be6
--- /dev/null
+++ b/bench/qc_simulation/src/circuit_gen/qaoa.py
@@ -0,0 +1,82 @@
+import networkx
+import numpy as np
+from qtensor.tools import BETHE_QAOA_VALUES
+
+def generate_ibm_connectivity(arch):
+    """
+    Generate a connectivity graph from an IBM architecture
+
+    Args:
+        arch (str): one of ["eagle", "falcon"]
+    """
+    supported_archs = ["eagle", "falcon"]
+    if arch not in supported_archs:
+        raise ValueError("Architecture {} not supported".format(arch))
+
+    def coupling_map_from_provider(p_class):
+        p = p_class()
+        graph = p.coupling_map.graph.to_undirected()
+        elist = list(graph.edge_list())
+        G = networkx.from_edgelist(elist)
+        return G
+
+    if arch == "eagle":
+        # IBM quantum volume 64
+        from qiskit.providers.fake_provider import FakeWashingtonV2
+        return coupling_map_from_provider(FakeWashingtonV2)
+    if arch == "falcon":
+        # IBM quantum volume 64
+        from qiskit.providers.fake_provider import FakeCairoV2
+        return coupling_map_from_provider(FakeCairoV2)
+    else:
+        raise ValueError("IBM architecture {} not supported".format(arch))
+
+def save_terms_format(file, terms):
+    """
+    Save terms in a format that can be read by the qtensor simulator. Takes a
+    list of terms in format `(coeff, [qubits])` and saves it to a file.
+    """
+    import json
+    filename = file + '.jsonterms'
+    with open(filename, "w") as f:
+        json.dump(terms, f)
+    return filename
+
+def generate_graph(n, d, type="random"):
+    if type == "random":
+        return networkx.random_regular_graph(d, n)
+    elif type[:4] == "ibm_":
arch = type[4:] + return generate_ibm_connectivity(arch) + else: + raise ValueError("Unknown graph type {}".format(type)) + +def generate_maxcut(out_file, N, p, d, graph_type='random', seed=None, parameters='random'): + """ + Generate a random regular maxcut problem + + Args: + out_file (str): Path to output file + N (int): Number of nodes + p (int): Number of layers + d (int): Random regular graph degree + parameters (str): One of ["random", "fixed_angles"] + + Returns: + str: Path to output file + """ + G: networkx.Graph = generate_graph(N, d, graph_type) + terms = [] + for u, v in G.edges: + terms.append((1, (u, v))) + if parameters == "random": + gamma = np.random.uniform(0, 2 * np.pi, p) + beta = np.random.uniform(0, np.pi, p) + elif parameters == "fixed_angles": + gammabeta = np.array(BETHE_QAOA_VALUES[str(p)]['angles']) + gamma, beta = gammabeta[:p]*2, gammabeta[p:] + else: + raise ValueError("Unknown parameters type {}. Use one of ['random', 'fixed_angles']".format(parameters)) + pb = {"terms": terms, "gamma": gamma.tolist(), "beta": beta.tolist()} + + return save_terms_format(out_file, pb) diff --git a/bench/qc_simulation/src/simulators/qtensor.py b/bench/qc_simulation/src/simulators/qtensor.py new file mode 100644 index 00000000..4836d67e --- /dev/null +++ b/bench/qc_simulation/src/simulators/qtensor.py @@ -0,0 +1,351 @@ +import qtensor +import qtree +import numpy as np + +# -- QAOA generic parser + +class QAOAComposer(qtensor.DefaultQAOAComposer): + def __init__(self, N, terms, **kwargs): + self.n_qubits = N + # from ccomp (Can't call DefaultQAOA Composer since need graph) + self.builder = self._get_builder() + # gamma and beta + self.params = kwargs + # + self.terms = terms + self.qubit_map = {n: i for i, n in enumerate(range(N))} + + def cost_operator_circuit(self, gamma): + for factor, term in self.terms: + t_mapped = [self.qubit_map[i] for i in term] + self.append_Z_term(term, gamma) + + def append_Z_term(self, term, gamma): + if len(term) == 2: + self.apply_gate(self.operators.ZZ, term[0], term[1], alpha=2*gamma) + #self.apply_gate(qtensor.OpFactory.ZZFull, term[0], term[1], alpha=2*gamma) + elif len(term) == 4: + self.apply_gate(self.operators.Z4, *term, alpha=2*gamma) + else: + raise ValueError(f"Invalid QAOA term length: {len(term)}") + + def mixer_operator(self, beta): + qubits = self.qubit_map.values() + for qubit in qubits: + self.x_term(qubit, beta) + +def parse_qaoa(data): + import json + data = json.loads(data) + terms = data["terms"] + gamma = np.array(data["gamma"])/np.pi/2 + beta = np.array(data["beta"])/np.pi + N = len(set(sum([t[1] for t in terms], []))) + composer = QAOAComposer(N, terms, gamma=gamma, beta=beta) + composer.ansatz_state() + return composer.circuit +# -- + +def read_circ(circ_f, type=None): + + if type is None: + type = circ_f.path.name.split(".")[-1] + + print("Reading circuit of type", type) + if type == "jsonterms": + b = circ_f.f.read() + return parse_qaoa(b) + + elif type == "qasm": + from qiskit import QuantumCircuit + b = circ_f.f.read() + str = b.decode('utf-8') + + qiskit_circuit = QuantumCircuit.from_qasm_str(str) + return qtree.operators.from_qiskit_circuit(qiskit_circuit) + else: + b = circ_f.f.read() + str = b.decode('utf-8') + import io + f = io.StringIO(str) + N, circ = qtree.operators.read_circuit_stream(f) + return sum(circ, []) + +def read_preps(prep_f): + import pickle + return pickle.load(prep_f.f) + +def write_preps(peo, prep_f): + import pickle + pickle.dump(peo, open(prep_f, 'wb')) + +def write_json(data, out_file): + 
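+    # `data` must be JSON-serialisable; this writer is used for the estimate
+    # and simulation reports, while preprocess() results go through the
+    # pickle-based write_preps() above.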
import json + with open(out_file, 'w') as f: + json.dump(data, f) + # This newline plays nice when cat-ing multiple files + f.write('\n') + +def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): + """ + Arguments: + in_file: input file + out_file: output file + O: ordering algorithm + S: slicing algorithm + M: Memory limit for slicing + """ + circ = read_circ(in_file) + tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) + opt = qtensor.toolbox.get_ordering_algo(O) + if S: + # ignore argument type mismatch for pyright -- opt can be `Optimizer` + # pyright: reportGeneralTypeIssues=false + opt = qtensor.optimisation.TreeTrimSplitter( + tw_bias=0, max_tw=M, base_ordering=opt, + peo_after_slice_strategy=after_slice + ) + + peo, par_vars, _ = opt.optimize(tn) + # --dbg + import networkx as nx + graph = tn.get_line_graph() + ignore_vars = tn.bra_vars + tn.ket_vars + for pv in par_vars: + graph.remove_node(int(pv)) + components = list(nx.connected_components(graph)) + print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + print(f"peo size without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") + + print() + # -- + else: + peo, _ = opt.optimize(tn) + par_vars = [] + print("W", opt.treewidth) + # -- qtensor_estim + prep_data = (peo, par_vars, tn) + write_preps(prep_data, out_file) + + +def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, S=0, **kwargs): + """ + Arguments: + in_file: file with preprocessed data + out_file: file to write the results to + C: Compression ratio + M: Memory limit in log2(b/16) + F: assumed FLOPS + T: Throughput of compression + S: Offset of slice variables. If S=0, full slicing is used. 
If S=n last + n par_vars are omitted + """ + from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost + from dataclasses import asdict + import json + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + if S > 0: + par_vars = par_vars[:-S] + print("Offset par_vars", par_vars) + + tn.slice({i: slice(0, 1) for i in par_vars}) + peo = peo[:len(peo) - len(par_vars)] + costs: list[Cost] = compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) + totals: Cost = sum(costs[1:], costs[0]) + time = totals.time(F, T, T, M) + C = asdict(totals) + C['time'] = time*2**len(par_vars) + C['slices'] = 2**len(par_vars) + print("C", C) + out_file += ".json" + write_json(C, out_file) + return out_file + +def simulate(in_file, out_file, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + mpi=False, + **kwargs): + """ + Args: + in_file: file with preprocessed data + out_file: file to write the results to + backend: backend to use + compress: compression algorithm + M: memory threshold for compression + r2r_error: relative error for compression + r2r_threshold: relative threshold for compression + """ + import time + from qtensor.contraction_algos import bucket_elimination + from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor + from qtensor.compression.Compressor import WriteToDiskCompressor + import cupy + cupy.cuda.profiler.start() + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + # -- Prepare backend + backend = qtensor.contraction_backends.get_backend(backend) + if compress is not None: + if compress == 'szx': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'cusz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'torch': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'newsz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = NEWSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == "cuszp": + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZPCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'disk': + compressor = WriteToDiskCompressor(f'/grand/QTensor/compression/data/tensors_compressed_M{M}/') + compressor = qtensor.compression.ProfileCompressor(compressor) + else: + raise ValueError(f"Unknown compression algorithm: {compress}") + backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) + from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend + backend = MemProfBackend(backend) + + + if len(par_vars) > 0: + print("Parvars", par_vars) + print(f"Detected {len(par_vars)} slice variables") + + # -- simulate + start = time.time() + sim_result = _simulate_wrapper(backend, tn, peo, par_vars, hpc=mpi) + + print("Simulation result:", sim_result) + end = time.time() + print("Elapsed", end - start) + if mpi: + out_file += 
'_rank'+str(get_mpi_rank()) + out_file += ".json" + C = {'time': 2**len(par_vars)*(end - start)} + C['elapsed'] = (end - start) + C['memory'] = backend.max_mem + C['memory_history'] = backend.mem_history + C['nvmemory'] = backend.nvsmi_max_mem + C['result'] = { + "Re": np.real(sim_result).tolist(), + "Im": np.imag(sim_result).tolist() + } + if compress is not None: + if isinstance(compressor, qtensor.compression.ProfileCompressor): + C['compression'] = compressor.get_profile_data_json() + + write_json(C, out_file) + cupy.cuda.profiler.stop() + return out_file + +def _simulate_wrapper(backend, tn, peo, par_vars, hpc=False): + """ + Backend is modified in the simulation + """ + + # -- Prepare buckets + # -- + + # --dbg + #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars + #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) + #graph, label_dict = qtree.graph_model.relabel_graph_nodes( + #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) + #) + #import networkx as nx + #components = list(nx.connected_components(graph)) + #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") + # -- + def make_sim(): + import copy + sim = qtensor.QtreeSimulator(backend=backend) + sim.tn = copy.deepcopy(tn) + sim.tn.backend = backend + sim.peo = copy.deepcopy(peo) + return sim + + if hpc: + res = _simulate_hpc(make_sim, par_vars) + else: + res = simulate_slice(make_sim, [0]*len(par_vars), par_vars) + + return res + +def simulate_slice(make_sim, slice_values, par_vars): + from qtensor.contraction_algos import bucket_elimination + sim = make_sim() + tn = sim.tn + backend = sim.backend + if hasattr(backend, 'print'): + backend.print = False + relabelid = {} + for tensor in tn.tensors: + for i in tensor.indices: + relabelid[int(i)] = i + + slice_ext = {relabelid[int(i)]: int(v) for i,v in zip(par_vars, slice_values)} + print("Slice extents", slice_ext) + sim._slice_relabel_buckets(slice_ext) + buckets = sim.tn.buckets + print(f"P {i}", end='', flush=True) + bcopy = [b[:] for b in buckets] + res = bucket_elimination( + bcopy, backend, + n_var_nosum=len(tn.free_vars) + ) + del bcopy + sim_result = backend.get_result_data(res).flatten()[0] + print("Result", sim_result) + try: + sim_result = sim_result.get() + except: + pass + return sim_result + +def _get_mpi_unit(sim, par_vars): + def _mpi_unit(rank): + slice_values = np.unravel_index(rank, [2]*len(par_vars)) + res = simulate_slice(sim, slice_values, par_vars) + return res + return _mpi_unit + +def get_mpi_rank(): + from qtensor.tools.lazy_import import MPI + w = MPI.COMM_WORLD + comm = MPI.Comm + rank = comm.Get_rank(w) + return rank + +def _simulate_hpc(_sim, par_vars): + from qtensor.contraction_algos import bucket_elimination + import cupy + from qtensor.tools.lazy_import import MPI + from qtensor.tools.mpi.mpi_map import MPIParallel + mpi_unit = _get_mpi_unit(_sim, par_vars) + par = MPIParallel() + w = MPI.COMM_WORLD + comm = MPI.Comm + size = comm.Get_size(w) + rank = comm.Get_rank(w) + cupy.cuda.runtime.setDevice(rank%4) + if rank==0: + print(f'MPI::I:: There are {size} workers and {2**len(par_vars)} tasks over {par_vars}') + if len(par_vars)==0: + return + values = par.map(mpi_unit, range(2**len(par_vars))) + return np.sum(values) diff --git a/bench/qc_simulation/src/simulators/qtensor_energy.py b/bench/qc_simulation/src/simulators/qtensor_energy.py new file 
mode 100644 index 00000000..18a8a2cf --- /dev/null +++ b/bench/qc_simulation/src/simulators/qtensor_energy.py @@ -0,0 +1,277 @@ +import qtensor +import qtree +import networkx as nx +import numpy as np + +# -- QAOA generic parser + +def parse_qaoa_composer(data): + import json + data = json.loads(data) + terms = data["terms"] + gamma = np.array(data["gamma"])/np.pi/2 + beta = np.array(data["beta"])/np.pi + N = len(set(sum([t[1] for t in terms], []))) + G = nx.Graph() + for factor, term in terms: + G.add_edge(*term) + composer = qtensor.DefaultQAOAComposer(G, gamma=gamma, beta=beta) + return composer +# -- + +def read_circ(circ_f, type=None): + + if type is None: + type = circ_f.path.name.split(".")[-1] + + print("Reading circuit of type", type) + if type == "jsonterms": + b = circ_f.f.read() + return parse_qaoa_composer(b) + + elif type == "qasm": + raise Exception("only jsonterms is supported for energy calculations") + +def read_preps(prep_f): + import pickle + return pickle.load(prep_f.f) + +def write_preps(peo, prep_f): + import pickle + pickle.dump(peo, open(prep_f, 'wb')) + +def write_json(data, out_file): + import json + with open(out_file, 'w') as f: + json.dump(data, f) + # This newline plays nice when cat-ing multiple files + f.write('\n') + +def preprocess_circ(circ, S, O, M, after_slice): + tn = qtensor.optimisation.QtreeTensorNet.from_qtree_gates(circ) + opt = qtensor.toolbox.get_ordering_algo(O) + if S: + # ignore argument type mismatch for pyright -- opt can be `Optimizer` + # pyright: reportGeneralTypeIssues=false + opt = qtensor.optimisation.TreeTrimSplitter( + tw_bias=0, max_tw=M, base_ordering=opt, + peo_after_slice_strategy=after_slice + ) + + peo, par_vars, _ = opt.optimize(tn) + # --dbg + graph = tn.get_line_graph() + ignore_vars = tn.bra_vars + tn.ket_vars + for pv in par_vars: + graph.remove_node(int(pv)) + components = list(nx.connected_components(graph)) + print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + print(f"peo size without par_vars and ignore_vars: {len(peo) - len(par_vars) - len(ignore_vars)}") + + print() + # -- + else: + peo, _ = opt.optimize(tn) + par_vars = [] + #print("W", opt.treewidth) + return (peo, par_vars, tn), opt.treewidth + +def preprocess(in_file, out_file, O='greedy', S=None, M=30, after_slice='run-again'): + """ + Arguments: + in_file: input file + out_file: output file + O: ordering algorithm + S: slicing algorithm + M: Memory limit for slicing + """ + import copy + composer = read_circ(in_file) + G = composer.graph + prep_data = [] + for edge in G.edges: + c_copy = copy.deepcopy(composer) + c_copy.energy_expectation_lightcone(edge) + e_prep, treewidth = preprocess_circ(c_copy.circuit, S, O, M, after_slice) + if treewidth>25: + prep_data.append(e_prep) + write_preps(prep_data, out_file) + print(f"Wrote {len(prep_data)} preparations of lightcones") + return prep_data + +def estimate(in_file, out_file, C=100, M=30, F=1e12, T=1e9, **kwargs): + """ + Arguments: + in_file: file with preprocessed data + out_file: file to write the results to + C: Compression ratio + M: Memory limit in log2(b/16) + F: assumed FLOPS + T: Throughput of compression + """ + from qtensor.compression.cost_estimation import compressed_contraction_cost, Cost + from dataclasses import asdict + import json + prep_data = read_preps(in_file) + peo, par_vars, tn = prep_data + + tn.slice({i: slice(0, 1) for i in par_vars}) + peo = peo[:len(peo) - len(par_vars)] + costs: list[Cost] = 
compressed_contraction_cost(tn, peo, mem_limit=M, compression_ratio=C) + totals: Cost = sum(costs[1:], costs[0]) + time = totals.time(F, T, T, M) + C = asdict(totals) + C['time'] = time*2**len(par_vars) + C['slices'] = 2**len(par_vars) + print("C", C) + out_file += ".json" + write_json(C, out_file) + return out_file + +def simulate(in_file, out_file, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + **kwargs): + import cupy + prep_data = read_preps(in_file) + cupy.cuda.profiler.start() + + C = dict( + time=0, + elapsed=0, + memory=0, + memory_history=[], + nvmemory=0, + result = dict(Re=0, Im=0), + compression=dict(compress=[], decompress=[]) + ) + + for prep_lightcone in prep_data[:5]: + print(prep_lightcone) + r = simulate_preps_lightcone(prep_lightcone, backend, compress, M, + r2r_error, + r2r_threshold,**kwargs) + C['time'] += r['time'] + C['elapsed'] += r['elapsed'] + C['memory'] = max(C['memory'], r['memory']) + C['nvmemory'] = max(C['nvmemory'], r['nvmemory']) + C['memory_history'] += r['memory_history'] + C['result']['Re'] += r['result']['Re'] + C['result']['Im'] += r['result']['Im'] + if r.get('compression'): + C['compression']['compress'] += r['compression']['compress'] + C['compression']['decompress'] += r['compression']['decompress'] + + out_file += ".json" + write_json(C, out_file) + return out_file + cupy.cuda.profiler.stop() + +def simulate_preps_lightcone(prep_data, + backend='einsum', + compress=None, + M=29, + r2r_error=1e-3, r2r_threshold=1e-3, + **kwargs): + """ + Args: + in_file: file with preprocessed data + out_file: file to write the results to + backend: backend to use + compress: compression algorithm + M: memory threshold for compression + r2r_error: relative error for compression + r2r_threshold: relative threshold for compression + """ + import time + from qtensor.contraction_algos import bucket_elimination + from qtensor.compression.Compressor import CUSZCompressor, CUSZXCompressor, TorchCompressor, NEWSZCompressor + #from qtensor.compression.Compressor import WriteToDiskCompressor + import cupy + peo, par_vars, tn = prep_data + + backend = qtensor.contraction_backends.get_backend(backend) + if compress is not None: + if compress == 'szx': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZXCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'cusz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = CUSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'torch': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = TorchCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'newsz': + print(f"{r2r_error=} {r2r_threshold=}") + compressor = NEWSZCompressor(r2r_error=r2r_error, r2r_threshold=r2r_threshold) + compressor = qtensor.compression.ProfileCompressor(compressor) + elif compress == 'disk': + compressor = WriteToDiskCompressor(f'/grand/QTensor/compression/data/tensors_compressed_M{M}/') + compressor = qtensor.compression.ProfileCompressor(compressor) + else: + raise ValueError(f"Unknown compression algorithm: {compress}") + backend = qtensor.contraction_backends.CompressionBackend(backend, compressor, M) + from qtensor.contraction_backends.performance_measurement_decorator import MemProfBackend + backend = MemProfBackend(backend) + 
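+    # At this point the backend is wrapped twice: CompressionBackend compresses
+    # intermediate tensors above the memory threshold M with the selected
+    # (profiled) compressor, and MemProfBackend records peak host/GPU memory
+    # for the report assembled at the end of this function.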
+ relabelid = {} + for tensor in tn.tensors: + for i in tensor.indices: + relabelid[int(i)] = i + + slice_ext = {relabelid[int(i)]: 0 for i in par_vars} + + if len(par_vars) > 0: + print("Parvars", par_vars) + print(f"Detected {len(par_vars)} slice variables") + sim = qtensor.QtreeSimulator(backend=backend) + sim.tn = tn + sim.tn.backend = backend + sim.peo = peo + sim._slice_relabel_buckets(slice_ext) + buckets = sim.tn.buckets + # --dbg + #ignore_vars = sim.tn.bra_vars + sim.tn.ket_vars + #graph = qtree.graph_model.importers.buckets2graph(buckets, ignore_vars) + #graph, label_dict = qtree.graph_model.relabel_graph_nodes( + #graph, dict(zip(graph.nodes, np.array(list(graph.nodes)) - 127*2)) + #) + #import networkx as nx + #components = list(nx.connected_components(graph)) + #print(f"Sliced graph # nodes: {graph.number_of_nodes()} and #components: {len(components)} with sizes {[len(c) for c in components]}") + #print(f"peo size without par_vars and ignore_vars: {len(peo) - len(ignore_vars)}") + # -- + + start = time.time() + for i in range(2**0): + print(f"P {i}", end='', flush=True) + bcopy = [b[:] for b in buckets] + res = bucket_elimination( + bcopy, backend, + n_var_nosum=len(tn.free_vars) + ) + del bcopy + print("Result", res.data.flatten()[0]) + #time.sleep(0.5) + sim_result = backend.get_result_data(res).flatten()[0] + print("Simulation result:", sim_result) + end = time.time() + print("Elapsed", end - start) + C = {'time': 2**len(par_vars)*(end - start)} + C['elapsed'] = (end - start) + C['memory'] = backend.max_mem + C['memory_history'] = backend.mem_history + C['nvmemory'] = backend.nvsmi_max_mem + C['result'] = { + "Re": np.real(sim_result).tolist(), + "Im": np.imag(sim_result).tolist() + } + if compress is not None: + if isinstance(compressor, qtensor.compression.ProfileCompressor): + C['compression'] = compressor.get_profile_data_json() + return C diff --git a/bench/qc_simulation/qtensor/run.py b/bench/qc_simulation/src/simulators/qtensor_profile.py similarity index 83% rename from bench/qc_simulation/qtensor/run.py rename to bench/qc_simulation/src/simulators/qtensor_profile.py index 5a625dc2..e48e1774 100644 --- a/bench/qc_simulation/qtensor/run.py +++ b/bench/qc_simulation/src/simulators/qtensor_profile.py @@ -4,6 +4,7 @@ from test_circuits import gen_qaoa_maxcut_circuit import qtensor import qtree +from qtensor.merged_indices.transposed_bucket_elimination import bucket_elimination import numpy as np import pandas as pd import pyrofiler @@ -14,7 +15,7 @@ from qtensor.contraction_backends import get_backend, PerfBackend def bucket_contraction_report(tn, buckets, backend, - bucket_elimination=qtree.optimizer.bucket_elimination + bucket_elimination=bucket_elimination ): """ Returns: @@ -29,7 +30,8 @@ def bucket_contraction_report(tn, buckets, backend, buckets, perf_backend.process_bucket, n_var_nosum=len(tn.free_vars) ) - perf_backend.get_result_data(result).flatten() + result_data = perf_backend.get_result_data(result).flatten() + print("Result data:", result_data) # compute report_table rep_txt = perf_backend.gen_report(show=False) return perf_backend.report_table @@ -42,15 +44,14 @@ def get_buckets_tn(circ, backend, ordering_algo:str, batch_vars=0, seed=10): sim.prepare_buckets(circ, batch_vars=batch_vars) return sim.buckets, tn - ''' Function: Generate a collection of above report, and process them into final usable form I/O: ... 
-> processed data is a dict, directly usable by json ''' -def collect_process_be_pt_report(repeat: int, backend, circ): +def collect_process_be_pt_report(repeat: int, backend, circ, ordering_algo='greedy'): timing = pyrofiler.timing with timing(callback=lambda x: None) as gen: - buckets, tn = get_buckets_tn(circ, backend, 'rgreedy_0.02_10', batch_vars=0) + buckets, tn = get_buckets_tn(circ, backend, ordering_algo, batch_vars=0) tables = [] wall_start = time.time() @@ -71,8 +72,10 @@ def mean_mmax(x: list): return np.mean(x) def main(): - Ns = [24, 26, 28, 30] - p = 3 + Ns = [30] + p = 10 + ordering_algo = 'greedy' + repeats = 2 top_K = 15 backend_name = 'torch_cpu' print("backend: ", backend_name) @@ -80,7 +83,7 @@ def main(): print(f"N={N}") backend = get_backend(backend_name) circ = gen_qaoa_maxcut_circuit(N, p) - report = collect_process_be_pt_report(9, backend, circ) + report = collect_process_be_pt_report(repeats, backend, circ, ordering_algo=ordering_algo) stats = report[["time"]].groupby('step').agg(['mean', 'min', 'max', 'std']) stats = pd.concat([ @@ -101,7 +104,7 @@ def main(): report[["time"]].groupby('step').agg('mean'), report[["flop","FLOPS", 'result_size', 'bucket_len']].groupby('step').first() ], axis=1) - print(stats[['time', 'result_size', 'FLOPS']].groupby('result_size').agg(['mean', 'sum'])) + print(stats[['time', 'result_size', 'FLOPS']].groupby('result_size').agg(['mean', 'sum', 'count'])) print("Total time:") print(stats['time'].sum()) diff --git a/qtensor/FeynmanSimulator.py b/qtensor/FeynmanSimulator.py index 6c2a02f1..34f03a91 100644 --- a/qtensor/FeynmanSimulator.py +++ b/qtensor/FeynmanSimulator.py @@ -163,7 +163,7 @@ def _parallel_unit(self, par_idx): self.merged_buckets = self.tn.buckets self.ibunch = self.ibunch - result = qtensor.merged_indices.bucket_elimination( + result = qtensor.contraction_algos.merged_bucket_elimination( self.tn.buckets, self.ibunch, self.backend.process_bucket_merged, diff --git a/qtensor/MergedSimulator.py b/qtensor/MergedSimulator.py index aa0d6818..34dee455 100644 --- a/qtensor/MergedSimulator.py +++ b/qtensor/MergedSimulator.py @@ -110,7 +110,7 @@ def simulate_batch(self, qc, batch_vars=0, peo=None, dry_run=False): if dry_run: return peo, max(width) - result = qtensor.merged_indices.bucket_elimination( + result = qtensor.contraction_algos.merged_bucket_elimination( self.tn.buckets, self.ibunch, self.backend.process_bucket_merged, diff --git a/qtensor/Simulate.py b/qtensor/Simulate.py index 0e271bd1..ec258e8b 100644 --- a/qtensor/Simulate.py +++ b/qtensor/Simulate.py @@ -1,6 +1,7 @@ import qtree from qtensor.tools.lazy_import import cirq from qtensor.contraction_backends import NumpyBackend, ContractionBackend +from qtensor.contraction_algos import bucket_elimination from qtensor.optimisation.TensorNet import QtreeTensorNet from qtensor.optimisation.Optimizer import DefaultOptimizer, Optimizer @@ -118,15 +119,22 @@ def prepare_buckets(self, qc, batch_vars=0, peo=None): raise ValueError(f'Treewidth {self.optimizer.treewidth} is larger than max_tw={self.max_tw}.') else: self.peo = peo + self._slice_relabel_buckets() + def _slice_relabel_buckets(self, slice_extension={}): + """ + Relabels peo according to bucket indices. 
+ Assumes self.tn and self.peo exists + """ all_indices = sum([list(t.indices) for bucket in self.tn.buckets for t in bucket], []) identity_map = {int(v): v for v in all_indices} self.peo = [identity_map[int(i)] for i in self.peo] - self._reorder_buckets() + perm_dict = self._reorder_buckets() slice_dict = self._get_slice_dict() - #log.info('batch slice {}', slice_dict) + slice_extension = {perm_dict[k]: v for k, v in slice_extension.items()} + slice_dict.update(slice_extension) sliced_buckets = self.tn.slice(slice_dict) #self.backend.pbar.set_total ( len(sliced_buckets)) @@ -137,10 +145,8 @@ def prepare_buckets(self, qc, batch_vars=0, peo=None): def simulate_batch(self, qc, batch_vars=0, peo=None): self.prepare_buckets(qc, batch_vars, peo) - result = qtree.optimizer.bucket_elimination( - self.buckets, self.backend.process_bucket, - n_var_nosum=len(self.tn.free_vars) - ) + result = bucket_elimination(self.buckets, self.backend, + n_var_nosum=len(self.tn.free_vars)) return self.backend.get_result_data(result).flatten() def simulate(self, qc): diff --git a/qtensor/__init__.py b/qtensor/__init__.py index f30a7f7d..6d2be717 100644 --- a/qtensor/__init__.py +++ b/qtensor/__init__.py @@ -21,7 +21,7 @@ from qtensor import simplify_circuit from qtensor.simplify_circuit import simplify_qtree_circuit from qtensor import optimisation -from qtensor import merged_indices +from qtensor import contraction_algos from qtensor import problems from qtensor import MergedSimulator from qtensor import tools diff --git a/qtensor/compression/CompressedTensor.py b/qtensor/compression/CompressedTensor.py new file mode 100644 index 00000000..dfe9d2a8 --- /dev/null +++ b/qtensor/compression/CompressedTensor.py @@ -0,0 +1,167 @@ +import itertools +import numpy as np +from qtree.optimizer import Tensor +from qtree.system_defs import NP_ARRAY_TYPE +from .Compressor import NumpyCompressor, Compressor + + +def iterate_indices(indices: list): + if len(indices) == 0: + return [tuple()] + ranges = [range(v.size) for v in indices] + return itertools.product(*ranges) + + +class CompressedTensor(Tensor): + """ + Extension of the Tensor class that holds compressed data + + The data array is split along several indices S into 2^|S| parts + + """ + + def __init__( + self, + name, + indices, + data_key=None, + data=None, + slice_indices=[], + compressor: Compressor = NumpyCompressor(), + ): + """ + Initialize the tensor + name: str, + the name of the tensor. Used only for display/convenience. + May be not unique. + indices: tuple, + Indices of the tensor + shape: tuple, + shape of a tensor + data_key: int + Key to find tensor's data in the global storage + data: np.array + Actual data of the tensor. Default None. + Usually is not supplied at initialization. 
+ slice_indices: list[Var] + indices along which the tensor is split into chunks + """ + super().__init__(name, indices, data_key=data_key, data=data) + self.slice_indices = slice_indices + self.compressor = compressor + if data is not None: + self._dtype = data.dtype + else: + self._dtype = None + + @classmethod + def empty( + cls, + name, + indices, + slice_indices=[], + compressor=NumpyCompressor(), + dtype: type = NP_ARRAY_TYPE, + ): + t = super().empty(name, indices, dtype) + t.compressor = compressor + if slice_indices: + t.compress_indices(slice_indices) + return t + + def compress_indices(self, indices: list): + """ + Slice the self.data along dimensions in `indices`, + store them compressed + + Does not support compressing when already compressed + """ + slice_dict = {i: slice(None) for i in self.indices} + data_chunks = [] + for ivals in iterate_indices(indices): + for ix, ival in zip(indices, ivals): + slice_dict[ix] = ival # slice(ival, ival+1) + dslice = self.data[tuple(slice_dict[i] for i in self.indices)] + + data_chunks.append(self.compressor.compress(dslice)) + del dslice + self._data = data_chunks + self.slice_indices = indices + + @property + def dtype(self): + """ + DataType of wrapped chunks. + """ + return self._dtype + + @property + def array_indices(self): + return [x for x in self.indices if x not in self.slice_indices] + + def get_chunk(self, ivals): + dims = [v.size for v in self.slice_indices] + if len(ivals) == 0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + ptr = self._data[flat_ix] + return self.compressor.decompress(ptr) + + def set_chunk(self, ivals, chunk: np.ndarray): + # -- Check for consistent data types between chunks + if self._dtype is None: + self._dtype = chunk.dtype + else: + assert ( + self.dtype == chunk.dtype + ), f"Chunk dtype {chunk.dtype} does not match tensor dtype {self.dtype}" + # -- + + if self._data is None: + self._data = np.empty(2 ** len(self.slice_indices), dtype=object) + dims = [v.size for v in self.slice_indices] + if len(ivals) == 0: + flat_ix = 0 + else: + flat_ix = np.ravel_multi_index(ivals, dims) + self._data[flat_ix] = self.compressor.compress(chunk) + + def __getitem__(self, key): + """ + Get a slice of the tensor along the indices in `key` + Currently slicing over all compressed indices is required. 
+ Slices over compressed indices must be ints + """ + slices_ints, new_indices = self._parse_getitem_key(key) + slice_dict = {} + chunk_slices_ints = [] + compression_ints = [] + for ix, ival in zip(self.indices, slices_ints): + slice_dict[ix] = ival + if ix in self.slice_indices: + compression_ints.append(ival) + else: + chunk_slices_ints.append(ival) + chunk = self.get_chunk(compression_ints) + new_name = f"{self.name}[sliced]" + # careful: chunk will not be collected even if slice is small + chunk_slice = chunk[tuple(chunk_slices_ints)] + return Tensor(new_name, new_indices, data=chunk_slice) + + def __str__(self): + array_ix = ",".join(map(str, self.array_indices)) + split_ix = ",".join(map(str, self.slice_indices)) + return f"{self._name}{{{split_ix}}}({array_ix})" + + def copy(self, name=None, indices=None, data_key=None, data=None): + raise NotImplementedError() + + def __repr__(self): + return self.__str__() + + def __del__(self): + if self._data is not None: + for chunk in self._data: + self.compressor.free_compressed(chunk) + del self diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py new file mode 100644 index 00000000..75db40ab --- /dev/null +++ b/qtensor/compression/Compressor.py @@ -0,0 +1,640 @@ +import io +import sys +import numpy as np +from pathlib import Path +print(Path(__file__).parent/'szx/src/') +sys.path.append(str(Path(__file__).parent/'szx/src/')) +sys.path.append('./szx/src') +sys.path.append(str(Path(__file__).parent/'szp/src/')) +sys.path.append('./szp/src') + +sys.path.append(str(Path(__file__).parent/'cusz/src')) +sys.path.append('./cusz/src') +sys.path.append(str(Path(__file__).parent/'torch_quant')) +sys.path.append('./torch_quant') +sys.path.append(str(Path(__file__).parent/'newsz')) +sys.path.append('./newsz') + + +import torch +try: + import cuszp + from cuszx_wrapper import cuszx_host_compress, cuszx_host_decompress, cuszx_device_compress, cuszx_device_decompress + from cuSZp_wrapper import cuszp_device_compress, cuszp_device_decompress + from cusz_wrapper import cusz_device_compress, cusz_device_decompress + from torch_quant_perchannel import quant_device_compress, quant_device_decompress + from newsz_wrapper import newsz_device_compress, newsz_device_decompress +except: + print("import failed") + # Silently fail on missing build of cuszx + pass + +CUSZX_BLOCKSIZE = 256 + +# -- helper functions + +def _get_data_info(data): + import cupy + if isinstance(data, cupy.ndarray): + isCuPy = True + else: + isCuPy = False + num_elements = data.size + # Adapt numele depending on itemsize + itemsize = data.dtype.itemsize + num_elements_eff = int(num_elements*itemsize/4) + return isCuPy, num_elements_eff + +# -- Compressor classes + +class Compressor(): + def compress(self, data): + raise NotImplementedError + + def decompress(self, ptr): + raise NotImplementedError + + def compress_size(self, ptr): + return ptr.nbytes + +# -- Debugging and profiling + +import time +from dataclasses import dataclass, asdict +@dataclass +class CompressMeasure: + time: float = 0 + size_in: int = 0 + size_out: int = 0 + label: str = '' + + def __str__(self): + compress_ratio = self.size_in / self.size_out + return (f'Measure: {self.time:.3f}s, ' + f'{self.size_in/1024**2:.2f}MB -> {self.size_out/1024**2:.2f}MB ({compress_ratio:.3f} in/out ratio)' + ) + +class ProfileCompressor(Compressor): + def __init__(self, compressor:Compressor, trace=True): + self.trace = trace + self.compressor = compressor + self.profile_data = {'compress': [], 'decompress': []} 
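+        # profile_data accumulates one CompressMeasure record per call;
+        # get_profile_data_json() below exposes them as the 'compression'
+        # section of the simulation reports read by the analysis scripts.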
+ + def compress(self, data): + start = time.time() + ptr = self.compressor.compress(data) + end = time.time() + out_size = self.compressor.compress_size(ptr) + cmeasure = CompressMeasure(end-start, data.nbytes, out_size) + self.profile_data['compress'].append(cmeasure) + if self.trace: + print(f'Compress: {cmeasure}') + return ptr + + def decompress(self, ptr): + start = time.time() + data = self.compressor.decompress(ptr) + end = time.time() + in_size = self.compressor.compress_size(ptr) + dmeasure = CompressMeasure(end-start, in_size, data.nbytes) + self.profile_data['decompress'].append(dmeasure) + if self.trace: + print(f'Decompress: {dmeasure}') + return data + + def get_profile_data(self): + return self.profile_data['compress'], self.profile_data['decompress'] + + def get_profile_data_json(self): + compress, decompress = self.get_profile_data() + return { + 'compress': [asdict(c) for c in compress], + 'decompress': [asdict(c) for c in decompress], + } + + def get_profile_stats(self): + compress, decompress = self.get_profile_data() + compress_time = sum([x.time for x in compress]) + decompress_time = sum([x.time for x in decompress]) + compress_ratios = np.mean([x.size_in/x.size_out for x in compress]) + compress_size = sum([x.size_out for x in compress]) + return compress_time, decompress_time, compress_size, compress_ratios + + def compress_size(self, ptr): + return self.compressor.compress_size(ptr) + + def free_decompressed(self): + self.compressor.free_decompressed() + + def free_compressed(self, ptr): + self.compressor.free_compressed(ptr) +# -- + +class NumpyCompressor(Compressor): + def compress(self, data): + comp = io.BytesIO() + np.savez_compressed(comp, data) + return comp + + def compress_size(self, ptr): + return ptr.getbuffer().nbytes + + def decompress(self, ptr): + ptr.seek(0) + return np.load(ptr)['arr_0'] + + def free_compressed(self, ptr): + del ptr + return + + def free_decompressed(self): + return + +class CUSZPCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Decompressed data Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + cupy.cuda.runtime.free(x) + # del x + # need to run this for every x? 
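`ProfileCompressor` records one `CompressMeasure` per call on whatever backend it wraps. A short usage sketch with the `NumpyCompressor` defined above; it assumes the package imports cleanly as `qtensor.compression` once this patch is applied (i.e. the optional torch/GPU imports at the top of the file succeed or fall through), and the array size is arbitrary:

```python
# Usage sketch: profiling the pure-NumPy backend defined above.
import numpy as np
from qtensor.compression import NumpyCompressor, ProfileCompressor

comp = ProfileCompressor(NumpyCompressor(), trace=False)
data = np.random.rand(2**20).astype(np.complex64)

handle = comp.compress(data)
restored = comp.decompress(handle)
assert np.allclose(restored, data)

# Aggregate stats: total compress/decompress time, total compressed bytes, mean ratio.
c_time, d_time, c_bytes, mean_ratio = comp.get_profile_stats()
print(f"compress {c_time:.3f}s, decompress {d_time:.3f}s, "
      f"{c_bytes / 1024**2:.2f} MB compressed, mean ratio {mean_ratio:.2f}")

# Per-call records (fields of CompressMeasure) as plain dicts, e.g. for JSON dumps.
records = comp.get_profile_data_json()
```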
+ cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() + self.decompressed_own = [] + #cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() + #self.decompressed_own = [] + + def free_compressed(self, ptr): + #return + import ctypes, cupy + #cmp_bytes, num_elements_eff, shape, dtype, _ = ptr + cmp_t, shape, dtype = ptr + del cmp_t + torch.cuda.empty_cache() + return + print(f"Freeing compressed data {num_elements_eff}") + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + cupy.get_default_memory_pool().free_all_blocks() + #del cmp_bytes + + def compress(self, data): + isCupy, num_elements_eff = _get_data_info(data) + dtype = data.dtype + shape = data.shape + # convert cupy to torch + # TODO: cast to one array of double the number of elements + torch_data = torch.tensor(data, device='cuda') + data_view = torch.view_as_real(torch_data) + #print(f"cuszp Compressing {type(data)}") + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, self.r2r_error, num_elements_eff, self.r2r_threshold) + cmp_t = cuszp.compress(data_view, self.r2r_error, 'rel') + return (cmp_t, shape, dtype) + + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + def compress_size(self, ptr): + #return ptr[4] + return ptr[0].nbytes + + def decompress(self, obj): + import cupy + #cmp_bytes, num_elements_eff, shape, dtype, cmpsize = obj + #decompressed_ptr = cuszp_device_decompress(num_elements_eff, cmp_bytes, cmpsize, self, dtype) + cmp_t, shape, dtype = obj + num_elements_decompressed = 1 + for s in shape: + num_elements_decompressed *= s + # Number of elements is twice because the shape is for complex numbers + num_elements_decompressed *= 2 + decomp_t_float = cuszp.decompress(cmp_t, num_elements_decompressed, cmp_t.nbytes, self.r2r_error, 'rel') + decomp_t_float = decomp_t_float.view(decomp_t_float.shape[0]//2, 2) + decomp_t = torch.view_as_complex(decomp_t_float) + arr_cp = cupy.asarray(decomp_t) + arr = cupy.reshape(arr_cp, shape) + return arr + arr_cp = decompressed_ptr[0] + + # Cupy memory management might not deallocate memory properly + #arr = cupy.reshape(arr_cp, shape) + #self.decompressed_own.append(arr) + # Use pointer instead, as in cuszx + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + arr = cupy.reshape(arr_cp, shape) + return arr + +class TorchCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + del x + cupy.get_default_memory_pool().free_all_blocks() + cupy.get_default_pinned_memory_pool().free_all_blocks() + torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + cmp_bytes, num_elements_eff, shape, dtype, _ = ptr + del cmp_bytes + + def compress(self, data): + isCupy, num_elements_eff = _get_data_info(data) + dtype = data.dtype + cmp_bytes, outSize_ptr = quant_device_compress(data, 
num_elements_eff, CUSZX_BLOCKSIZE, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, data.shape, dtype, outSize_ptr) + + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[4] + + def decompress(self, obj): + import cupy + cmp_bytes, num_elements_eff, shape, dtype, cmpsize = obj + decompressed_ptr = quant_device_decompress(num_elements_eff, cmp_bytes, self, dtype) + arr_cp = decompressed_ptr[0] + + arr = cupy.reshape(arr_cp, shape) + self.decompressed_own.append(arr) + return arr + + +class NEWSZCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + isCuPy, num_elements_eff = _get_data_info(data) + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + # self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # 
Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + print('Before compress') + cmp_bytes, outSize_ptr = newsz_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + print('After compress') + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = newsz_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class CUSZXCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + #print(x) + #if x == None: + # continue + #else: + #print("CUDA Free", x) + cupy.cuda.runtime.free(x) + # del x + cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + cupy.get_default_memory_pool().free_all_blocks() + #cupy.get_default_pinned_memory_pool().free_all_blocks() + #torch.cuda.empty_cache() + + def compress(self, data): + isCuPy, num_elements_eff = _get_data_info(data) + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + # return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr) + + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = 
ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + # self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + """ + ## Compression API with cuSZx ### + Parameters: + - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + - num_elements = Number of floating point elements in data + - r2r_error = relative-to-value-range error bound for lossy compression + - r2r_threshold = relative-to-value-range threshold to floor values to zero + Returns: + - cmp_bytes = Unsigned char pointer to compressed bytes + - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + """ + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + #cmp_bytes, outSize_ptr = cuszp_device_compress(data, r2r_error, num_elements, r2r_threshold) + cmp_bytes, outSize_ptr = cuszx_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + """ + ## Decompression API with cuSZx ### + Parameters: + - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + - cmp_bytes = Unsigned char pointer to compressed bytes + - num_elements = Number of floating point elements in original data + Returns: + - decompressed_data = Float32 pointer to decompressed data + + Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + """ + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = cuszx_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class CUSZCompressor(Compressor): + def __init__(self, r2r_error=1e-3, r2r_threshold=1e-3): + self.r2r_error = r2r_error + self.r2r_threshold = r2r_threshold + self.decompressed_own = [] + + def free_decompressed(self): + import cupy + print("Cleanup", len(self.decompressed_own)) + for x in self.decompressed_own: + cupy.cuda.runtime.free(x) + # del x + # cupy.get_default_memory_pool().free_all_blocks() + # cupy.get_default_pinned_memory_pool().free_all_blocks() + # torch.cuda.empty_cache() + self.decompressed_own = [] + + def free_compressed(self, ptr): + import ctypes, cupy + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, _ = ptr + p_decompressed_ptr = ctypes.addressof(cmp_bytes[0]) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= 
ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + cupy.cuda.runtime.free(decompressed_int.value) + + def compress(self, data): + isCuPy, num_elements_eff = _get_data_info(data) + + dtype = data.dtype + cmp_bytes, outSize_ptr = self.cuszx_compress(isCuPy, data, num_elements_eff, self.r2r_error, self.r2r_threshold) + return (cmp_bytes, num_elements_eff, isCuPy, data.shape, dtype, outSize_ptr.contents.value) + + def compress_size(self, ptr): + return ptr[5] + + def decompress(self, obj): + import cupy + import ctypes + cmp_bytes, num_elements_eff, isCuPy, shape, dtype, cmpsize = obj + decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, cmpsize, num_elements_eff, self, dtype) + arr_cp = decompressed_ptr[0] + #self.decompressed_own.append(decompressed_ptr[1]) + + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(decompressed_ptr) + # # cast to int64 pointer + # # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # self.decompressed_own.append(decompressed_int.value) + # mem = cupy.cuda.UnownedMemory(decompressed_int.value, num_elements_eff, self, device_id=0) + # mem_ptr = cupy.cuda.memory.MemoryPointer(mem, 0) + arr = cupy.reshape(arr_cp, shape) + self.decompressed_own.append(arr) + # arr = cupy.ndarray(shape, dtype=dtype, memptr=mem_ptr) + return arr + + ### Compression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - data = Numpy or Cupy ndarray, assumed to be 1-D, np.float32 type + # - num_elements = Number of floating point elements in data + # - r2r_error = relative-to-value-range error bound for lossy compression + # - r2r_threshold = relative-to-value-range threshold to floor values to zero + # Returns: + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - outSize_ptr = Pointer to size_t representing length in bytes of cmp_bytes + def cuszx_compress(self, isCuPy, data, num_elements, r2r_error, r2r_threshold): + + if not isCuPy: + cmp_bytes, outSize_ptr = cuszx_host_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + else: + cmp_bytes, outSize_ptr = cusz_device_compress(data, r2r_error, num_elements, CUSZX_BLOCKSIZE,r2r_threshold) + # cmp_bytes, outSize_ptr = quant_device_compress(data, num_elements, CUSZX_BLOCKSIZE, r2r_threshold) + del data + torch.cuda.empty_cache() + return cmp_bytes, outSize_ptr + + ### Decompression API with cuSZx ### + # Parameters: + # - isCuPy = boolean, true if data is CuPy array, otherwise is numpy array + # - cmp_bytes = Unsigned char pointer to compressed bytes + # - num_elements = Number of floating point elements in original data + # Returns: + # - decompressed_data = Float32 pointer to decompressed data + # + # Notes: Use ctypes to cast decompressed data to Numpy or CuPy type + + def cuszx_decompress(self, isCuPy, cmp_bytes, cmpsize, num_elements, owner, dtype): + if not isCuPy: + decompressed_data = cuszx_host_decompress(num_elements, cmp_bytes) + else: + # cuszx_device_decompress(nbEle, cmpBytes, owner, dtype) + decompressed_data = cusz_device_decompress(num_elements, cmp_bytes, owner,dtype) +# oriData, absErrBound, nbEle, blockSize,threshold + # decompressed_data = quant_device_decompress(num_elements, cmp_bytes, owner,dtype) + return decompressed_data + +class 
WriteToDiskCompressor(Compressor): + def __init__(self, path): + from pathlib import Path + Path(path).mkdir(exist_ok=True, parents=True) + self.path = path + + def _gen_random_filename(self, info): + dtype, shape, isCupy = info + k = np.random.randint(0, 100000000) + s = hex(k)[2:] + return self.path + f'/qtensor_data_{s}_{str(dtype)}.bin' + + def compress(self, data): + import cupy + if isinstance(data, cupy.ndarray): + isCupy=False + else: + isCupy=True + fname = self._gen_random_filename((data.dtype, data.shape, isCupy)) + data.tofile(fname) + return (fname, data.dtype, data.shape, isCupy) + + def compress_size(self, ptr): + return 0.1 + + def decompress(self, obj): + import cupy + fname, dtype, shape, isCupy = obj + if isCupy: + return cupy.fromfile(fname).view(dtype).reshape(shape) + else: + return np.fromfile(fname).view(dtype).reshape(shape) + + def free_compressed(self, ptr): + pass + def free_decompressed(self): + pass diff --git a/qtensor/compression/__init__.py b/qtensor/compression/__init__.py new file mode 100644 index 00000000..7a69a45c --- /dev/null +++ b/qtensor/compression/__init__.py @@ -0,0 +1,14 @@ +from .Compressor import ( + Compressor, + NumpyCompressor, + CUSZCompressor, + CUSZXCompressor, + ProfileCompressor, + CUSZPCompressor, + TorchCompressor, +) +from .CompressedTensor import CompressedTensor, Tensor +from .compressed_contraction import compressed_contract, compressed_sum +from .cost_estimation import compressed_contraction_cost + + diff --git a/qtensor/compression/compressed_contraction.py b/qtensor/compression/compressed_contraction.py new file mode 100644 index 00000000..1023021c --- /dev/null +++ b/qtensor/compression/compressed_contraction.py @@ -0,0 +1,207 @@ +import numpy as np + +from qtensor.compression import CompressedTensor +from .CompressedTensor import Tensor, iterate_indices +from .CompressedTensor import Compressor + +# taken from numpy/core/einsumfunc.py +einsum_symbols = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' +einsum_symbols_set = set(einsum_symbols) + +def contract_two_tensors(A, B, T_out, einsum=np.einsum): + """ + Contract tensors A and B along their common indices and write result to T_out. + T_out tensor should be pre-allocated with data. + + This takes care of the case where indices of A and B are Vars with large integer id + """ + result_indices = T_out.indices + out_buffer = T_out.data + max_id = 0 + A_ints = [] + B_ints = [] + + for a_i in A.indices: + a_int = int(a_i) + max_id = max(max_id, a_int) + A_ints.append(a_int) + + for b_i in B.indices: + b_int = int(b_i) + max_id = max(max_id, b_int) + B_ints.append(b_int) + + if max_id >= len(einsum_symbols): + # -- relabel indices to small ints + all_indices = set(A_ints + B_ints) + relabel_dict_int = {i: j for j, i in enumerate(all_indices)} + A_ints = [relabel_dict_int[i] for i in A_ints] + B_ints = [relabel_dict_int[i] for i in B_ints] + result_ints = [relabel_dict_int[int(i)] for i in result_indices] + else: + result_ints = list(map(int, result_indices)) + #print(A.data.shape) + #print(B.data.shape) + out = einsum(A.data, A_ints, B.data, B_ints, result_ints) + if len(result_ints)>0: + # This copying is reqiured because cupy doesn't support `out` argument. 
+ out_buffer[:] = out + else: + out_buffer.fill(out) + + +def compressed_contract(A:Tensor, B: Tensor, + contract_ixs, mem_limit, + compressor:Compressor, + # These two functions are used to support many backends + einsum=np.einsum, + move_data=lambda x: x + ): + """ + Contract tensors A and B along `contract_ixs` and return the result + + The result tensor indices will be ordered from largest to smallest + """ + all_indices = list(set(A.indices).union(B.indices)) + all_indices.sort(key=int, reverse=True) + result_indices = list(set(all_indices) - set(contract_ixs)) + result_indices.sort(key=int, reverse=True) + + # -- Find set of existing compressed that will be decompressed + exist_compressed = set() + for T in [A, B]: + if isinstance(T, CompressedTensor): + exist_compressed.update(T.slice_indices) + # In this particular case, we need not to sort these indices, + # since the iteration over fast index gives same latency as over slow index + # Potential improvement: if A_S and B_S are different, run outer loop + # over min(A_S, B_S) and inner over the rest indices. This will reduce + # the number of decompressions. + # -- + + + need_compressed = result_indices[:-mem_limit] + new_tensor_name = 'C'+str(int(all_indices[-1])) + + # -- Early return: if no need to compress, do the regular contraction + if len(need_compressed)==0 and len(exist_compressed)==0: + C = Tensor.empty(new_tensor_name, result_indices) + C.data = move_data(C.data) + contract_two_tensors(A, B, C, einsum=einsum) + return C + # -- + print(f"Need compression: {need_compressed}") + + remove_compress = exist_compressed - set(need_compressed) + R = CompressedTensor(new_tensor_name, + result_indices, + slice_indices=need_compressed, + compressor=compressor + ) + + result_chunk_ixs = result_indices[-mem_limit:] + print(f"Chunk indices: {result_chunk_ixs}, remove_compress: {remove_compress}") + slice_dict = {} + chunk = np.empty(2**len(result_chunk_ixs), dtype=B.dtype) + chunk = chunk.reshape(*(v.size for v in result_chunk_ixs)) + chunk = move_data(chunk) + for r_i in iterate_indices(need_compressed): + for ix, sl in zip(need_compressed, r_i): + slice_dict[ix] = sl + for irm in iterate_indices(remove_compress): + for i, ival in zip(remove_compress, irm): + slice_dict[i] = ival#slice(ival, ival+1) + chunk_view = chunk[tuple( + slice_dict.get(i, slice(None)) for i in result_chunk_ixs + )] + A_slice = A[slice_dict] + B_slice = B[slice_dict] + + C_ixs = [v for v in result_chunk_ixs if v not in exist_compressed] + C = Tensor('tmp', indices=C_ixs, data=chunk_view) + contract_two_tensors(A_slice, B_slice, C, einsum=einsum) + # Free temp slices + #import cupy + #print("Flags", A_slice.data.flags, B_slice.data.flags, C.data.flags) + #cupy.cuda.runtime.free(A_slice.data.data.ptr) + #cupy.cuda.runtime.free(B_slice.data.data.ptr) + compressor.free_decompressed() + if len(need_compressed)==0: + R = Tensor(new_tensor_name, result_indices, data=chunk) + else: + R.set_chunk(r_i, chunk) + print('Return', R) + return R + +def compressed_sum(A:Tensor, sum_ixs, + compressor:Compressor, + mem_limit, + # These two functions are used to support many backends + einsum=np.einsum, + move_data=lambda x: x + ): + """ + The result tensor indices will be ordered from largest to smallest + """ + all_indices = list(set(A.indices)) + all_indices.sort(key=int, reverse=True) + result_indices = list(set(all_indices) - set(sum_ixs)) + result_indices.sort(key=int, reverse=True) + + # -- Find set of existing compressed that will be decompressed + exist_compressed = 
set() + if isinstance(A, CompressedTensor): + exist_compressed.update(A.slice_indices) + # In this particular case, we need not to sort these indices, + # since the iteration over fast index gives same latency as over slow index + # Potential improvement: if A_S and B_S are different, run outer loop + # over min(A_S, B_S) and inner over the rest indices. This will reduce + # the number of decompressions. + # -- + + need_compressed = result_indices[:-mem_limit] + new_tensor_name = 'C'+str(int(all_indices[-1])) + + # -- Early return: if no need to compress, do the regular contraction + if len(need_compressed)==0 and len(exist_compressed)==0: + C = Tensor.empty(new_tensor_name, result_indices) + sum_axes = tuple([A.indices.index(i) for i in sum_ixs]) + C.data = A.data.sum(axis=sum_axes) + return C + # -- + print(f"Need compression: {need_compressed}") + + remove_compress = exist_compressed - set(need_compressed) + R = CompressedTensor(new_tensor_name, + result_indices, + slice_indices=need_compressed, + compressor=compressor + ) + + result_chunk_ixs = result_indices[-mem_limit:] + print(f"Chunk indices: {result_chunk_ixs}, remove_compress: {remove_compress}") + slice_dict = {} + chunk = np.empty(2**len(result_chunk_ixs), dtype=A.dtype) + chunk = chunk.reshape(*(v.size for v in result_chunk_ixs)) + chunk = move_data(chunk) + for r_i in iterate_indices(need_compressed): + for ix, sl in zip(need_compressed, r_i): + slice_dict[ix] = sl + for irm in iterate_indices(remove_compress): + for i, ival in zip(remove_compress, irm): + slice_dict[i] = ival#slice(ival, ival+1) + chunk_view = chunk[tuple( + slice_dict.get(i, slice(None)) for i in result_chunk_ixs + )] + A_slice = A[slice_dict] + sum_axes = [A_slice.indices.index(i) for i in sum_ixs] + + C_ixs = [v for v in result_chunk_ixs if v not in exist_compressed] + C = Tensor('tmp', indices=C_ixs, data=chunk_view) + chunk_view[:] = A_slice.data.sum(axis=tuple(sum_axes)) + if len(need_compressed)==0: + R = Tensor(new_tensor_name, result_indices, data=chunk) + else: + R.set_chunk(r_i, chunk) + compressor.free_decompressed() + return R diff --git a/qtensor/compression/cost_estimation.py b/qtensor/compression/cost_estimation.py new file mode 100644 index 00000000..a8e9ea8d --- /dev/null +++ b/qtensor/compression/cost_estimation.py @@ -0,0 +1,236 @@ +from dataclasses import dataclass +from functools import reduce +import numpy as np +from qtensor.optimisation import QtreeTensorNet +from typing import Iterable, Hashable, Dict + +Edge = Iterable[Hashable] +Hypergraph = Dict[Hashable, Edge] +# # self = hypergraph +# verts = set(sum(self.values(), [])) +# num_edges = len(self) + +@dataclass +class Cost: + use_log = True + flops: int + memory: int + width: int + compressions: int + decompressions: int + + def time(self, flops_second, compression_throughput, decompression_throughput, memory_limit): + """Returns the time in seconds to perform the contraction""" + return ( + self.flops / flops_second + + self.compressions *2**memory_limit/ compression_throughput + + self.decompressions *2**memory_limit/ decompression_throughput + ) + + def __add__(self, other): + return Cost( + self.flops + other.flops, + max(self.memory, other.memory), + max(self.width, other.width), + self.compressions + other.compressions, + self.decompressions + other.decompressions, + ) + + def format_number(self, n): + if self.use_log: + # log from ints may result in error + return f"{np.log2(n*1.):.2f}" + else: + return f"{n}" + + def __str__(self): + flops_str = 
self.format_number(self.flops) + mems_str = self.format_number(self.memory) + return f"Cost(FLOPs={flops_str}, Memory={mems_str}, width={self.width}, compressions={self.compressions}, decompressions={self.decompressions})" + +def dual_hg(hg: Hypergraph) -> Hypergraph: + dual = {} + for iedge, edge in hg.items(): + for vert in edge: + if dual.get(vert) is None: + dual[vert] = [] + dual[vert].append(iedge) + return dual + +def remove_vertices_tensors(TN, dual_TN, vertices=[], tensors=[]): + for t in tensors: + # -- remove tensor + for v in dual_TN[t]: + TN[v].remove(t) + del dual_TN[t] + + for vertex in vertices: + # remove vertex + for t in TN[vertex]: + dual_TN[t].remove(vertex) + del TN[vertex] + +def tn2tn(tn: QtreeTensorNet, peo=None): + ignored_vars = list(map(int, tn.bra_vars + tn.ket_vars)) + # Vertices --> indices + # Edges --> tensors + dual_tn = { str(hex(id(t))):[x for x in t.indices if int(x) not in ignored_vars and x.size>1] + for t in tn.tensors } + # clean up empty edges + for t in list(dual_tn.keys()): + if len(dual_tn[t]) == 0: + del dual_tn[t] + + # Vertices --> tensors + # Edges --> indices + TN = dual_hg(dual_tn) + return TN + +def tensor_memory(indices, mem_limit, compression_ratio): + if len(indices) > mem_limit: + return 2**len(indices)/compression_ratio + else: + return 2**len(indices) +def pairwise_cost(indices, comp_ixs, contracted_ixs=[], + mem_limit=np.inf, + compression_ratio=30, + ): + """ + Computes the cost of contracting a pair of tensors, assuming last + `contracted_ixs_count` indices are contrated + """ + contracted_ixs_count = len(contracted_ixs) + all_indices = set().union(*indices) + next_indices = list(all_indices) + next_indices.sort(key=int, reverse=True) + for i in contracted_ixs: + next_indices.remove(i) + + if len(next_indices) > mem_limit or any(comp_ixs): + next_comp_ixs= next_indices[:-mem_limit] + rm_comp = set().union(*comp_ixs) - set(next_comp_ixs) + decompressions = 2**(len(rm_comp) + len(next_comp_ixs)) + compressions = 2**len(next_comp_ixs) + else: + next_comp_ixs = [] + decompressions = 0 + compressions = 0 + mem = 0 + for ilist in [next_indices]+indices: + mem += tensor_memory(ilist, mem_limit, compression_ratio) + + return ( + next_indices, + next_comp_ixs, + Cost( + memory = mem, + flops = 2**len(all_indices), + width = len(next_indices), + compressions = compressions, + decompressions = decompressions, + ) + ) + + +def bucket_contract_cost(indices, comp_ixs, contracted_indices, **kwargs): + """ + Computes the cost of contracting a bucket of tensors + + Args: + indices: indices of tensors in the bucket + comp_ixs: indices that are compressed + contracted_indices: indices that are contracted + **kwargs: passed to pairwise_cost + """ + ixs, compixs = indices[0], comp_ixs[0] + costs = [] + for i in range(1, len(indices)-1): + ixs, compixs, cost = pairwise_cost( + [ixs, indices[i]], + [compixs, comp_ixs[i]], + **kwargs + ) + costs.append(cost) + # -- contract last two tensors + new_ixs, new_comp_ixs, cost = pairwise_cost( + [ixs, indices[-1]], + [compixs, comp_ixs[-1]], + contracted_ixs=contracted_indices, + **kwargs, + ) + costs.append(cost) + new_ixs = set().union(*indices) - set(contracted_indices) + sum_cost = sum(costs[1:], costs[0]) + sum_cost.width = len(new_ixs) + ## Naive Flops calculation + # sum_cost.flops = 2**len(set().union(*indices))*(len(indices)+1) + return new_ixs, new_comp_ixs, sum_cost + +def contract_with_cost(TN, comp_ixs, dual_TN, vertex, + mem_limit=np.inf, + compression_ratio=100): + """ + Contracts vertex from TN 
+ TN is a mapping from indices to [tensor] + """ + tensors = TN[vertex] + # contract + tensors.sort(key=lambda t: len(dual_TN[t])) + indices = [dual_TN[t] for t in tensors] + comp_indices = [comp_ixs.get(t, []) for t in tensors] + result_ixs, compressed, cost = bucket_contract_cost(indices, comp_indices, [vertex], + mem_limit=mem_limit, + compression_ratio=compression_ratio + ) + # calculate current memory + for t_id, indices in dual_TN.items(): + if t_id in tensors: + # these tensors are accounted in bucket_contract_cost + continue + cost.memory += tensor_memory(indices, mem_limit, compression_ratio) + + # This can be random but should be unique + tensor_id = str(hex(id(vertex))) + comp_ixs[tensor_id] = compressed + remove_vertices_tensors(TN, dual_TN, [vertex], tensors) + # -- add result + for ix in result_ixs: + if TN.get(ix) is None: + TN[ix] = [] + TN[ix].append(tensor_id) + dual_TN[tensor_id] = list(result_ixs) + # -- + return cost + + +def convert_TN_peo(tn, peo): + """ + Convert qtensor.QtreeTensorNet to a hypergraph + relabel peo accordingly. + Args: + tn: qtensor.QtreeTensorNet + peo: list of indices + """ + TN = tn2tn(tn) + relabel_dict = {int(p):i for i, p in enumerate(peo)} + ignored_vars = list(map(int, tn.bra_vars + tn.ket_vars)) + peo = [x for x in peo if int(x) not in ignored_vars] + + TN = { + relabel_dict[int(v)]: ix for v, ix in TN.items() + } + peo = [relabel_dict[int(p)] for p in peo] + return TN, peo + +def compressed_contraction_cost(tn, peo, mem_limit=np.inf, compression_ratio=100): + """ + Compute the cost of a contraction with compression. + """ + TN, peo = convert_TN_peo(tn, peo) + costs = [] + dual_TN = dual_hg(TN) + comp_ixs = {} + for i in peo: + cost = contract_with_cost(TN, comp_ixs, dual_TN, i, mem_limit, compression_ratio) + costs.append(cost) + return costs diff --git a/qtensor/compression/cusz/include/cli/analyzer.hh b/qtensor/compression/cusz/include/cli/analyzer.hh new file mode 100644 index 00000000..8c58a71c --- /dev/null +++ b/qtensor/compression/cusz/include/cli/analyzer.hh @@ -0,0 +1,278 @@ +/** + * @file analyzer.hh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-03-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef ANALYSIS_ANALYZER_HH +#define ANALYSIS_ANALYZER_HH + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "../hf/hf_bookg.hh" +#include "../hf/hf_codecg.hh" +#include "../kernel/cpplaunch_cuda.hh" +#include "../utils/timer.hh" + +using std::cout; + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +enum class ExecutionPolicy { host, cuda_device }; +enum class AnalyzerMethod { thrust, cuda_native, stl }; + +class Analyzer { + typedef struct ExtremaResult { + double max_val, min_val, rng; + double seconds; + } extrema_result_t; + + typedef struct Compressibility { + size_t len; + struct { + double entropy; + unsigned int top1_freq; + double top1_prob; + double dropout_equiv_bitlen_2x() const { return 64 * (1 - top1_prob); } + double dropout_equiv_bitlen_1_5x() const { return 48 * (1 - top1_prob); } + } hist; + struct { + double r_lowerbound; + double avgb_lowerbound; + double r_upperbound; + double avgb_upperbound; + } huffman_theory; + struct { + double min_bitlen; + double avgb; + } huffman_stat; + } theory_t; + + theory_t theory; + + public: + Analyzer() = default; + ~Analyzer() = default; + + // TODO execution policy + template + static std::vector 
percentile100(T* in, size_t len) + { + std::vector res; + auto step = int(ceil(len / 100)); + + if CONSTEXPR (policy == ExecutionPolicy::cuda_device) { + // caveat: no residence check + thrust::sort(thrust::device, in, in + len); + T* htmp; + cudaMallocHost(&htmp, sizeof(T) * len); + cudaMemcpy(htmp, in, sizeof(T) * len, cudaMemcpyDeviceToHost); + for (auto i = 0; i < len; i += step) { // + res.push_back(htmp[i]); + } + res.push_back(htmp[len - 1]); + cudaFreeHost(htmp); + } + else { // fallback + std::sort(in, in + len); + for (auto i = 0; i < len; i += step) { // + res.push_back(in[i]); + } + res.push_back(in[len - 1]); + } + + return res; + } + + template + static extrema_result_t get_maxmin_rng(Data* d_data, size_t len) + { + if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::thrust) { + auto t0 = hires::now(); + // ------------------------------------------------------------ + thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_data); + + auto max_el_loc = thrust::max_element(g_ptr, g_ptr + len); // excluding padded + auto min_el_loc = thrust::min_element(g_ptr, g_ptr + len); // excluding padded + + double max_val = *max_el_loc; + double min_val = *min_el_loc; + double rng = max_val - min_val; + // ------------------------------------------------------------ + auto t1 = hires::now(); + + return extrema_result_t{max_val, min_val, rng, static_cast(t1 - t0).count()}; + } + else { + throw std::runtime_error("Analyzer::get_maxmin_rng() Other policy and method not implemented."); + } + } + + template + static void get_histogram(UInt* data, size_t data_len, unsigned int* freq, size_t num_bins) + { + // TODO static check UInt + if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::cuda_native) { + float dummy; + launch_histogram(data, data_len, freq, num_bins, dummy); + } + else { + // TODO static check + throw std::runtime_error("Analyzer::get_histogram() using other policy or method not implemented."); + } + } + + Analyzer& estimate_compressibility_from_histogram(unsigned int* h_freq, size_t dict_size) + { + auto len = std::accumulate(h_freq, h_freq + dict_size, 0u); // excluding outlier + auto top1_freq = *std::max_element(h_freq, h_freq + dict_size); + double top1_prob = (1.0 * top1_freq) / (1.0 * len); + double entropy = 0.0; + for (auto i = 0; i < dict_size; i++) { + double p = h_freq[i] / (1.0 * len); + if (p != 0) entropy += -std::log2(p) * p; + } + double r_lowerbound = 1 - (-std::log2(top1_prob) * top1_prob - std::log2(1 - top1_prob) * (1 - top1_prob)); + double r_upperbound = top1_prob + 0.086; // [Gallager 78] + double avgb_lowerbound = entropy + r_lowerbound; + double avgb_upperbound = entropy + r_upperbound; + + // dropout + // auto equiv_bitlen_dropout_2x = 64 * (1 - top1_prob); + // auto equiv_bitlen_dropout_1_5x = 48 * (1 - top1_prob); + + // record + theory.len = len; + theory.hist.entropy = entropy; + theory.hist.top1_freq = top1_freq; + theory.hist.top1_prob = top1_prob; + theory.huffman_theory.r_lowerbound = r_lowerbound; + theory.huffman_theory.r_upperbound = r_upperbound; + theory.huffman_theory.avgb_lowerbound = avgb_lowerbound; + theory.huffman_theory.avgb_upperbound = avgb_upperbound; + + return *this; + }; + + template + Analyzer& + get_stat_from_huffman_book(const unsigned int* h_freq, const Huff* h_codebook, size_t len, size_t num_bins) + { + // real-bitlen, for reference only, not part of workflow + std::vector v_canon_cb(h_codebook, h_codebook + num_bins); + std::vector v_freq(h_freq, h_freq + 
num_bins); + + // TODO somewhere explicitly state that null codeword is of length 0xff + std::sort(v_canon_cb.begin(), v_canon_cb.end(), [](Huff& a, Huff& b) { + auto a_bits = reinterpret_cast*>(&a)->bits; + auto b_bits = reinterpret_cast*>(&b)->bits; + return a_bits < b_bits; + }); + std::sort(v_freq.begin(), v_freq.end(), std::greater()); + + double real_avgb = 0.0; + for (auto i = 0; i < num_bins; i++) { + if (v_freq[i] != 0) { + auto bits = reinterpret_cast*>(&v_canon_cb[i])->bits; + real_avgb += v_freq[i] * bits; + } + } + real_avgb /= len; + + theory.huffman_stat.avgb = real_avgb; + theory.huffman_stat.min_bitlen = + reinterpret_cast*>(&v_canon_cb.at(0))->bits; + + return *this; + } + + Analyzer& + print_compressibility(bool print_huffman_stat = false, bool print_dropout = false, double equiv_origin_bitlen = 32) + { + cout << "\n\e[31m"; // extra linebreak on start + + cout << "* Derived from histogram:" << '\n'; + cout << " - len (freq sum):\t" << theory.len << '\n'; + cout << " - entropy H(X):\t" << theory.hist.entropy << '\n'; + cout << " - most likely freq:\t" << theory.hist.top1_freq << '\n'; + cout << " - most likely prob (p1):\t" << theory.hist.top1_prob << '\n'; + cout << '\n'; + + if (theory.hist.top1_prob < 0.4) { + cout << "* The probability of the most likely symbol < 0.4, go recoding (Huffman)." << '\n'; + cout << "* Compressibility lower bound is for reference only." << '\n'; + cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; + cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound + << '\n'; + cout << " - est. CR lower bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; + cout << '\n'; + } + else { + cout << "* Compressibility upper bound is determined by the lower bound of average bitlength." << '\n'; + cout << " - est. redundancy lower bound (p1 > 0.4):\t" << theory.huffman_theory.r_lowerbound << '\n'; + cout << " - est. avg.bitlen lower bound (p1 > 0.4):\t" << theory.huffman_theory.avgb_lowerbound << '\n'; + cout << " - est. CR upper bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_lowerbound << '\n'; + cout << '\n'; + + cout << "* Compressibility lower bound is for reference only." << '\n'; + cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; + cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound + << '\n'; + cout << " - est. 
CR lower bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; + cout << '\n'; + + if (print_dropout) { + auto dropout_equiv_bitlen_2x = theory.hist.dropout_equiv_bitlen_2x(); + auto dropout_equiv_bitlen_1_5x = theory.hist.dropout_equiv_bitlen_1_5x(); + // TODO determine path, print log + cout << "* Considering dropout:" << '\n'; + cout << " - dropout at 1.0x metadata overhead" << '\n'; + cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_2x << '\n'; + cout << " | reduction rate:\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_2x) << '\n'; + cout << " | bitlen_dropout <= bitlen_enc?\t" + << (dropout_equiv_bitlen_2x <= theory.huffman_theory.avgb_lowerbound) << '\n'; + cout << " - dropout at 0.5x metadata overhead" << '\n'; + cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_1_5x << '\n'; + cout << " | reduction rate (fp32):\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_1_5x) << '\n'; + cout << " | bitlen_dropout <= bitlen_enc?\t" + << (dropout_equiv_bitlen_1_5x <= theory.huffman_theory.avgb_lowerbound) << '\n'; + cout << '\n'; + } + } + + if (print_huffman_stat) { + cout << "* From Huffman codebook:" << '\n'; + cout << " - avg. bitlen:\t" << theory.huffman_stat.avgb << '\n'; + cout << " - shortest bitlen:\t" << theory.huffman_stat.min_bitlen << '\n'; + cout << '\n'; + } + cout << "\e[0m"; + + return *this; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/cli/document.hh b/qtensor/compression/cusz/include/cli/document.hh new file mode 100644 index 00000000..ed68bdf5 --- /dev/null +++ b/qtensor/compression/cusz/include/cli/document.hh @@ -0,0 +1,272 @@ +/** + * @file document.hh + * @author Jiannan Tian + * @brief + * @version 0.1.1 + * @date 2020-09-22 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef ARGUMENT_PARSER_DOCUMENT_HH +#define ARGUMENT_PARSER_DOCUMENT_HH + +#include +#include + + +const std::string fmt_b("\e[1m"); +const std::string fmt_0("\e[0m"); + +const std::regex bful("@(.*?)@"); +const std::string bful_text("\e[1m\e[4m$1\e[0m"); +const std::regex bf("\\*(.*?)\\*"); +const std::string bf_text("\e[1m$1\e[0m"); +const std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); +const std::string ul_text("\e[4m$1\e[0m"); +const std::regex red(R"(\^\^(.*?)\^\^)"); +const std::string red_text("\e[31m$1\e[0m"); + +std::string // +Format(const std::string& s) +{ + auto a = std::regex_replace(s, bful, bful_text); + auto b = std::regex_replace(a, bf, bf_text); + auto c = std::regex_replace(b, ul, ul_text); + auto d = std::regex_replace(c, red, red_text); + return d; +} + +static const char cusz_short_doc[] = + // "cusz, version [placeholder]\n" + "\n" + "usage: cusz [-zxrh] [-i file] [-t dtype] [-m mode] [-e eb] [-l x,y,z] " + "...\n" + "\n" + " z : zip/compress\n" + " x : unzip/decompress\n" + " r : dryrun\n" + " h : print full-length help document\n" + "\n" + " i file : path to input datum\n" + " t dtype : f32 or fp4 (to be updated)\n" + " m mode : compression mode; abs, r2r\n" + " e eb : error bound; default 1e-4\n" + " l size : \"-l x\" for 1D; \"-l [X]x[Y]\" for 2D; \"-l [X]x[Y]x[Z]\" for 3D\n" + // " p pred : select predictor from \"lorenzo\" and \"spline3d\"\n" + "\n" + " config list:\n" + " syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" + " + eb error bound\n" + " + radius The number of quant-codes is 2x radius.\n" + " + demo load predefined lengths for demo datasets\n" + " - skipping \"-l x[,y[,z]]\"\n" + " - (1D) hacc hacc1b (2D) 
cesm exafel\n" + " - (3D) hurricane nyx-s nyx-m qmc qmcpre rtm parihaka\n" + " + anchor (on|off)\n" + // " + pipeline auto, binary, radius\n" + " example: \"--config demo=cesm,radius=512\"\n" + " report list: \n" + " syntax: opt[=v], \"kw1[=(on|off)],kw2[=(on|off)]\n" + " keyworkds: time, quality\n" + " example: \"--report time\", \"--report time=off\"\n" + "\n" + "example:\n" + " CESM=./data/cesm-CLDHGH-3600x1800\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" + " cusz -i ${CESM}.cusza -x --report time --compare ${CESM}\n" + "\n" + "\"cusz -h\" for details.\n"; + +static const char cusz_full_doc[] = + "*NAME*\n" + " cuSZ: CUDA-Based Error-Bounded Lossy Compressor for Scientific Data\n" + " Lowercased \"*cusz*\" is the command." + "\n" + "*SYNOPSIS*\n" + " The basic use is listed below,\n" + " *cusz* *-t* f32 *-m* r2r *-e* 1.0e-4.0 *-i* ./data/cesm-CLDHGH-3600x1800 *-l* 3600,1800 *-z* *--report* " + "time\n" + // cusz -t f32 -m r2r -e 1.0e-4.0 -i ./data/cesm-CLDHGH-3600x1800 -l 3600x1800 -z --report time\n + " ^^------ ------ ----------- ------------------------------- ------------ | ^^\n" + " ^^ dtype mode error bound input file low-to-high zip ^^\n" + "\n" + " *cusz* *-i* ./data/cesm-CLDHGH-3600x1800.cusza *-x* *--compare* ./data/cesm-CLDHGH-3600x1800 *--report* " + "time\n" + // cusz -i ./data/cesm-CLDHGH-3600x1800.cusza -x --compare ./data/cesm-CLDHGH-3600x1800 --report + // time\n" + " ^^------------------------------------- | ^^\n" + " ^^ compressed file unzip ^^\n" + "\n" + " *cusz* *-t* f32|64 *-m* [eb mode] *-e* [eb] *-i* [datum file] *-l* [x[,y[,z]]] *-z*\n" + " *cusz* *-i* [basename].cusza *-x*\n" + "\n" + "*OPTIONS*\n" + " *Mandatory* (zip and dryrun)\n" + " *-z* or *--compress* or *--*@z@*ip*\n" + " *-r* or *--dry-*@r@*un*\n" + " No lossless Huffman codec. Only to get data quality summary.\n" + " In addition, quant. rep. and dict. size are retained\n" + "\n" + " *-m* or *--*@m@*ode* \n" + " Specify error-controlling mode. Supported modes include:\n" + " _abs_: absolute mode, eb = input eb\n" + " _r2r_: relative-to-value-range mode, eb = input eb x value range\n" + "\n" + " *-e* or *--eb* or *--error-bound* [num]\n" + " Specify error bound. e.g., _1.23_, _1e-4_, _1.23e-4.56_\n" + "\n" + " *-i* or *--*@i@*nput* [file]\n" + "\n" + " *-d* or *--dict-size* [256|512|1024|...]\n" + " Specify dictionary size/quantization bin number.\n" + " Should be a power-of-2.\n" + "\n" + " *-l* [x[,y[,z]]] Specify (1|2|3)D data size, with dimensions from low to high.\n" + "\n" + " *Mandatory* (unzip)\n" + " *-x* or *--e*@x@*tract* or *--decompress* or *--unzip*\n" + "\n" + " *-i* or *--*@i@*nput* [corresponding datum basename (w/o extension)]\n" + "\n" + " *Additional*\n" + " *-p* or *--*@p@*redictor*\n" + " Select predictor from \"lorenzo\" (default) or \"spline3d\" (3D only).\n" + " *--origin* or *--compare* /path/to/origin-datum\n" + " For verification & get data quality evaluation.\n" + " *--opath* /path/to\n" + " Specify alternative output path.\n" + "\n" + " *Modules*\n" + " *--skip* _module-1_,_module-2_,...,_module-n_,\n" + " Disable functionality modules. Supported module(s) include:\n" + " _huffman_ Huffman codec after prediction+quantization (p+q) and before reversed p+q.\n" + " _write2disk_ Skip write decompression data.\n" + // "\n" + // " *-p* or *--pre* _method-1_,_method-2_,...,_method-n_\n" + // " Enable preprocessing. 
Supported preprocessing method(s) include:\n" + // " _binning_ Downsampling datum by 2x2 to 1.\n" + "\n" + " *Print Report to stdout*\n" + " *--report* (option=on/off)-list\n" + " Syntax: opt[=v], \"kw1[=(on|off)],kw2=[=(on|off)]\n" + " Keyworkds: time quality compressibility\n" + " Example: \"--report time\", \"--report time=off\"\n" + "\n" + " *Demonstration*\n" + " *-h* or *--help*\n" + " Get help documentation.\n" + "\n" + // " *-V* or *--verbose*\n" + // " Print host and device information for diagnostics.\n" + // "\n" + // " *-M* or *--meta*\n" + // " Get archive metadata. (TODO)\n" + "\n" + " *Advanced Runtime Configuration*\n" + " *--demo* [demo-dataset]\n" + " Use demo dataset, will omit given dimension(s). Supported datasets include:\n" + " 1D: _hacc_ _hacc1b_ 2D: _cesm_ _exafel_\n" + " 3D: _hurricane_ _nyx-s_ _nyx-m_ _qmc_ _qmcpre_ _rtm_ _parihaka_\n" + "\n" + " *-c* or *--config* (option=value)-list\n" + " Syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" + " + *eb*= error bound\n" + " + *cap*= capacity, number of quant-codes\n" + " + *demo*= skip length input (\"-l x[,y[,z]]\"), alternative to \"--demo dataset\"\n" + "\n" + " Other internal parameters:\n" + " + *quantbyte*=<1|2>\n" + " Specify quantization code representation.\n" + " Options _1_, _2_ are for *1-* and *2-*byte, respectively. (default: 2)\n" + " ^^Manually specifying this may not result in optimal memory footprint.^^\n" + " + *huffbyte*=<4|8>\n" + " Specify Huffman codeword representation.\n" + " Options _4_, _8_ are for *4-* and *8-*byte, respectively. (default: 4)\n" + " ^^Manually specifying this may not result in optimal memory footprint.^^\n" + " + *huffchunk*=[256|512|1024|...]\n" + " Manually specify chunk size for Huffman codec, overriding autotuning.\n" + " Should be a power-of-2 that is sufficiently large.\n" + " ^^This affects Huffman decoding performance significantly.^^\n" + "\n" + "*EXAMPLES*\n" + " *Demo Datasets*\n" + " Set a *shell variable*:\n" + " export PATH=$(pwd)/bin:$PATH\n" + " CESM=./data/cesm-CLDHGH-3600x1800\n" + " HURR=./data/hurr-CLOUDf48-500x500x100\n" + "\n" + " *CESM* example:\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -r\n" + " cusz -i ${CESM}.cusza -x --report time --compare ${CESM} --skip write2disk\n" + "\n" + " *CESM* example with specified output path:\n" + " mkdir data2 data3\n" + " ^^# zip, output to `data2`^^\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --opath data2\n" + " ^^# unzip, in situ^^\n" + " cusz -i ${CESM}.cusza -x && ls data2\n" + " ^^# unzip, output to `data3`^^\n" + " cusz -i ${CESM}.cusza -x --opath data3 && ls data3\n" + " ^^# unzip, output to `data3`, compare to the original datum^^\n" + " cusz -i ${CESM}.cusza -x --opath data3 --compare ${CESM} && ls data3\n" + "\n" + " *Hurricane Isabel* example:\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -z\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -r\n" + " cusz -i ${HURR}.cusza -x\n" + "\n"; + +// TODO +// " *EXAFEL* example:\n" +// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning\n" +// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning " +// "--skip huffman\n" +// " cusz -i ./data/exafel-59200x388.BN.cusza -x\n"; + +static const char huff_re_short_doc[] = + "\n" + "OVERVIEW: Huffman submodule as standalone program\n" // TODO from this line on + "\n" + "USAGE:\n" + " The basic use with demo 
datum is listed below,\n" + " ./huff --encode --decode --verify --input ./baryon_density.dat.b16 \\\n" + " -3 512 512 512 --input-rep 16 --huffman-rep 32 --huffman-chunk 2048 --dict-size 1024\n" + " or shorter\n" + " ./huff -e -d -V -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -H 32 -C 2048 -c 1024\n" + " ^ ^ ^ --------------------------- -------------- ----- ----- ------- -------\n" + " | | | input datum file dimension input Huff. Huff. codebook\n" + " enc dec verify rep. rep. chunk size\n" + "\n" + "EXAMPLES\n" + " Essential:\n" + " ./bin/huff -e -d -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -c 1024\n" + " have to input dimension, and higher dimension for a multiplication of each dim.,\n" + " as default values input-rep=16 (bits), huff-rep=32 (bits), codebook-size=1024 (symbols)\n" + "\n"; + +static const char doc_dim_order[] = + "\n" + " Input dimension follows low-to-high (e.g., x-y-z) order.\n" + " Taking 2D CESM-ATM as an example, \n" + "\n" + " |<------------------------- x 3600 --------------------------->| \n" + " +--------------------------------------------------------------+ - \n" + " | | ^ \n" + " | | | \n" + " | CESM-ATM: 1800x3600 (y-x order) | | \n" + " | datum name: _1800_3600 | y \n" + " | | 1800 \n" + " | input: -l 3600,1800 | | \n" + " | input order: -l [x,y] | | \n" + " | | | \n" + " | | v \n" + " +--------------------------------------------------------------+ - \n" + "\n" + " Taking 3D Hurricane as another example, whose dimensions are\n" + " 100x500x500, the input is \"-l 500,500,100\".\n"; + +#endif diff --git a/qtensor/compression/cusz/include/cli/quality_viewer.hh b/qtensor/compression/cusz/include/cli/quality_viewer.hh new file mode 100644 index 00000000..eb8a27c2 --- /dev/null +++ b/qtensor/compression/cusz/include/cli/quality_viewer.hh @@ -0,0 +1,163 @@ +/** + * @file quality_viewer.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-09 + * @deprecated 0.3.2 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef QUALITY_VIEWER_HH +#define QUALITY_VIEWER_HH + +// 22-11-20 would fail in cxxapi.cu if deleted +#include + +#include "../common/capsule.hh" +#include "../common/definition.hh" +#include "../header.h" +#include "../stat/compare_gpu.hh" +#include "verify.hh" + +namespace cusz { + +const static auto HOST = cusz::LOC::HOST; +const static auto DEVICE = cusz::LOC::DEVICE; +const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; + +struct QualityViewer { + template + static void print_metrics_cross(cusz_stats* s, size_t compressed_bytes = 0, bool gpu_checker = false) + { + auto checker = (not gpu_checker) ? string("(using CPU checker)") : string("(using GPU checker)"); + auto bytes = (s->len * sizeof(Data) * 1.0); + + auto println = [](const char* s, double n1, double n2, double n3, double n4) { + printf(" %-10s %16.8g %16.8g %16.8g %16.8g\n", s, n1, n2, n3, n4); + }; + auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { + printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); + }; + + auto is_fp = std::is_same::value or std::is_same::value ? 
const_cast("yes") + : const_cast("no"); + printf("\nquality metrics %s:\n", checker.c_str()); + + printhead("", "data-len", "data-byte", "fp-type?", ""); + printf(" %-10s %16zu %16lu %16s\n", "", s->len, sizeof(Data), is_fp); + + printhead("", "min", "max", "rng", "std"); + println("origin", s->odata.min, s->odata.max, s->odata.rng, s->odata.std); + println("eb-lossy", s->xdata.min, s->xdata.max, s->xdata.rng, s->xdata.std); + + printhead("", "abs-val", "abs-idx", "pw-rel", "VS-RNG"); + println("max-error", s->max_err.abs, s->max_err.idx, s->max_err.pwrrel, s->max_err.rel); + + printhead("", "CR", "NRMSE", "cross-cor", "PSNR"); + println("metrics", bytes / compressed_bytes, s->reduced.NRMSE, s->reduced.coeff, s->reduced.PSNR); + + // printf("\n"); + }; + + static void print_metrics_auto(double* lag1_cor, double* lag2_cor) + { + auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { + printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); + }; + + printhead("", "lag1-cor", "lag2-cor", "", ""); + printf(" %-10s %16lf %16lf\n", "auto", *lag1_cor, *lag2_cor); + printf("\n"); + }; + + template + static void echo_metric_gpu(T* reconstructed, T* origin, size_t len, size_t compressed_bytes = 0) + { + // cross + auto stat_x = new cusz_stats; + psz::thrustgpu_assess_quality(stat_x, reconstructed, origin, len); + print_metrics_cross(stat_x, compressed_bytes, true); + + auto stat_auto_lag1 = new cusz_stats; + psz::thrustgpu_assess_quality(stat_auto_lag1, origin, origin + 1, len - 1); + auto stat_auto_lag2 = new cusz_stats; + psz::thrustgpu_assess_quality(stat_auto_lag2, origin, origin + 2, len - 2); + + print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); + } + + template + static void echo_metric_cpu(T* _d1, T* _d2, size_t len, size_t compressed_bytes = 0, bool from_device = true) + { + auto stat = new cusz_stats; + T* reconstructed; + T* origin; + if (not from_device) { + reconstructed = _d1; + origin = _d2; + } + else { + printf("allocating tmp space for CPU verification\n"); + auto bytes = sizeof(T) * len; + cudaMallocHost(&reconstructed, bytes); + cudaMallocHost(&origin, bytes); + cudaMemcpy(reconstructed, _d1, bytes, cudaMemcpyDeviceToHost); + cudaMemcpy(origin, _d2, bytes, cudaMemcpyDeviceToHost); + } + cusz::verify_data(stat, reconstructed, origin, len); + print_metrics_cross(stat, compressed_bytes, false); + + auto stat_auto_lag1 = new cusz_stats; + verify_data(stat_auto_lag1, origin, origin + 1, len - 1); + auto stat_auto_lag2 = new cusz_stats; + verify_data(stat_auto_lag2, origin, origin + 2, len - 2); + + print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); + + if (from_device) { + if (reconstructed) cudaFreeHost(reconstructed); + if (origin) cudaFreeHost(origin); + } + } + + template + static void load_origin(string const& fname, Capsule& origin) + { + origin.mallochost().malloc().fromfile(fname); + } + + template + static void view(header_t header, Capsule& xdata, Capsule& cmp, string const& compare) + { + auto len = ConfigHelper::get_uncompressed_len(header); + auto compressd_bytes = ConfigHelper::get_filesize(header); + + auto compare_on_gpu = [&]() { + cmp.mallochost().malloc().fromfile(compare).host2device(); + echo_metric_gpu(xdata.dptr(), cmp.dptr(), len, compressd_bytes); + cmp.freehost().free(); + }; + + auto compare_on_cpu = [&]() { + cmp.mallochost().fromfile(compare); + xdata.device2host(); + echo_metric_cpu(xdata.hptr(), cmp.hptr(), len, 
compressd_bytes); + cmp.freehost(); + }; + + if (compare != "") { + auto gb = 1.0 * sizeof(T) * len / 1e9; + if (gb < 0.8) + compare_on_gpu(); + else + compare_on_cpu(); + } + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/cli/query.hh b/qtensor/compression/cusz/include/cli/query.hh new file mode 100644 index 00000000..c09326c8 --- /dev/null +++ b/qtensor/compression/cusz/include/cli/query.hh @@ -0,0 +1,71 @@ +/** + * @file query.hh + * @author Jiannan Tian + * @brief query machine information + * @version 0.1.3 + * @date 2020-10-05 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef QUERY_HH +#define QUERY_HH + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "query_dev.hh" + +struct Diagnostics { + static std::string ExecShellCommand(const char* cmd) + { + std::array buffer; + std::string result; + std::unique_ptr pipe(popen(cmd, "r"), pclose); + if (!pipe) { throw std::runtime_error("popen() failed!"); } + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); } + return result; + } + + static void GetMachineProperties() + { + std::vector v; + std::cout << "host information: " << std::endl; + + auto cpuinfo = ExecShellCommand( // + std::string("cat /proc/cpuinfo " + "| grep \"model name\" " + "| head -n 1 " + "| awk -F': ' '{print $NF}'") + .c_str()); + std::cout << " cpu model\t" << cpuinfo; + + auto meminfo = ExecShellCommand( // + std::string("cat /proc/meminfo" + "| grep \"MemTotal\" " + "| awk -F' ' '{print $2\" \"$3}'") + .c_str()); + + std::cout << " memory size\t" << meminfo; + + auto endianness = ExecShellCommand( // + std::string("lscpu " + "| grep Endian " + "| awk -F' ' '{print $NF}'") + .c_str()); + + std::cout << " byte order\t" << endianness; + printf("\n"); + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/cli/query_dev.hh b/qtensor/compression/cusz/include/cli/query_dev.hh new file mode 100644 index 00000000..34a429ea --- /dev/null +++ b/qtensor/compression/cusz/include/cli/query_dev.hh @@ -0,0 +1,69 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* This sample queries the properties of the CUDA devices present in the system + * via CUDA Runtime API. */ + +/** + * @brief Get the Device Property object + * modified from `cuda-samples/Samples/deviceQuery/deviceQuery.cpp` + */ + +struct GpuDiagnostics { + static void GetDeviceProperty() + { + int num_dev = 0; + cudaError_t error_id = cudaGetDeviceCount(&num_dev); + + if (error_id != cudaSuccess) { + printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast(error_id), cudaGetErrorString(error_id)); + exit(EXIT_FAILURE); + } + if (num_dev == 0) { printf("NO CUDA device detected.\n"); } + int dev, driver_ver = 0, runtime_ver = 0; + + for (dev = 0; dev < num_dev; ++dev) { + cudaSetDevice(dev); + cudaDeviceProp dev_prop; + cudaGetDeviceProperties(&dev_prop, dev); + printf("device #%d, %s: \n", dev, dev_prop.name); + + cudaDriverGetVersion(&driver_ver); + cudaRuntimeGetVersion(&runtime_ver); + printf( + " driver/runtime\t%d.%d/%d.%d\n", driver_ver / 1000, (driver_ver % 100) / 10, runtime_ver / 1000, + (runtime_ver % 100) / 10); + printf(" compute capability:\t%d.%d\n", dev_prop.major, dev_prop.minor); + printf(" global memory:\t%.0f MiB\n", static_cast(dev_prop.totalGlobalMem / 1048576.0f)); + printf(" constant memory:\t%zu bytes\n", dev_prop.totalConstMem); + printf(" shared mem per block:\t%zu bytes\n", dev_prop.sharedMemPerBlock); + printf(" shared mem per SM:\t%zu bytes\n", dev_prop.sharedMemPerMultiprocessor); + printf(" registers per block:\t%d\n", dev_prop.regsPerBlock); + } + printf("\n"); + } +}; \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cli/timerecord_viewer.hh b/qtensor/compression/cusz/include/cli/timerecord_viewer.hh new file mode 100644 index 00000000..52baac95 --- /dev/null +++ b/qtensor/compression/cusz/include/cli/timerecord_viewer.hh @@ -0,0 +1,109 @@ +/** + * @file timerecord_viewer.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-09 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CLI_TIMERECORD_VIEWER_HH +#define CLI_TIMERECORD_VIEWER_HH + +#include +#include "../common/definition.hh" + +namespace cusz { + +struct TimeRecordViewer { + static float get_throughput(float milliseconds, size_t bytes) + { + auto GiB = 1.0 * 1024 * 1024 * 1024; + auto seconds = milliseconds * 1e-3; + return bytes / GiB / seconds; + } + + static void println_throughput(const char* s, float timer, size_t bytes) + { + if (timer == 0.0) return; + + auto t = get_throughput(timer, bytes); + printf(" %-12s %'12f %'10.2f\n", s, timer, t); + }; + + static void println_throughput_tablehead() + { + printf( + "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // + const_cast("kernel"), // + const_cast("time, ms"), // + const_cast("GiB/s") // + ); + } + + static double get_total_time(timerecord_t r) + { + double total = 0.0; + std::for_each(r->begin(), r->end(), [&](TimeRecordTuple t) { return total += std::get<1>(t); }); + return total; + } + static void 
view_compression(timerecord_t r, size_t bytes, size_t compressed_bytes = 0) + { + auto report_cr = [&]() { + auto cr = 1.0 * bytes / compressed_bytes; + if (compressed_bytes != 0) printf(" %-*s %.2f\n", 20, "compression ratio", cr); + }; + + TimeRecord reflow; + + { // reflow + TimeRecordTuple book_tuple; + + auto total_time = get_total_time(r); + auto subtotal_time = total_time; + + for (auto& i : *r) { + auto item = std::string(std::get<0>(i)); + if (item == "book") { + book_tuple = i; + subtotal_time -= std::get<1>(i); + } + else { + reflow.push_back(i); + } + } + reflow.push_back({const_cast("(subtotal)"), subtotal_time}); + printf("\e[2m"); + reflow.push_back(book_tuple); + reflow.push_back({const_cast("(total)"), total_time}); + printf("\e[0m"); + } + + printf("\n(c) COMPRESSION REPORT\n"); + report_cr(); + + ReportHelper::println_throughput_tablehead(); + for (auto& i : reflow) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); + + printf("\n"); + } + + static void view_decompression(timerecord_t r, size_t bytes) + { + printf("\n(d) deCOMPRESSION REPORT\n"); + + auto total_time = get_total_time(r); + (*r).push_back({const_cast("(total)"), total_time}); + + ReportHelper::println_throughput_tablehead(); + for (auto& i : *r) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); + + printf("\n"); + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/cli/verify.hh b/qtensor/compression/cusz/include/cli/verify.hh new file mode 100644 index 00000000..621a0077 --- /dev/null +++ b/qtensor/compression/cusz/include/cli/verify.hh @@ -0,0 +1,87 @@ +#ifndef ANALYSIS_VERIFY_HH +#define ANALYSIS_VERIFY_HH + +/** + * @file verify.hh + * @author Jiannan Tian + * @brief Verification of decompressed data. + * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-09-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "../common.hh" +#include "../cusz/type.h" + +using namespace std; + +namespace cusz { + +template +void verify_data(cusz_stats* s, T* xdata, T* odata, size_t len) +{ + double max_odata = odata[0], min_odata = odata[0]; + double max_xdata = xdata[0], min_xdata = xdata[0]; + double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); + + double sum_0 = 0, sum_x = 0; + for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; + + double mean_odata = sum_0 / len, mean_xdata = sum_x / len; + double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; + + double max_pwrrel_abserr = 0; + size_t max_abserr_index = 0; + for (size_t i = 0; i < len; i++) { + max_odata = max_odata < odata[i] ? odata[i] : max_odata; + min_odata = min_odata > odata[i] ? odata[i] : min_odata; + + max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; + min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; + + float abserr = fabs(xdata[i] - odata[i]); + if (odata[i] != 0) { + rel_abserr = abserr / fabs(odata[i]); + max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; + } + max_abserr_index = max_abserr < abserr ? i : max_abserr_index; + max_abserr = max_abserr < abserr ? 
abserr : max_abserr; + sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); + sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); + sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); + sum_err2 += abserr * abserr; + } + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + s->len = len; + + s->odata.max = max_odata; + s->odata.min = min_odata; + s->odata.rng = max_odata - min_odata; + s->odata.std = std_odata; + + s->xdata.max = max_xdata; + s->xdata.min = min_xdata; + s->xdata.rng = max_xdata - min_xdata; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = max_pwrrel_abserr; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/common.hh b/qtensor/compression/cusz/include/common.hh new file mode 100644 index 00000000..b2741954 --- /dev/null +++ b/qtensor/compression/cusz/include/common.hh @@ -0,0 +1,19 @@ +/** + * @file common.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_HH +#define CUSZ_COMMON_HH + +#include "common/configs.hh" +#include "common/definition.hh" +#include "common/type_traits.hh" + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/common/capsule.hh b/qtensor/compression/cusz/include/common/capsule.hh new file mode 100644 index 00000000..be1f1f1b --- /dev/null +++ b/qtensor/compression/cusz/include/common/capsule.hh @@ -0,0 +1,402 @@ +/** + * @file capsule.hh + * @author Jiannan Tian + * @brief Simple data analysis (header) + * @version 0.2.3 + * @date 2020-11-03 + * (create) 2020-11-03 (rev1) 2021-03-24 (rev2) 2021-09-08 + * @deprecated 0.3.2 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CAPSULE_HH +#define CAPSULE_HH + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#include +#include + +#include +#include +#include +#include +#include + +#include "../stat/compare_gpu.hh" +// #include "../utils/io.hh" +#include "../utils/timer.hh" +#include "definition.hh" + +template +class Capsule { + private: + // variables + struct { + bool hptr{false}, dptr{false}, uniptr{false}; + } alloc_status; + + T *_dptr{nullptr}, *_hptr{nullptr}, *_uniptr{nullptr}; + + uint32_t _len{0}; + dim3 _len3{1, 1, 1}, _stride3{1, 1, 1}; + + std::string name; + + // logging setup; standalone + const std::string LOG_NULL = " "; + const std::string LOG_INFO = " :: "; + const std::string LOG_ERR = " ERR "; + const std::string LOG_WARN = "WARN "; + const std::string LOG_DBG = " dbg "; + const std::string LOG_EXCEPTION = " !! "; + + // https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 + template + void build_string(std::ostream& o, S t) + { + o << t << " "; + } + + template + void build_string(std::ostream& o, S t, Args... args) // recursive variadic function + { + build_string(o, t); + build_string(o, args...); + } + + template + void LOGGING(const std::string& log_head, Args... 
args) + { + std::ostringstream oss; + oss << log_head; + build_string(oss, args...); + + oss.seekp(0, std::ios::end); + std::stringstream::pos_type offset = oss.tellp(); + if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg + std::cout << oss.str() << std::endl; // print content + if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg + } + + // IO + int fs2mem(const char* fname, void* array, size_t num_els) + { + auto bytes = sizeof(T) * num_els; + + std::ifstream ifs(fname, std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + return -1; + } + ifs.read(reinterpret_cast(array), std::streamsize(bytes)); + ifs.close(); + + return 0; + } + + int mem2fs(const char* fname, void* array, size_t num_els) + { + auto bytes = sizeof(type) * num_els; + + std::ofstream ofs(fname, std::ios::binary | std::ios::out); + if (not ofs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + return -1; + } + + ofs.write(reinterpret_cast(array), std::streamsize(bytes)); + ofs.close(); + + return 0; + } + + std::string ERRSTR_BUILDER(std::string func, std::string msg) + { + return "[Capsule(\"" + name + "\")::" + func + "] " + msg; + } + + void check_len(std::string funcname) + { + if (_len == 0) throw std::runtime_error("[Capsule(\"" + name + "\")::" + funcname + "] " + "len == 0"); + } + + std::string ERROR_UNDEFINED_BEHAVIOR(std::string func, std::string msg = "undefined behavior") + { // + return ERRSTR_BUILDER(func, "undefined behavior"); + } + + public: + using type = T; + + // TODO rule of n + // constructor + Capsule() = default; + Capsule(const std::string _str) : name(_str){}; + Capsule(uint32_t len, const std::string _str = std::string("")) : _len(len), name(_str) {} + Capsule(uint32_t x, uint32_t y, uint32_t z, const std::string _str = std::string("")) : name(_str) + { + _len3 = dim3(x, y, z); + _len = x * y * z; + } + + ~Capsule() + { + // Becasue _hptr can be obtained externally, and could be non-pinned, cudaFreeHost may not work properly. 
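+        // Host memory is therefore never released here: buffers attached via set_hptr()
+        // belong to the caller, and buffers obtained through mallochost() are expected to
+        // be released explicitly with freehost().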
+ // if (alloc_status.hptr) cudaFreeHost(_hptr); + + if (alloc_status.dptr) cudaFree(_dptr); + if (alloc_status.uniptr) cudaFree(_uniptr); + } + + // getter start -------------------- + T*& dptr() { return _dptr; } + T*& hptr() { return _hptr; } + T*& uniptr() { return _uniptr; } + + uint32_t len() const { return _len; } + dim3 len3() const { return _len3; } + dim3 stride3() const { return _stride3; } + // 1D + T& dptr(uint32_t i) { return _dptr[i]; } + T& hptr(uint32_t i) { return _hptr[i]; } + T& uniptr(uint32_t i) { return _uniptr[i]; } + // 2D + T& dptr(uint32_t x, uint32_t y) { return _dptr[x + y * _stride3.y]; } + T& hptr(uint32_t x, uint32_t y) { return _hptr[x + y * _stride3.y]; } + T& uniptr(uint32_t x, uint32_t y) { return _uniptr[x + y * _stride3.y]; } + // 3D + T& dptr(uint32_t x, uint32_t y, uint32_t z) { return _dptr[x + y * _stride3.y + z * _stride3.z]; } + T& hptr(uint32_t x, uint32_t y, uint32_t z) { return _hptr[x + y * _stride3.y + z * _stride3.z]; } + T& uniptr(uint32_t x, uint32_t y, uint32_t z) { return _uniptr[x + y * _stride3.y + z * _stride3.z]; } + // getter end ----------------------- + + // setter start --------------------- + Capsule& set_hptr(T* ptr) + { + _hptr = ptr, alloc_status.hptr = true; + return *this; + } + Capsule& set_dptr(T* ptr) + { + _dptr = ptr, alloc_status.dptr = true; + return *this; + } + Capsule& set_uniptr(T* ptr) + { + _uniptr = ptr, alloc_status.uniptr = true; + return *this; + } + + // variable len + Capsule& set_len(uint32_t len) + { + if (len <= 0) throw std::runtime_error("length must be greater than 0"); + _len = len; + return *this; + } + + Capsule& set_len3(uint32_t x, uint32_t y = 1, uint32_t z = 1) + { + if (x == 1) throw std::runtime_error("x must be > 1."); + if (x * y * z == 0) throw std::runtime_error("x, y, z must be non-zero."); + + _len3 = dim3(x, y, z); + _stride3 = dim3(1, x, x * y); + _len = x * y * z; + + return *this; + } + // setter end ---------------------- + + // debug + void debug() + { + printf("Capsule debugging information\n"); + printf(" name : %s\n", name.c_str()); + printf(" len : %u\n", len()); + printf(" hptr : %s\n", alloc_status.hptr ? "set" : "not set"); + printf(" dptr : %s\n", alloc_status.dptr ? "set" : "not set"); + printf(" uniptr : %s\n", alloc_status.uniptr ? 
"set" : "not set"); + } + + // for debugging + Capsule& set_name(std::string _str) + { + name = _str; + return *this; + } + + // IO + Capsule& fromfile(std::string fname, double* time = nullptr) + { + if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "_hptr not set")); + if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "len == 0")); + + auto a = hires::now(); + fs2mem(fname.c_str(), _hptr, _len); + auto z = hires::now(); + + if (time) *time = static_cast(z - a).count(); + + return *this; + } + + Capsule& tofile(std::string fname, double* time = nullptr) + { + if (not _hptr) { throw std::runtime_error(ERRSTR_BUILDER("tofile", "_hptr not set")); } + if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("tofile", "len == 0")); + + auto a = hires::now(); + mem2fs(fname.c_str(), _hptr, _len); + auto z = hires::now(); + + if (time) *time = static_cast(z - a).count(); + + return *this; + } + + uint32_t nbyte() const { return _len * sizeof(T); } + + // memcpy h2d, synchronous + Capsule& host2device() + { + check_len("host2device"); + + cudaMemcpy(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice); + return *this; + } + // memcpy d2h, synchronous + Capsule& device2host() + { + check_len("device2host"); + + cudaMemcpy(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost); + return *this; + } + // memcpy h2d, asynchronous + Capsule& host2device_async(cudaStream_t stream) + { + check_len("host2device_async"); + + cudaMemcpyAsync(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice, stream); + return *this; + } + // memcpy d2h, asynchronous + Capsule& device2host_async(cudaStream_t stream) + { + check_len("device2host_async"); + + cudaMemcpyAsync(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost, stream); + return *this; + } + // shorthand + Capsule& h2d() { return host2device(); } + Capsule& d2h() { return device2host(); } + Capsule& async_h2d(cudaStream_t stream) { return host2device_async(stream); } + Capsule& async_d2h(cudaStream_t stream) { return device2host_async(stream); } + + // cudaMalloc wrapper + Capsule& malloc(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("malloc"); + + if (alloc_status.dptr) + LOGGING(LOG_WARN, "already allocated on device"); + else { + cudaMalloc(&_dptr, nbyte()); + cudaMemset(_dptr, memset_val, nbyte()); + alloc_status.dptr = true; + } + return *this; + } + // cudaMallocHost wrapper, pinned + Capsule& mallochost(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("mallochost"); + + if (alloc_status.hptr) + LOGGING(LOG_WARN, "already allocated on host"); + else { + cudaMallocHost(&_hptr, nbyte()); + memset(_hptr, memset_val, nbyte()); + alloc_status.hptr = true; + } + return *this; + } + // cudaMallocManaged wrapper + Capsule& mallocmanaged(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("mallocmanaged"); + + if (alloc_status.uniptr) + LOGGING(LOG_WARN, "already allocated as unified"); + else { + cudaMallocManaged(&_uniptr, nbyte()); + cudaMemset(_uniptr, memset_val, nbyte()); + alloc_status.uniptr = true; + } + return *this; + } + // cudaFree wrapper + Capsule& free() + { + if (not _dptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_dptr is null")); + cudaFree(_dptr); + alloc_status.dptr = false; + return *this; + } + // cudaFreeHost wrapper + Capsule& freehost() + { + if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_hptr is null")); + cudaFreeHost(_hptr); + alloc_status.hptr = false; + return *this; + } + // cudaFree wrapper, but for unified memory + Capsule& freemanaged() + { + if (not 
_uniptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_uniptr is null")); + cudaFree(_uniptr); + alloc_status.uniptr = false; + return *this; + } + + private: + double maxval, minval, rng; + + public: + double get_maxval() { return maxval; } + double get_minval() { return minval; } + double get_rng() { return rng; } + + // data scan + Capsule& prescan(double& max_value, double& min_value, double& rng) + { + // may not work for _uniptr + T result[4]; + psz::thrustgpu_get_extrema_rawptr(_dptr, _len, result); + + min_value = result[0]; + max_value = result[1]; + rng = max_value - min_value; + + return *this; + } + // data scan + Capsule& prescan() + { + prescan(maxval, minval, rng); + return *this; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/common/configs.hh b/qtensor/compression/cusz/include/common/configs.hh new file mode 100644 index 00000000..d9a0bd39 --- /dev/null +++ b/qtensor/compression/cusz/include/common/configs.hh @@ -0,0 +1,354 @@ +/** + * @file configs.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_CONFIGS_HH +#define CUSZ_COMMON_CONFIGS_HH + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../header.h" +#include "definition.hh" + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +struct Reinterpret1DTo2D { + template + static T get_square_size(T len) + { + return static_cast(ceil(sqrt(len))); + } +}; + +struct Align { + template + static size_t get_aligned_datalen(size_t len) + { + if CONSTEXPR (ad == cusz::ALIGNDATA::NONE) return len; + if CONSTEXPR (ad == cusz::ALIGNDATA::SQUARE_MATRIX) { + auto m = Reinterpret1DTo2D::get_square_size(len); + return m * m; + } + } + + static const int DEFAULT_ALIGN_NBYTE = 128; + + template + static inline bool is_aligned_at(const void* ptr) + { // + return reinterpret_cast(ptr) % NUM == 0; + }; + + template + static size_t get_aligned_nbyte(size_t len) + { + return ((sizeof(T) * len - 1) / NUM + 1) * NUM; + } +}; + +// sparsity rate is less that 5% +struct SparseMethodSetup { + // "Density" denotes the degree of non-zeros (nz). 
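+    // Rough worked example (assuming T = float, M = int): for len = 1e6 values at the
+    // default density of 0.25, nnz <= 250,000 and get_csr_nbyte() below yields
+    // 4*(1000+1) + 4*250,000 + 4*250,000 ≈ 2.0 MB, versus 4 MB for the dense input.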
+ static constexpr float default_density = 0.25; // ratio of nonzeros (R_nz) + static constexpr float default_sparsity = 1 - default_density; // ratio of zeros, 1 - R_nz + + static constexpr int default_density_factor = 4; // ratio of nonzeros (R_nz) + + template + static uint32_t get_csr_nbyte(uint32_t len, uint32_t nnz) + { + auto m = Reinterpret1DTo2D::get_square_size(len); + auto nbyte = sizeof(M) * (m + 1) + sizeof(M) * nnz + sizeof(T) * nnz; + return nbyte; + } +}; + +struct HuffmanHelper { + // deprecated + // template + // static uint32_t get_revbook_nbyte(int dict_size) + // { + // constexpr auto TYPE_BITCOUNT = sizeof(BOOK) * 8; + // return sizeof(BOOK) * (2 * TYPE_BITCOUNT) + sizeof(SYM) * dict_size; + // } + + static const int BLOCK_DIM_ENCODE = 256; + static const int BLOCK_DIM_DEFLATE = 256; + + static const int ENC_SEQUENTIALITY = 4; // empirical + static const int DEFLATE_CONSTANT = 4; // TODO -> deflate_chunk_constant +}; + +struct StringHelper { + static std::string nnz_percentage(uint32_t nnz, uint32_t data_len) + { + return "(" + std::to_string(nnz / 1.0 / data_len * 100) + "%)"; + } +}; + +struct ConfigHelper { + static uint32_t predictor_lookup(std::string name) + { + const std::unordered_map lut = { + {"lorenzo", 0}, {"lorenzoii", 1}, {"spline3", 2} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such predictor as " + name); + return lut.at(name); + } + + static uint32_t codec_lookup(std::string name) + { + const std::unordered_map lut = { + {"huffman-coarse", 0} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); + return lut.at(name); + } + + static uint32_t spcodec_lookup(std::string name) + { + const std::unordered_map lut = { + {"spmat", 0}, {"spvec", 1} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); + return lut.at(name); + } + + static std::string get_default_predictor() { return "lorenzo"; } + static std::string get_default_spcodec() { return "csr11"; } + static std::string get_default_codec() { return "huffman-coarse"; } + static std::string get_default_cuszmode() { return "r2r"; } + static std::string get_default_dtype() { return "f32"; } + + static bool check_predictor(const std::string& val, bool fatal = false) + { + auto legal = (val == "lorenzo") or (val == "spline3"); + if (not legal) { + if (fatal) + throw std::runtime_error("`predictor` must be \"lorenzo\" or \"spline3\"."); + else + printf("fallback to the default \"%s\".", get_default_predictor().c_str()); + } + return legal; + } + + static bool check_codec(const std::string& val, bool fatal = false) + { + auto legal = (val == "huffman-coarse"); + if (not legal) { + if (fatal) + throw std::runtime_error("`codec` must be \"huffman-coarse\"."); + else + printf("fallback to the default \"%s\".", get_default_codec().c_str()); + } + return legal; + } + + static bool check_spcodec(const std::string& val, bool fatal = false) + { + auto legal = (val == "csr11") or (val == "rle"); + if (not legal) { + if (fatal) + throw std::runtime_error("`codec` must be \"csr11\" or \"rle\"."); + else + printf("fallback to the default \"%s\".", get_default_codec().c_str()); + } + return legal; + } + + static bool check_cuszmode(const std::string& val, bool fatal = false) + { + auto legal = (val == "r2r") or (val == "abs"); + if (not legal) { + if (fatal) + throw std::runtime_error("`mode` must be \"r2r\" or \"abs\"."); + else + printf("fallback to the default \"%s\".", get_default_cuszmode().c_str()); + } 
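+        // Non-fatal misuse only prints the hint above; `legal` is still returned as false,
+        // so the caller is responsible for substituting get_default_cuszmode() itself.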
+ return legal; + } + + static bool check_dtype(const std::string& val, bool fatal = false) + { + auto legal = (val == "f32"); + // auto legal = (val == "f32") or (val == "f64"); + if (not legal) { + if (fatal) + throw std::runtime_error("`dtype` must be \"f32\"."); + else + printf("fallback to the default \"%s\".", get_default_dtype().c_str()); + } + return legal; + } + + static bool check_opt_in_list(std::string const& opt, std::vector vs) + { + for (auto& i : vs) { + if (opt == i) return true; + } + return false; + } + + static void parse_length_literal(const char* str, std::vector& dims) + { + std::stringstream data_len_ss(str); + auto data_len_literal = data_len_ss.str(); + char delimiter = 'x'; + + while (data_len_ss.good()) { + std::string substr; + std::getline(data_len_ss, substr, delimiter); + dims.push_back(substr); + } + } + + static size_t get_filesize(std::string fname) + { + std::ifstream in(fname.c_str(), std::ifstream::ate | std::ifstream::binary); + return in.tellg(); + } + + static size_t get_filesize(cusz_header* h) + { + auto END = sizeof(h->entry) / sizeof(h->entry[0]); + return h->entry[END - 1]; + } + + static size_t get_uncompressed_len(cusz_header* h) { return h->x * h->y * h->z; } + + template + static size_t get_npart(T1 size, T2 subsize) + { + static_assert( + std::numeric_limits::is_integer and std::numeric_limits::is_integer, + "[get_npart] must be plain interger types."); + + return (size + subsize - 1) / subsize; + } + + // #ifdef __CUDACC__ + static int get_ndim(dim3 len3) + { + auto ndim = 3; + if (len3.z == 1) ndim = 2; + if (len3.z == 1 and len3.y == 1) ndim = 1; + return ndim; + } + + static dim3 get_pardeg3(dim3 len3, dim3 sublen3) + { + return dim3( + get_npart(len3.x, sublen3.x), // + get_npart(len3.y, sublen3.y), // + get_npart(len3.z, sublen3.z)); + } + + template + static dim3 get_pardeg3(dim3 len3, T sublen3[3]) + { + return dim3( + get_npart(len3.x, sublen3[0]), // + get_npart(len3.y, sublen3[1]), // + get_npart(len3.z, sublen3[2])); + } + + template + static dim3 multiply_dim3(dim3 a, T b[3]) + { + return dim3(a.x * b[0], a.y * b[1], a.z * b[2]); + } + + static dim3 multiply_dim3(dim3 a, dim3 b) + { // + return dim3(a.x * b.x, a.y * b.y, a.z * b.z); + } + + static size_t get_serialized_len(dim3 a) { return a.x * a.y * a.z; } + + static dim3 get_leap(dim3 len3) { return dim3(1, len3.x, len3.x * len3.y); } + + // #endif + + template + static size_t get_serialized_len(T a[3]) + { // + return a[0] * a[1] * a[2]; + } +}; + +struct CompareHelper { + template + static bool eq(TRIO a, TRIO b) + { + return (a.x == b.x) and (a.y == b.y) and (a.z == b.z); + }; +}; + +struct ReportHelper { + static float get_throughput(float milliseconds, size_t nbyte) + { + auto GiB = 1.0 * 1024 * 1024 * 1024; + auto seconds = milliseconds * 1e-3; + return nbyte / GiB / seconds; + } + + static void println_throughput(const char* s, float timer, size_t _nbyte) + { + if (timer == 0.0) return; + auto t = get_throughput(timer, _nbyte); + printf(" %-12s %'12f %'10.2f\n", s, timer, t); + }; + + static void println_throughput_tablehead() + { + printf( + "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // + const_cast("kernel"), // + const_cast("time, ms"), // + const_cast("GiB/s") // + ); + } + + static void print_datasegment_tablehead() + { + printf( + "\ndata segments:\n \e[1m\e[31m%-18s\t%12s\t%15s\t%15s\e[0m\n", // + const_cast("name"), // + const_cast("nbyte"), // + const_cast("start"), // + const_cast("end")); + } + + static std::string demangle(const char* name) + { + int status 
= -4; + char* res = abi::__cxa_demangle(name, nullptr, nullptr, &status); + + const char* const demangled_name = (status == 0) ? res : name; + std::string ret_val(demangled_name); + free(res); + return ret_val; + }; +}; + +#endif diff --git a/qtensor/compression/cusz/include/common/definition.hh b/qtensor/compression/cusz/include/common/definition.hh new file mode 100644 index 00000000..af30239b --- /dev/null +++ b/qtensor/compression/cusz/include/common/definition.hh @@ -0,0 +1,66 @@ +/** + * @file definition.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-20 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_DEFINITION_HH +#define CUSZ_COMMON_DEFINITION_HH + +#include +#include +#include + +namespace cusz { + +enum class TASK { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; +enum class DEV { TEST, DEV, RELEASE }; +enum class LOC { HOST, DEVICE, HOST_DEVICE, UNIFIED, FS, NONE, __BUFFER }; +enum class WHEN { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; +enum class ALIGNDATA { NONE, SQUARE_MATRIX, POWEROF2, NEXT_EVEN }; +enum class ALIGNMEM { NONE, WARP32B, WARP64B, WARP128B }; + +// TODO when to use ADDR8? +// TODO change to `enum class` +enum class SEG { HEADER, BOOK, QUANT, REVBOOK, ANCHOR, SPFMT, HUFF_META, HUFF_DATA }; + +enum class execution { cuda, serial }; +enum class method { native, thrust }; + +struct OK { + template + static void ALLOC() + { + static_assert( + m == cusz::DEV::TEST or m == cusz::DEV::DEV, // + "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); + } + + template + static void FREE() + { + static_assert( + m == cusz::DEV::TEST or m == cusz::DEV::DEV, // + "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); + } +}; + +using ADDR4 = uint32_t; +using ADDR8 = size_t; + +using FREQ = uint32_t; + +using TimeRecordTuple = std::tuple; +using TimeRecord = std::vector; +using timerecord_t = TimeRecord*; + +using BYTE = uint8_t; + +}; // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/common/type_traits.hh b/qtensor/compression/cusz/include/common/type_traits.hh new file mode 100644 index 00000000..3d623beb --- /dev/null +++ b/qtensor/compression/cusz/include/common/type_traits.hh @@ -0,0 +1,108 @@ +/** + * @file type_traits.hh + * @author Jiannan Tian + * @brief + * @version 0.1.1 + * @date 2020-09-23 + * (create) 2020-09-23, (rev) 2021-09-17 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef TYPE_TRAITS_HH +#define TYPE_TRAITS_HH + +#include +#include + +#include "cusz/type.h" +#include "definition.hh" + +template +cusz_datatype cusz_typeof() +{ + if (std::is_same::value) + return FP32; + else if (std::is_same::value) + return FP64; + else + throw std::runtime_error("Type not supported."); +} + +// clang-format off + +/** + * @brief CUDA API does not accept uint64_t (understandable by literal), but instead, + * `unsigned long long`, which is ambiguous anyway. 
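+ * The HuffTrait<4>/<8> aliases below route their codeword types through this mapping.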
+ */ +template struct cuszCOMPAT; +template <> struct cuszCOMPAT { using type = uint32_t; }; +template <> struct cuszCOMPAT { using type = unsigned long long; }; + +template struct DataTrait; +template <> struct DataTrait<4, true> { typedef float type; }; +template <> struct DataTrait<8, true> { typedef double type; }; +template <> struct DataTrait<1, false> { typedef int8_t type; }; // future use +template <> struct DataTrait<2, false> { typedef int16_t type; }; // future use +template <> struct DataTrait<4, false> { typedef int32_t type; }; // future use +template <> struct DataTrait<8, false> { typedef int64_t type; }; // future use + +template struct ChunkingTrait; +template <> struct ChunkingTrait<1> { static const int BLOCK = 256; static const int SEQ = 8; }; +template <> struct ChunkingTrait<0x101> { static const int BLOCK = 128; }; +template <> struct ChunkingTrait<0x201> { static const int BLOCK = 64; }; +template <> struct ChunkingTrait<2> { static const int BLOCK = 16; static const int YSEQ = 8; }; +template <> struct ChunkingTrait<3> { static const int BLOCK = 8; static const int YSEQ = 8; }; + +// template struct QuantTrait; +// template <> struct QuantTrait<1> { typedef uint8_t type; }; +// template <> struct QuantTrait<2> { typedef uint16_t type; }; +// template <> struct QuantTrait<4> { typedef uint32_t type; }; + +template struct ErrCtrlTrait; +template <> struct ErrCtrlTrait<1, false> { typedef uint8_t type; }; +template <> struct ErrCtrlTrait<2, false> { typedef uint16_t type; }; +template <> struct ErrCtrlTrait<4, false> { typedef uint32_t type; }; +template <> struct ErrCtrlTrait<4, true> { typedef float type; }; +template <> struct ErrCtrlTrait<8, true> { typedef double type; }; + +template struct HuffTrait; +template <> struct HuffTrait<4> { typedef cuszCOMPAT::type type; }; +template <> struct HuffTrait<8> { typedef cuszCOMPAT::type type; }; + +template struct ReducerTrait; +template <> struct ReducerTrait<4> { typedef uint32_t type; }; +template <> struct ReducerTrait<8> { typedef uint64_t type; }; + +template struct MetadataTrait; +template <> struct MetadataTrait<4> { typedef uint32_t type; }; +template <> struct MetadataTrait<8> { typedef uint64_t type; }; // size_t is problematic; do not use + +template struct LargeInputTrait; +template <> struct LargeInputTrait { using type = MetadataTrait<4>::type; }; +template <> struct LargeInputTrait { using type = MetadataTrait<8>::type; }; + +template struct FastLowPrecisionTrait; +template <> struct FastLowPrecisionTrait { typedef float type; }; +template <> struct FastLowPrecisionTrait { typedef double type; }; + +// template struct cuszCUSPARSE; +// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_32F; }; +// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_64F; }; + +#ifdef __CUDACC__ +#include + +template struct CopyDirection; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToHost; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToDevice; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToHost; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; }; + +#endif + +// clang-format on + +#endif diff --git a/qtensor/compression/cusz/include/compaction.hh b/qtensor/compression/cusz/include/compaction.hh new file mode 100644 index 00000000..4a21f571 --- /dev/null +++ 
b/qtensor/compression/cusz/include/compaction.hh @@ -0,0 +1,18 @@ +/** + * @file compaction.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef DAB40B13_9236_42A9_8047_49CD896671C9 +#define DAB40B13_9236_42A9_8047_49CD896671C9 + +template +struct CompactionDRAM; + +#endif /* DAB40B13_9236_42A9_8047_49CD896671C9 */ diff --git a/qtensor/compression/cusz/include/component.hh b/qtensor/compression/cusz/include/component.hh new file mode 100644 index 00000000..34fb8e00 --- /dev/null +++ b/qtensor/compression/cusz/include/component.hh @@ -0,0 +1,19 @@ +/** + * @file componment.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-10-06 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPONENT_HH +#define CUSZ_COMPONENT_HH + +#include "component/prediction.inl" +#include "component/spcodec.inl" +#include "hf/hf.hh" + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/component/glue.cuh b/qtensor/compression/cusz/include/component/glue.cuh new file mode 100644 index 00000000..cdcc8ff0 --- /dev/null +++ b/qtensor/compression/cusz/include/component/glue.cuh @@ -0,0 +1,120 @@ +/** + * @file glue.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-01 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef WRAPPER_GLUE_CUH +#define WRAPPER_GLUE_CUH + +#include +#include +#include +#include "spcodec.hh" + +// when using nvcc, functors must be defined outside a (__host__) function +template +struct cleanup : public thrust::unary_function { + int radius; + cleanup(int radius) : radius(radius) {} + __host__ __device__ E operator()(const E e) const { return e; } +}; + +template +void split_by_radius( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + // find out the indices + out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 >= 2 * radius or _1 <= 0) - out_idx; + + // fetch corresponding values + thrust::copy( + policy, thrust::make_permutation_iterator(in_errctrl, out_idx), + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); + + // clear up + cleanup functor(radius); + thrust::transform( + policy, // + thrust::make_permutation_iterator(in_errctrl, out_idx), // + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), // + thrust::make_permutation_iterator(in_errctrl, out_idx), // + functor); +} + +template +void split_by_binary_twopass( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + // find out the indices + out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 != radius) - out_idx; + + // fetch corresponding values + thrust::copy( + policy, thrust::make_permutation_iterator(in_errctrl, out_idx), + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); +} + +// when using nvcc, functors must be defined outside a (__host__) function +template 
+struct is_outlier { + int radius; + is_outlier(int radius) : radius(radius) {} + __host__ __device__ bool operator()(const Tuple t) const { return thrust::get<1>(t) != radius; } +}; + +template +void split_by_binary_onepass( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + thrust::cuda::par.on(stream); + using Tuple = thrust::tuple; + thrust::counting_iterator zero(0); + + auto in = thrust::make_zip_iterator(thrust::make_tuple(zero, in_errctrl)); + auto in_last = thrust::make_zip_iterator(thrust::make_tuple(zero + in_len, in_errctrl + in_len)); + auto out = thrust::make_zip_iterator(thrust::make_tuple(out_idx, out_val)); + + is_outlier functor(radius); + out_nnz = thrust::copy_if(policy, in, in_last, out, functor) - out; +} + +enum class GlueMethod { SPLIT_BY_RADIUS, SPLIT_01_ONEPASS, SPLIT_01_TWOPASS }; + +#endif diff --git a/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh b/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh new file mode 100644 index 00000000..f83c25cd --- /dev/null +++ b/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh @@ -0,0 +1,210 @@ +/** + * @file predictor_boilerplate.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-15 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_INCLUDE_PREDICTOR_HH +#define CUSZ_INCLUDE_PREDICTOR_HH + +#include +#include +#include + +#include "../common/configs.hh" +#include "../cusz/type.h" + +namespace cusz { + +class PredictorBoilerplate { + protected: + struct DerivedLengths { + struct Interpretion3D { + dim3 len3, leap; + size_t serialized; + + void set_leap() { leap = ConfigHelper::get_leap(len3); } + void set_serialized() { serialized = ConfigHelper::get_serialized_len(len3); } + }; + + struct Interpretion3D base, anchor, aligned; + + dim3 nblock; + int ndim; + + struct { + size_t data, quant, outlier, anchor; + } assigned; + + dim3 get_len3() const { return base.len3; } + dim3 get_leap() const { return base.leap; } + }; + + template + void __derive_len(dim3 base, DERIVED& derived) + { + int sublen[3] = {1, 1, 1}; + int anchor_step[3] = {1, 1, 1}; + __derive_len(base, derived, sublen, anchor_step, false); + } + + template + void + __derive_len(dim3 base, DERIVED& derived, int const sublen3[3], int const anchor_step3[3], bool use_anchor = false) + { + derived.base.len3 = base; + derived.base.set_leap(); + derived.base.set_serialized(); + derived.ndim = ConfigHelper::get_ndim(base); + + if (not use_anchor) { + derived.assigned.data = derived.base.serialized; + derived.assigned.quant = derived.base.serialized; + derived.assigned.outlier = derived.base.serialized; + derived.assigned.anchor = 0; + } + else { + derived.nblock = ConfigHelper::get_pardeg3(base, sublen3); + + derived.aligned.len3 = ConfigHelper::multiply_dim3(derived.nblock, sublen3); + derived.aligned.set_leap(); + derived.aligned.set_serialized(); + + derived.anchor.len3 = ConfigHelper::get_pardeg3(base, anchor_step3); + derived.anchor.set_leap(); + derived.anchor.set_serialized(); + + derived.assigned.data = derived.base.serialized; + derived.assigned.quant = derived.aligned.serialized; + derived.assigned.outlier = std::max(derived.base.serialized, derived.aligned.serialized); // TODO + derived.assigned.anchor = derived.anchor.serialized; + } + } + + template + void __debug_list_derived(DERIVED const& 
derived, bool use_anchor = false) + { + auto base = derived.base; + auto aligned = derived.aligned; + auto anchor = derived.anchor; + auto nblock = derived.nblock; + + printf("%-*s: (%u, %u, %u)\n", 16, "sizeof.{T,E,FP}", (int)sizeof(T), (int)sizeof(E), (int)sizeof(FP)); + printf("%-*s: (%u, %u, %u)\n", 16, "base.len3", base.len3.x, base.len3.y, base.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "base.leap", base.leap.x, base.leap.y, base.leap.z); + printf("%-*s: %'zu\n", 16, "base.serial", base.serialized); + + if (use_anchor) { + printf("%-*s: (%u, %u, %u)\n", 16, "nblock", nblock.x, nblock.y, nblock.z); + + printf("%-*s: (%u, %u, %u)\n", 16, "aligned.len3", aligned.len3.x, aligned.len3.y, aligned.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "aligned.leap", aligned.leap.x, aligned.leap.y, aligned.leap.z); + printf("%-*s: %'zu\n", 16, "aligned.serial", aligned.serialized); + + printf("%-*s: (%u, %u, %u)\n", 16, "anchor.len3", anchor.len3.x, anchor.len3.y, anchor.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "anchor.leap", anchor.leap.x, anchor.leap.y, anchor.leap.z); + printf("%-*s: %'zu\n", 16, "anchor.serial", anchor.serialized); + } + + printf("%-*s: %'zu\n", 16, "len.data", derived.assigned.data); + printf("%-*s: %'zu\n", 16, "len.quant", derived.assigned.quant); + printf("%-*s: %'zu\n", 16, "len.outlier", derived.assigned.outlier); + printf("%-*s: %'zu\n", 16, "len.anchor", derived.assigned.anchor); + } + + void check_rtlen() + { + auto rtlen3 = rtlen.get_len3(); + auto alloclen3 = alloclen.get_len3(); + + if (rtlen3.x > alloclen3.x or rtlen3.y > alloclen3.y or rtlen3.z > alloclen3.z or + rtlen.base.serialized > alloclen.base.serialized) + throw std::runtime_error("Predictor: the runtime lengths cannot be greater than the allocation lengths."); + } + + template + void debug_list_alloclen(bool use_anchor = false) + { + printf("\ndebugging, listing allocation lengths:\n"); + __debug_list_derived(alloclen, use_anchor); + } + + template + void debug_list_rtlen(bool use_anchor = false) + { + printf("\ndebugging, listing runtime lengths:\n"); + __debug_list_derived(rtlen, use_anchor); + } + + protected: + struct DerivedLengths alloclen, rtlen; + + float time_elapsed; + + // ----------------------------------------------------------------------------- + // accessor + // ----------------------------------------------------------------------------- + public: + // helper + size_t get_alloclen_data() const { return alloclen.assigned.data; } + size_t get_alloclen_anchor() const { return alloclen.assigned.anchor; } + size_t get_alloclen_quant() const { return alloclen.assigned.quant; } + size_t get_alloclen_outlier() const { return alloclen.assigned.outlier; } + + dim3 get_len3() const { return rtlen.base.len3; } + dim3 get_leap3() const { return rtlen.base.leap; } + size_t get_len_data() const { return rtlen.assigned.data; } + size_t get_len_anchor() const { return rtlen.assigned.anchor; } + size_t get_len_quant() const { return rtlen.assigned.quant; } + size_t get_len_outlier() const { return rtlen.assigned.outlier; } + + float get_time_elapsed() const { return time_elapsed; } + + size_t get_x() const { return this->rtlen.get_len3().x; } + size_t get_y() const { return this->rtlen.get_len3().y; } + size_t get_z() const { return this->rtlen.get_len3().z; } + + dim3 get_leap() const { return this->rtlen.get_leap(); } + int get_ndim() const { return this->rtlen.ndim; } + + void derive_alloclen(cusz_predictortype predictor, dim3 base) + { + if (predictor == LorenzoI) { + // normal + 
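+            // (no anchor grid: __derive_len() above sets the data/quant/outlier lengths to the
+            // base length and the anchor length to 0)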
this->__derive_len(base, this->alloclen); + } + + else if (predictor == Spline3) { + // maximum possible + int sublen[3] = {32, 8, 8}; + int anchor_step[3] = {8, 8, 8}; + this->__derive_len(base, this->alloclen, sublen, anchor_step, true); + } + } + + void derive_rtlen(cusz_predictortype predictor, dim3 base) + { + if (predictor == LorenzoI) { + // normal + this->__derive_len(base, this->rtlen); + } + else if (predictor == Spline3) { + // maximum possible + int sublen[3] = {32, 8, 8}; + int anchor_step[3] = {8, 8, 8}; + this->__derive_len(base, this->rtlen, sublen, anchor_step, true); + } + } + + // "real" methods + virtual ~PredictorBoilerplate() = default; +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/component/prediction.inl b/qtensor/compression/cusz/include/component/prediction.inl new file mode 100644 index 00000000..50091ae1 --- /dev/null +++ b/qtensor/compression/cusz/include/component/prediction.inl @@ -0,0 +1,193 @@ +/** + * @file prediction.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef FB315D3E_6B96_4F5D_9975_F35702205BC1 +#define FB315D3E_6B96_4F5D_9975_F35702205BC1 + +#include +#include +#include +#include "../common.hh" +#include "../kernel/cpplaunch_cuda.hh" +#include "../kernel/lorenzo_all.hh" +#include "../utils.hh" + +#include "cusz/type.h" +#include "pred_boilerplate_deprecated.hh" + +#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; + +#define ALLOCDEV(VAR, SYM, NBYTE) \ + if (NBYTE != 0) { \ + CHECK_CUDA(cudaMalloc(&d_##VAR, NBYTE)); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, NBYTE)); \ + } + +#define ALLOCDEV2(VAR, TYPE, LEN) \ + if (LEN != 0) { \ + CHECK_CUDA(cudaMalloc(&d_##VAR, sizeof(TYPE) * LEN)); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, sizeof(TYPE) * LEN)); \ + } + +#define FREE_DEV_ARRAY(VAR) \ + if (d_##VAR) { \ + CHECK_CUDA(cudaFree(d_##VAR)); \ + d_##VAR = nullptr; \ + } + +namespace cusz { + +template +class PredictionUnified : public PredictorBoilerplate { + public: + using Origin = T; + using Anchor = T; + using ErrCtrl = E; + using Precision = FP; + + public: + ~PredictionUnified() + { // dtor + FREE_DEV_ARRAY(anchor); + FREE_DEV_ARRAY(errctrl); + FREE_DEV_ARRAY(outlier); + } + PredictionUnified() {} // ctor + PredictionUnified(const PredictionUnified&); // copy ctor + PredictionUnified& operator=(const PredictionUnified&); // copy assign + PredictionUnified(PredictionUnified&&); // move ctor + PredictionUnified& operator=(PredictionUnified&&); // move assign + + void init(cusz_predictortype predictor, size_t x, size_t y, size_t z, bool dbg_print = false) + { + auto len3 = dim3(x, y, z); + init(predictor, len3, dbg_print); + } + void init(cusz_predictortype predictor, dim3 xyz, bool dbg_print = false) + { + this->derive_alloclen(predictor, xyz); + + // allocate + ALLOCDEV2(anchor, T, this->alloclen.assigned.anchor); + ALLOCDEV2(errctrl, E, this->alloclen.assigned.quant); + ALLOCDEV2(outlier, T, this->alloclen.assigned.outlier); + + if (dbg_print) this->debug_list_alloclen(); + } + + void construct( + cusz_predictortype predictor, + dim3 const len3, + T* data, + T** ptr_anchor, + E** ptr_errctrl, + T** ptr_outlier, + double const eb, + int const radius, + cudaStream_t stream) + { + *ptr_anchor = d_anchor; + *ptr_errctrl = d_errctrl; + *ptr_outlier = d_outlier; + + if (predictor == LorenzoI) { + derive_rtlen(LorenzoI, len3); + this->check_rtlen(); + + // ad hoc placeholder + // auto 
anchor_len3 = dim3(0, 0, 0); + // auto errctrl_len3 = dim3(0, 0, 0); + uint32_t* outlier_idx = nullptr; + + compress_predict_lorenzo_i( + data, len3, eb, radius, // + d_errctrl, d_outlier, outlier_idx, nullptr, // + &time_elapsed, stream); + } + else if (predictor == Spline3) { + this->derive_rtlen(Spline3, len3); + this->check_rtlen(); + + cusz::cpplaunch_construct_Spline3( + true, // + data, len3, d_anchor, this->rtlen.anchor.len3, d_errctrl, this->rtlen.aligned.len3, eb, radius, + &time_elapsed, stream); + } + } + + void reconstruct( + cusz_predictortype predictor, + dim3 len3, + T* outlier_xdata, + T* anchor, + E* errctrl, + double const eb, + int const radius, + cudaStream_t stream) + { + if (predictor == LorenzoI) { + this->derive_rtlen(LorenzoI, len3); + this->check_rtlen(); + + // ad hoc placeholder + // auto anchor_len3 = dim3(0, 0, 0); + // auto errctrl_len3 = dim3(0, 0, 0); + auto xdata = outlier_xdata; + auto outlier = outlier_xdata; + uint32_t* outlier_idx = nullptr; + + auto xdata_len3 = len3; + + decompress_predict_lorenzo_i( + errctrl, xdata_len3, outlier, outlier_idx, 0, eb, radius, // + xdata, // + &time_elapsed, stream); + } + else if (predictor == Spline3) { + this->derive_rtlen(Spline3, len3); + this->check_rtlen(); + // this->debug_list_rtlen(true); + + // launch_reconstruct_Spline3( + cusz::cpplaunch_reconstruct_Spline3( + outlier_xdata, len3, anchor, this->rtlen.anchor.len3, errctrl, this->rtlen.aligned.len3, eb, radius, + &time_elapsed, stream); + } + } + + void clear_buffer() { cudaMemset(d_errctrl, 0x0, sizeof(E) * this->rtlen.assigned.quant); } + + float get_time_elapsed() const { return time_elapsed; } + // size_t get_alloclen_data() const; + // size_t get_alloclen_quant() const; + // size_t get_len_data() const; + // size_t get_len_quant() const; + // size_t get_len_anchor() const; + + E* expose_quant() const { return d_errctrl; } + E* expose_errctrl() const { return d_errctrl; } + T* expose_anchor() const { return d_anchor; } + T* expose_outlier() const { return d_outlier; } + + public: + // data + DEFINE_ARRAY(anchor, T); + DEFINE_ARRAY(errctrl, E); + DEFINE_ARRAY(outlier, T); +}; + +} // namespace cusz + +#undef ALLOCDEV +#undef FREE_DEV_ARRAY +#undef DEFINE_ARRAY + +#endif /* FB315D3E_6B96_4F5D_9975_F35702205BC1 */ diff --git a/qtensor/compression/cusz/include/component/spcodec.inl b/qtensor/compression/cusz/include/component/spcodec.inl new file mode 100644 index 00000000..2a57f2f1 --- /dev/null +++ b/qtensor/compression/cusz/include/component/spcodec.inl @@ -0,0 +1,218 @@ +/** + * @file spcodec_vec.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-08-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CF358238_3946_4FFC_B5E6_45C12F0C0B44 +#define CF358238_3946_4FFC_B5E6_45C12F0C0B44 + +#include +#include +#include + +#include +#include +#include + +#include "../common.hh" +#include "../kernel/spv_gpu.hh" +#include "utils/cuda_err.cuh" + +#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; + +#define SPVEC_ALLOCDEV(VAR, SYM) \ + CHECK_CUDA(cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM])); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM])); + +#define SPVEC_FREEDEV(VAR) \ + if (d_##VAR) { \ + CHECK_CUDA(cudaFree(d_##VAR)); \ + d_##VAR = nullptr; \ + } + +#define SPVEC_D2DCPY(VAR, FIELD) \ + { \ + auto dst = d_spfmt + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(d_##VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + 
} + +namespace cusz { + +/******************************************************************************* + * sparsity-aware coder/decoder, vector + *******************************************************************************/ + +template +class SpcodecVec { + public: + using Origin = T; + using BYTE = uint8_t; + using MetadataT = M; + + struct alignas(128) Header { + static const int HEADER = 0; + static const int IDX = 1; + static const int VAL = 2; + static const int END = 3; + + int self_bytes : 16; + size_t uncompressed_len; + int nnz; + MetadataT entry[END + 1]; + + MetadataT subfile_size() const { return entry[END]; } + }; + + struct runtime_encode_helper { + static const int SPFMT = 0; + static const int IDX = 1; + static const int VAL = 2; + static const int END = 3; + + uint32_t nbyte[END]; + int nnz{0}; + }; + + private: + DEFINE_ARRAY(spfmt, BYTE); + DEFINE_ARRAY(idx, M); + DEFINE_ARRAY(val, T); + + using RTE = runtime_encode_helper; + + float milliseconds{0.0}; + + RTE rte; + + public: + ~SpcodecVec() + { + SPVEC_FREEDEV(spfmt); + SPVEC_FREEDEV(idx); + SPVEC_FREEDEV(val); + } // dtor + SpcodecVec() {} // ctor + SpcodecVec(const SpcodecVec&); // copy ctor + SpcodecVec& operator=(const SpcodecVec&); // copy assign + SpcodecVec(SpcodecVec&&); // move ctor + SpcodecVec& operator=(SpcodecVec&&); // move assign + + void init(size_t const len, int density_factor = 4, bool dbg_print = false) + { + auto max_bytes = [&]() { return len / density_factor * sizeof(T); }; + auto init_nnz = [&]() { return len / density_factor; }; + + memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); + rte.nnz = init_nnz(); + + rte.nbyte[RTE::SPFMT] = max_bytes(); + rte.nbyte[RTE::IDX] = rte.nnz * sizeof(int); + rte.nbyte[RTE::VAL] = rte.nnz * sizeof(T); + + SPVEC_ALLOCDEV(spfmt, SPFMT); + SPVEC_ALLOCDEV(idx, IDX); + SPVEC_ALLOCDEV(val, VAL); + + // if (dbg_print) debug(); + } + + void encode( + T* in, + size_t const in_len, + BYTE*& out, + size_t& out_len, + cudaStream_t stream = nullptr, + bool dbg_print = false) + { + Header header; + + psz::spv_gather(in, in_len, this->d_val, this->d_idx, &rte.nnz, &milliseconds, stream); + + subfile_collect(header, in_len, stream, dbg_print); + out = d_spfmt; + out_len = header.subfile_size(); + } + + void decode(BYTE* coded, T* decoded, cudaStream_t stream = nullptr) + { + Header header; + CHECK_CUDA(cudaMemcpyAsync(&header, coded, sizeof(header), cudaMemcpyDeviceToHost, stream)); + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(coded + header.entry[Header::SYM]) + auto d_idx = ACCESSOR(IDX, uint32_t); + auto d_val = ACCESSOR(VAL, T); +#undef ACCESSOR + + psz::spv_scatter(d_val, d_idx, header.nnz, decoded, &milliseconds, stream); + } + + void clear_buffer() + { + cudaMemset(d_spfmt, 0x0, rte.nbyte[RTE::SPFMT]); + cudaMemset(d_idx, 0x0, rte.nbyte[RTE::IDX]); + cudaMemset(d_val, 0x0, rte.nbyte[RTE::VAL]); + } + + float get_time_elapsed() const { return milliseconds; } + + void subfile_collect(Header& header, size_t len, cudaStream_t stream, bool dbg_print) + { + header.self_bytes = sizeof(Header); + header.uncompressed_len = len; + header.nnz = rte.nnz; + + // update (redundant here) + rte.nbyte[RTE::IDX] = sizeof(int) * rte.nnz; + rte.nbyte[RTE::VAL] = sizeof(T) * rte.nnz; + + MetadataT nbyte[Header::END]; + nbyte[Header::HEADER] = 128; + nbyte[Header::IDX] = rte.nbyte[RTE::IDX]; + nbyte[Header::VAL] = rte.nbyte[RTE::VAL]; + + header.entry[0] = 0; + // *.END + 1; need to knwo the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 
1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + auto debug_header_entry = [&]() { + printf("\nCSR11::subfile_collect() debugging:\n"); + printf("%-*s: %'10ld\n", 16, "final.nnz", rte.nnz); + printf(" ENTRIES\n"); + +#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); + PRINT_ENTRY(HEADER); + PRINT_ENTRY(IDX); + PRINT_ENTRY(VAL); + PRINT_ENTRY(END); + printf("\n"); +#undef PRINT_ENTRY + }; + if (dbg_print) debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_spfmt, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + SPVEC_D2DCPY(idx, IDX) + SPVEC_D2DCPY(val, VAL) + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + } +}; + +} // namespace cusz + +#undef DEFINE_ARRAY +#undef SPVEC_ALLOCDEV +#undef SPVEC_FREEDEV +#undef SPVEC_D2DCPY + +#endif /* CF358238_3946_4FFC_B5E6_45C12F0C0B44 */ diff --git a/qtensor/compression/cusz/include/compressor.hh b/qtensor/compression/cusz/include/compressor.hh new file mode 100644 index 00000000..adea8f57 --- /dev/null +++ b/qtensor/compression/cusz/include/compressor.hh @@ -0,0 +1,165 @@ +/** + * @file compressor.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPRESSOR_HH +#define CUSZ_COMPRESSOR_HH + +#include +#include + +#include "common/type_traits.hh" +#include "compaction.hh" +#include "component.hh" +#include "context.hh" +#include "header.h" + +#define PUBLIC_TYPES \ + using Predictor = typename BINDING::Predictor; \ + using Spcodec = typename BINDING::Spcodec; \ + using Codec = typename BINDING::Codec; \ + using FallbackCodec = typename BINDING::FallbackCodec; \ + using BYTE = uint8_t; \ + \ + using T = typename BINDING::DATA; \ + using FP = typename BINDING::FP; \ + using E = typename BINDING::ERRCTRL; \ + using H = typename Codec::Encoded; \ + using M = typename Codec::MetadataT; \ + using H_FB = typename FallbackCodec::Encoded; \ + \ + using TimeRecord = std::vector>; \ + using timerecord_t = TimeRecord*; + +namespace cusz { + +// extra helper +struct CompressorHelper { + static int autotune_coarse_parvle(Context* ctx); +}; + +template +class Compressor { + public: + using Predictor = typename BINDING::Predictor; + using Spcodec = typename BINDING::Spcodec; + using Codec = typename BINDING::Codec; + using FallbackCodec = typename BINDING::FallbackCodec; + using BYTE = uint8_t; + + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + using H = typename Codec::Encoded; + using M = typename Codec::MetadataT; + using H_FB = typename FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~Compressor(); + Compressor(); + Compressor(const Compressor&); + Compressor& operator=(const Compressor&); + Compressor(Compressor&&); + Compressor& operator=(Compressor&&); + + // methods + void init(Context*, bool dbg_print = false); + void init(Header*, bool dbg_print = false); + void destroy(); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + // getter + void export_header(Header&); + void export_header(Header*); + void 
export_timerecord(TimeRecord*); +}; + +template +class Compressor::impl { + public: + using Predictor = typename BINDING::Predictor; + using Spcodec = typename BINDING::Spcodec; + using Codec = typename BINDING::Codec; + using FallbackCodec = typename BINDING::FallbackCodec; + using BYTE = uint8_t; + + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + using H = typename Codec::Encoded; + using M = typename Codec::MetadataT; + using H_FB = typename FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + // state + bool use_fallback_codec{false}; + bool fallback_codec_allocated{false}; + BYTE* d_reserved_compressed{nullptr}; + // profiling + TimeRecord timerecord; + // header + Header header; + // components + + Predictor* predictor; + Spcodec* spcodec; + Codec* codec; + FallbackCodec* fb_codec; + // variables + uint32_t* d_freq; + float time_hist; + dim3 data_len3; + + public: + ~impl(); + impl(); + + // public methods + void init(Context* config, bool dbg_print = false); + void init(Header* config, bool dbg_print = false); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + // getter + void export_header(Header&); + void export_header(Header*); + void export_timerecord(TimeRecord*); + uint32_t get_len_data(); + + private: + // helper + template + void init_detail(CONFIG*, bool); + void init_codec(size_t, unsigned int, int, int, bool); + void collect_compress_timerecord(); + void collect_decompress_timerecord(); + void encode_with_exception(E*, size_t, uint32_t*, int, int, int, bool, BYTE*&, size_t&, cudaStream_t, bool); + void subfile_collect(T*, size_t, BYTE*, size_t, BYTE*, size_t, cudaStream_t, bool); + void destroy(); + // getter +}; + +} // namespace cusz + +#undef PUBLIC_TYPES + +#endif diff --git a/qtensor/compression/cusz/include/context.hh b/qtensor/compression/cusz/include/context.hh new file mode 100644 index 00000000..d177fb8f --- /dev/null +++ b/qtensor/compression/cusz/include/context.hh @@ -0,0 +1,251 @@ +#ifndef ARGPARSE_HH +#define ARGPARSE_HH + +/** + * @file argparse.hh + * @author Jiannan Tian + * @brief Argument parser (header). 
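
The `cuszCTX` context defined in this header (just below) is configured through chained `set_*` calls that also derive dependent fields: `set_radius` fixes `dict_size = 2 * radius`, `set_spcodec_densityfactor` turns a factor into a presumed nonzero density of `1/factor`, and `set_len` infers `ndim` and `data_len` from the trailing ones in `(x, y, z, w)`. The following is a minimal, standalone mock of that fluent pattern, for illustration only; `MiniCtx` and its `main` are hypothetical and merely mirror the logic quoted from the header, they are not part of this diff.

```cpp
// Minimal standalone sketch of the fluent-setter pattern used by cuszCTX below.
// MiniCtx re-implements only the length/radius/density logic shown in the header.
#include <cstddef>
#include <cstdio>
#include <stdexcept>

struct MiniCtx {
    size_t x{1}, y{1}, z{1}, w{1}, data_len{1};
    int    ndim{-1}, radius{512}, dict_size{1024};
    double eb{0.0};
    float  nz_density{0.25f}, nz_density_factor{4.0f};

    MiniCtx& set_eb(double _) { eb = _; return *this; }
    MiniCtx& set_radius(int _) { radius = _; dict_size = radius * 2; return *this; }
    MiniCtx& set_spcodec_densityfactor(int _)
    {
        if (_ <= 1) throw std::runtime_error("density factor must be > 1");
        nz_density_factor = (float)_;
        nz_density        = 1.0f / _;  // factor 4 -> 25% nonzeros presumed
        return *this;
    }
    MiniCtx& set_len(size_t _x, size_t _y = 1, size_t _z = 1, size_t _w = 1)
    {
        x = _x, y = _y, z = _z, w = _w;
        ndim = 4;                    // shrink ndim for each trailing dimension of 1
        if (w == 1) ndim = 3;
        if (z == 1) ndim = 2;
        if (y == 1) ndim = 1;
        data_len = x * y * z * w;
        return *this;
    }
};

int main()
{
    auto ctx = MiniCtx{}.set_eb(1e-4).set_radius(512).set_spcodec_densityfactor(4).set_len(3600, 1800);
    std::printf("ndim=%d len=%zu dict=%d density=%.2f\n", ctx.ndim, ctx.data_len, ctx.dict_size, ctx.nz_density);
}
```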
+ * @version 0.1 + * @date 2020-09-20 + * Created on: 20-04-24 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include + +#include "common/configs.hh" +#include "common/definition.hh" +#include "utils/format.hh" +#include "utils/strhelper.hh" + +namespace cusz { + +extern const char* VERSION_TEXT; +extern const int version; +extern const int compatibility; + +} // namespace cusz + +struct cuszCTX { + public: + // on-off's + struct { + bool construct{false}, reconstruct{false}, dryrun{false}; + bool experiment{false}; + bool gtest{false}; + } cli_task; + + struct { + bool binning{false}, logtransform{false}, prescan{false}; + } preprocess; + struct { + bool gpu_nvcomp_cascade{false}, cpu_gzip{false}; + } postcompress; + + struct { + bool predefined_demo{false}, release_input{false}; + bool anchor{false}, autotune_vle_pardeg{true}, gpu_verify{false}; + } use; + + struct { + bool book{false}, quant{false}; + } export_raw; + + struct { + bool write2disk{false}, huffman{false}; + } skip; + struct { + bool time{false}, cr{false}, compressibility{false}; + } report; + + // filenames + struct { + std::string fname, origin_cmp, path_basename, basename, compress_output; + } fname; + + bool verbose{false}; + + // Stat stat; + + int read_args_status{0}; + + std::string opath; + + std::string demo_dataset; + std::string dtype = ConfigHelper::get_default_dtype(); // "f32" + std::string mode = ConfigHelper::get_default_cuszmode(); // "r2r" + std::string predictor = ConfigHelper::get_default_predictor(); // "lorenzo" + std::string codec = ConfigHelper::get_default_codec(); // "huffman-coarse" + std::string spcodec = ConfigHelper::get_default_spcodec(); // "cusparse-csr" + std::string pipeline = "auto"; + + // sparsity related: init_nnz when setting up Spcodec + float nz_density{SparseMethodSetup::default_density}; + float nz_density_factor{SparseMethodSetup::default_density_factor}; + + uint32_t codecs_in_use{0b01}; + + uint32_t quant_bytewidth{2}, huff_bytewidth{4}; + + bool codec_force_fallback() const { return huff_bytewidth == 8; } + + size_t huffman_num_uints, huffman_num_bits; + int vle_sublen{512}, vle_pardeg{-1}; + + unsigned int x{1}, y{1}, z{1}, w{1}; + + struct { + // size_t x, y, z, w; + size_t len; + } alloclen; + + size_t data_len{1}, quant_len{1}, anchor_len{1}; + int ndim{-1}; + + size_t get_len() const { return data_len; } + + double eb{0.0}; + int dict_size{1024}, radius{512}; + + void load_demo_sizes(); + + /******************************************************************************* + * another configuration method, alternative to + *******************************************************************************/ + public: + // for configuration + cuszCTX& set_eb(double _) + { + eb = _; + return *this; + } + + cuszCTX& set_radius(int _) + { + radius = _; + dict_size = radius * 2; + return *this; + } + + cuszCTX& set_huffbyte(int _) + { + huff_bytewidth = _; + codecs_in_use = codec_force_fallback() ? 0b11 /*use both*/ : 0b01 /*use 4-byte*/; + return *this; + } + + cuszCTX& set_huffchunk(int _) + { + vle_sublen = _; + use.autotune_vle_pardeg = false; + return *this; + } + + cuszCTX& set_spcodec_densityfactor(int _) + { + if (_ <= 1) + throw std::runtime_error( + "Density factor for Spcodec must be >1. 
For example, setting the factor as 4 indicates the density " + "(the portion of nonzeros) is 25% in an array."); + nz_density_factor = _; + nz_density = 1.0 / _; + return *this; + } + + cuszCTX& enable_anchor(bool _) + { + use.anchor = true; + return *this; + } + cuszCTX& enable_input_nondestructive(bool _) + { + // placeholder + return *this; + } + + cuszCTX& enable_failfast(bool _) + { + // placeholder + return *this; + } + + cuszCTX& set_alloclen(size_t _) + { + alloclen.len = _; + return *this; + } + + cuszCTX& set_control_string(const char* in_str); + + cuszCTX& use_anchor(size_t _) + { + use.anchor = true; + return *this; + } + + // set x, y, z, w, ndim, data_len + cuszCTX& set_len(size_t _x, size_t _y = 1, size_t _z = 1, size_t _w = 1) + { + x = _x, y = _y, z = _z, w = _w; + + ndim = 4; + if (w == 1) ndim = 3; + if (z == 1) ndim = 2; + if (y == 1) ndim = 1; + + data_len = x * y * z * w; + + if (data_len == 1) throw std::runtime_error("Input data length cannot be 1 (in 1-D view)."); + if (data_len == 0) throw std::runtime_error("Input data length cannot be 0 (in 1-D view)."); + + return *this; + } + + private: + void derive_fnames(); + + void validate(); + + public: + void trap(int _status); + + static void print_doc(bool full = false); + + public: + static void parse_input_length(const char* lenstr, cuszCTX* ctx) + { + std::vector dims; + ConfigHelper::parse_length_literal(lenstr, dims); + ctx->ndim = dims.size(); + ctx->y = ctx->z = ctx->w = 1; + ctx->x = StrHelper::str2int(dims[0]); + if (ctx->ndim >= 2) ctx->y = StrHelper::str2int(dims[1]); + if (ctx->ndim >= 3) ctx->z = StrHelper::str2int(dims[2]); + if (ctx->ndim >= 4) ctx->w = StrHelper::str2int(dims[3]); + ctx->data_len = ctx->x * ctx->y * ctx->z * ctx->w; + } + + public: + cuszCTX() = default; + + cuszCTX(int argc, char** argv); + + cuszCTX(const char*, bool dbg_print = false); +}; + +typedef struct cuszCTX cusz_context; + +namespace cusz { + +using Context = cusz_context; +using context_t = cusz_context*; + +} // namespace cusz + +#endif // ARGPARSE_HH diff --git a/qtensor/compression/cusz/include/cusz.h b/qtensor/compression/cusz/include/cusz.h new file mode 100644 index 00000000..420999cc --- /dev/null +++ b/qtensor/compression/cusz/include/cusz.h @@ -0,0 +1,60 @@ +/** + * @file cusz.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include +//#define __cplusplus +//#ifdef __cplusplus +extern "C" { +//#endif + +#ifndef CUSZ_H +#define CUSZ_H + +#include + +#include "cusz/custom.h" +#include "cusz/record.h" +#include "cusz/type.h" +#include "header.h" + +#pragma link C++ all function +#pragma link C++ all class + +cusz_compressor* cusz_create(cusz_framework* framework, cusz_datatype const type); + +cusz_error_status cusz_release(cusz_compressor* comp); + +cusz_error_status cusz_compress( + cusz_compressor* comp, + cusz_config* config, + void* uncompressed, + cusz_len const uncomp_len, + uint8_t** compressed, + size_t* comp_bytes, + cusz_header* header, + void* record, + cudaStream_t stream); + +cusz_error_status cusz_decompress( + cusz_compressor* comp, + cusz_header* header, + uint8_t* compressed, + size_t const comp_len, + void* decompressed, + cusz_len const decomp_len, + void* record, + cudaStream_t stream); + +#endif + +//#ifdef __cplusplus +} +//#endif diff --git a/qtensor/compression/cusz/include/cusz/custom.h b/qtensor/compression/cusz/include/cusz/custom.h new file mode 100644 index 
00000000..2ab7706d --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/custom.h @@ -0,0 +1,26 @@ +/** + * @file compress.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "type.h" + +cusz_custom_predictor cusz_default_predictor(); +cusz_custom_codec cusz_default_codec(); +cusz_custom_huffman_codec cusz_default_huffman_codec(); +cusz_custom_spcodec cusz_default_spcodec(); +cusz_custom_framework* cusz_default_framework(); + +#ifdef __cplusplus +} +#endif diff --git a/qtensor/compression/cusz/include/cusz/it.hh b/qtensor/compression/cusz/include/cusz/it.hh new file mode 100644 index 00000000..5334acde --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/it.hh @@ -0,0 +1,78 @@ +/** + * @file it.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-13 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include +#include +#include + +template +struct psz_buf { + private: + T* _buf; + size_t _len{1}; + static const int stridey{BLOCK}; + static const int stridez{BLOCK * BLOCK}; + + public: + psz_buf(bool do_memset = true) + { + if (DIM == 1) _len = BLOCK; + if (DIM == 2) _len = BLOCK * BLOCK; + if (DIM == 3) _len = BLOCK * BLOCK * BLOCK; + _buf = new T[_len]; + if (do_memset) memset(_buf, 0x0, sizeof(T) * _len); + } + + ~psz_buf() { delete[] _buf; } + + T*& buf() { return _buf; } + + T& operator()(int x) { return _buf[x]; } + T& operator()(int x, int y) { return _buf[x + y * stridey]; } + T& operator()(int x, int y, int z) { return _buf[x + y * stridey + z * stridez]; } +}; + +template +struct psz_outlier_serial { + private: + T* _data; + IDX* _idx; + uint32_t _count{0}; + uint32_t _cap; + + public: + psz_outlier_serial(size_t cap) : _cap(cap) + { + _data = new T[cap + 1]; + _idx = new IDX[cap + 1]; + memset(_data, 0x0, sizeof(T) * cap); + } + + ~psz_outlier_serial() + { + delete[] _data; + delete[] _idx; + } + + T*& val() { return _data; } + IDX*& idx() { return _idx; } + uint32_t const count() { return _count; } + + void record(T data, IDX idx) + { + if (_count > _cap) throw std::runtime_error("Outlier overflows."); + _data[_count] = data; + _idx[_count] = idx; + ++_count; + } +}; \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cusz/nd.h b/qtensor/compression/cusz/include/cusz/nd.h new file mode 100644 index 00000000..2c4443bc --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/nd.h @@ -0,0 +1,15 @@ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +typedef struct psz_dim3 { + uint32_t x, y, z; +} psz_dim3; + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cusz/pn.hh b/qtensor/compression/cusz/include/cusz/pn.hh new file mode 100644 index 00000000..9c0f78bf --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/pn.hh @@ -0,0 +1,49 @@ +/** + * @file pn.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-05 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include + +// TODO typing should be more applicable + +namespace psz { +namespace typing { + +// clang-format off +template struct Int; +template <> struct Int<1> { typedef int8_t T; }; +template <> struct Int<2> { typedef int16_t T; }; +template <> struct Int<4> { typedef int32_t T; }; +template <> struct Int<8> { typedef int64_t 
T; }; + +template struct UInt; +template <> struct UInt<1> { typedef uint8_t T; }; +template <> struct UInt<2> { typedef uint16_t T; }; +template <> struct UInt<4> { typedef uint32_t T; }; +template <> struct UInt<8> { typedef uint64_t T; }; +// clang-format on + +} // namespace typing +} // namespace psz + +// TODO forward definition in another file +template +struct PN { + using UI = typename psz::typing::UInt::T; + using I = typename psz::typing::Int::T; + + // reference: https://lemire.me/blog/2022/11/25/making-all-your-integers-positive-with-zigzag-encoding/ + + static UI encode(I* x) { return (2 * (*x)) ^ ((*x) >> (BYTEWIDTH * 8 - 1)); } + static UI encode(I x) { return (2 * x) ^ (x >> (BYTEWIDTH * 8 - 1)); } + static I decode(UI* x) { return ((*x) >> 1) ^ (-((*x) & 1)); } + static I decode(UI x) { return (x >> 1) ^ (-(x & 1)); } +}; diff --git a/qtensor/compression/cusz/include/cusz/record.h b/qtensor/compression/cusz/include/cusz/record.h new file mode 100644 index 00000000..3c9be515 --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/record.h @@ -0,0 +1,38 @@ +/** + * @file record.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_RECORD_H +#define CUSZ_RECORD_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct cusz_record_entry; + +struct cusz_record_entry { + const char* name; + double time; + + struct cusz_record_entry* next; +}; + +typedef struct cusz_record { + int n; + + struct cusz_record_entry* head; +} cusz_record; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qtensor/compression/cusz/include/cusz/type.h b/qtensor/compression/cusz/include/cusz/type.h new file mode 100644 index 00000000..73e66086 --- /dev/null +++ b/qtensor/compression/cusz/include/cusz/type.h @@ -0,0 +1,219 @@ +/** + * @file type.h + * @author Jiannan Tian + * @brief C-complient type definitions; no methods in this header. 
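
The `PN<>` helper in `pn.hh` above is plain zigzag coding (see the Lemire link in its comment): it folds signed values into unsigned codes via `encode(x) = (2x) ^ (x >> (w-1))` and `decode(u) = (u >> 1) ^ -(u & 1)`, so small-magnitude negatives become small codes. Below is a standalone round-trip check of that mapping for 32-bit values; `zz_encode`/`zz_decode` are illustrative re-implementations, not code from the diff.

```cpp
// Zigzag round trip: 0,-1,1,-2,2 map to 0,1,2,3,4 and back.
#include <cstdint>
#include <cstdio>

static uint32_t zz_encode(int32_t x) { return (2u * (uint32_t)x) ^ (uint32_t)(x >> 31); }
static int32_t  zz_decode(uint32_t u) { return (int32_t)(u >> 1) ^ -(int32_t)(u & 1); }

int main()
{
    const int32_t xs[] = {0, -1, 1, -2, 2, -1000, 1000};
    for (int32_t x : xs) {
        uint32_t u = zz_encode(x);
        std::printf("%6d -> %4u -> %6d\n", x, u, zz_decode(u));
    }
}
```

The point of the mapping is that near-zero prediction errors stay near zero as unsigned codes, which suits the entropy coder applied afterwards.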
+ * @version 0.3 + * @date 2022-04-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef CUSZ_TYPE_H +#define CUSZ_TYPE_H + +#include "stddef.h" + +enum cusz_execution_policy { CPU, CUDA }; +typedef enum cusz_execution_policy cusz_execution_policy; +typedef enum cusz_execution_policy cusz_policy; +typedef enum cusz_execution_policy asz_policy; + +//////// state enumeration + +typedef enum cusz_error_status { // + CUSZ_SUCCESS = 0x00, + CUSZ_FAIL_ONDISK_FILE_ERROR = 0x01, + CUSZ_FAIL_DATA_NOT_READY = 0x02, + // specify error when calling CUDA API + CUSZ_FAIL_GPU_MALLOC, + CUSZ_FAIL_GPU_MEMCPY, + CUSZ_FAIL_GPU_ILLEGAL_ACCESS, + // specify error related to our own memory manager + CUSZ_FAIL_GPU_OUT_OF_MEMORY, + // when compression is useless + CUSZ_FAIL_INCOMPRESSIABLE, + // TODO component related error + CUSZ_FAIL_UNSUPPORTED_DATATYPE, + CUSZ_FAIL_UNSUPPORTED_QUANTTYPE, + CUSZ_FAIL_UNSUPPORTED_PRECISION, + CUSZ_FAIL_UNSUPPORTED_PIPELINE, + // not-implemented error + CUSZ_NOT_IMPLEMENTED = 0x0100, +} cusz_error_status; + +typedef struct cusz_fixedlen_internal { /* all nullable */ + void* encoding; +} cusz_fixedlen_internal; +typedef struct cusz_varlen_internal { /* all nullable */ + void* huffman; + void* outlier; +} cusz_varlen_internal; + +typedef enum cusz_datatype // +{ FP32 = 0, + FP64 = 1, + UINT8 = 10, + UINT16 = 11, + UINT32 = 12, + UINT64 = 13 } cusz_datatype; + +typedef enum cusz_executiontype // +{ Device = 0, + Host = 1, + None = 2 } cusz_executiontype; + +typedef enum cusz_mode // +{ Abs = 0, + Rel = 1 } cusz_mode; + +typedef enum cusz_pipelinetype // +{ Auto = 0, + Dense = 1, + Sparse = 2 } cusz_pipelinetype; + +typedef enum cusz_predictortype // +{ Lorenzo0 = 0, + LorenzoI = 1, + LorenzoII = 2, + Spline3 = 3 } cusz_predictortype; + +typedef enum cusz_preprocessingtype // +{ FP64toFP32 = 0, + LogTransform, + ShiftedLogTransform, + Binning2x2, + Binning2x1, + Binning1x2, +} cusz_preprocessingtype; + +typedef enum cusz_codectype // +{ Huffman = 0, + RunLength, + NvcompCascade, + NvcompLz4, + NvcompSnappy, +} cusz_codectype; + +typedef enum cusz_spcodectype // +{ SparseMat = 0, + SparseVec = 1 } cusz_spcodectype; + +typedef enum cusz_huffman_booktype // +{ Tree = 0, + Canonical = 1 } cusz_huffman_booktype; + +typedef enum cusz_huffman_codingtype // +{ Coarse = 0, + Fine = 1 } cusz_huffman_codingtype; + +//////// configuration template +typedef struct cusz_custom_len { + // clang-format off + union { size_t x0, x; }; + union { size_t x1, y; }; + union { size_t x2, z; }; + union { size_t x3, w; }; + // double factor; + // clang-format on +} cusz_custom_len; +typedef cusz_custom_len cusz_len; + +typedef struct cusz_custom_preprocessing { + cusz_custom_len before; + cusz_custom_len after; + cusz_preprocessingtype* list; + int nstep; + +} cusz_custom_preprocessing; + +typedef struct cusz_custom_predictor { + cusz_predictortype type; + + bool anchor; + bool nondestructive; +} cusz_custom_predictor; + +typedef struct cusz_custom_quantization { + int radius; + bool delayed; +} cusz_custom_quantization; + +typedef struct cusz_custom_codec { + cusz_codectype type; + + bool variable_length; + float presumed_density; +} cusz_custom_codec; + +typedef struct cusz_custom_huffman_codec { + cusz_huffman_booktype book; + cusz_executiontype book_policy; + cusz_huffman_codingtype coding; + + int booklen; + int coarse_pardeg; +} cusz_custom_huffman_codec; + +typedef struct cusz_custom_spcodec { + cusz_spcodectype 
type; + float presumed_density; +} cusz_custom_spcodec; + +////// wrap-up + +/** + * @deprecated The framework could be simplifed & unified. + */ +typedef struct cusz_custom_framework { + cusz_datatype datatype; + cusz_pipelinetype pipeline; + + cusz_custom_predictor predictor; + cusz_custom_quantization quantization; + cusz_custom_codec codec; + // cusz_custom_spcodec spcodec; + + cusz_custom_huffman_codec huffman; +} cusz_custom_framework; + +typedef cusz_custom_framework cusz_framework; + +typedef struct cusz_compressor_redundancy_compat_purpose { + void* compressor; + cusz_framework* framework; + cusz_datatype type; +} cusz_compressor_compat; + +typedef cusz_compressor_compat cusz_compressor; + +typedef struct cusz_runtime_config { + double eb; + cusz_mode mode; +} cusz_runtime_config; +typedef cusz_runtime_config cusz_config; + +typedef struct Res { + double min, max, rng, std; +} Res; + +typedef struct cusz_stats { + // clang-format off + Res odata, xdata; + struct { double PSNR, MSE, NRMSE, coeff; } reduced; + struct { double abs, rel, pwrrel; size_t idx; } max_err; + struct { double lag_one, lag_two; } autocor; + double user_eb; + size_t len; + // clang-format on +} cusz_stats; + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/qtensor/compression/cusz/include/framework.hh b/qtensor/compression/cusz/include/framework.hh new file mode 100644 index 00000000..9655fe25 --- /dev/null +++ b/qtensor/compression/cusz/include/framework.hh @@ -0,0 +1,62 @@ +/** + * @file framework.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * (create) 2021-10-06 (rev) 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_FRAMEWORK +#define CUSZ_FRAMEWORK + +#include "component.hh" +#include "compressor.hh" + +namespace cusz { + +template +struct Framework { + public: + /** + * + * Predictor + * | | ^ + * v | | + * Spcodec | +---- default "fast-lowlowprecision" + * v + * Encoder + */ + + using DATA = InputDataType; + using ERRCTRL = ErrCtrlTrait<4, false>::type; // predefined for mem. 
overlapping + using FP = typename FastLowPrecisionTrait::type; + using Huff4 = HuffTrait<4>::type; + using Huff8 = HuffTrait<8>::type; + using Meta4 = MetadataTrait<4>::type; + + template + struct CompressorTemplate; + + /* Predictor */ + using CompatPurposePredictor = typename cusz::PredictionUnified; + using Predictor = CompatPurposePredictor; + + using CompatPurposeSpcodec = typename cusz::SpcodecVec; + using Spcodec = CompatPurposeSpcodec; + + /* Lossless Codec*/ + using CodecHuffman32 = cusz::LosslessCodec; + using CodecHuffman64 = cusz::LosslessCodec; + using Codec = CodecHuffman32; + using FallbackCodec = CodecHuffman64; +}; + +using CompressorFP32 = cusz::Compressor>; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/header.h b/qtensor/compression/cusz/include/header.h new file mode 100644 index 00000000..05287edc --- /dev/null +++ b/qtensor/compression/cusz/include/header.h @@ -0,0 +1,111 @@ +#ifndef CUSZ_HEADER_H +#define CUSZ_HEADER_H + +/** + * @file header.h + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-01-22 + * (created) 2020-09-25, (rev.1) 2021-01-22 (rev.2) 2021-09-08 (rev.3) 2022-02-26 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +typedef struct alignas(128) cusz_header { + static const int HEADER = 0; + static const int ANCHOR = 1; + static const int VLE = 2; + static const int SPFMT = 3; + + static const int END = 4; + + uint32_t self_bytes : 16; + uint32_t fp : 1; + uint32_t byte_vle : 4; // 4, 8 + uint32_t nz_density_factor : 8; + uint32_t codecs_in_use : 2; + uint32_t vle_pardeg; + uint32_t x, y, z, w; + double eb; + uint32_t radius : 16; + + uint32_t entry[END + 1]; + + // uint32_t byte_uncompressed : 4; // T; 1, 2, 4, 8 + // uint32_t byte_errctrl : 3; // 1, 2, 4 + // uint32_t byte_meta : 4; // 4, 8 + // uint32_t ndim : 3; // 1,2,3,4 + // size_t data_len; + // size_t errctrl_len; + +} cusz_header; + +typedef cusz_header cuszHEADER; + +typedef struct alignas(128) v2_cusz_header { + // data segments + static const int HEADER = 0; + static const int ANCHOR = 1; + static const int SP_IDX = 2; + static const int SP_VAL = 3; + static const int HF = 4; + static const int END = 5; + uint32_t entry[END + 1]; + + struct { + uint32_t precision : 1; + } data; + + uint32_t x, y, z, w; + + // struct { + // uint32_t codecs_in_use : 2; + double eb; + uint32_t radius : 16; + // } config; + + struct { + uint32_t factor : 8; // density = 1/factor + uint32_t count; + } sp; + + struct { + uint32_t rep_bytes : 4; // 4, 8 + uint32_t sublen : 28; + uint32_t pardeg; + } hf; + + // TODO replace the following with hf.VAR + uint32_t vle_pardeg; + +} psz_header; + +#ifdef __cplusplus +} +#endif + +namespace cusz { + +using Header = cusz_header; +using header_t = cusz_header*; + +} // namespace cusz + +namespace psz { + +using v2_header = v2_cusz_header; + +} + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf.hh b/qtensor/compression/cusz/include/hf/hf.hh new file mode 100644 index 00000000..37438abb --- /dev/null +++ b/qtensor/compression/cusz/include/hf/hf.hh @@ -0,0 +1,170 @@ +/** + * @file codec.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPONENT_CODECS_HH +#define CUSZ_COMPONENT_CODECS_HH + +#include +#include +#include + 
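
`header.h` above describes the compressed archive as consecutive segments (`HEADER`, `ANCHOR`, `VLE`, `SPFMT`) whose byte offsets live in `entry[]`; as in `SpcodecVec::subfile_collect` earlier in this diff, `entry[]` is an exclusive prefix sum over per-segment sizes, so segment `i` occupies `[entry[i], entry[i+1])` and `entry[END]` is the total archive size. A minimal standalone illustration of that bookkeeping, with made-up segment sizes:

```cpp
// entry[] as an exclusive prefix sum of per-segment byte counts.
// Segment sizes here are hypothetical; only the offset arithmetic matters.
#include <cstdint>
#include <cstdio>

int main()
{
    enum { HEADER = 0, ANCHOR = 1, VLE = 2, SPFMT = 3, END = 4 };
    const char* name[END]  = {"HEADER", "ANCHOR", "VLE", "SPFMT"};
    uint32_t    nbyte[END] = {128, 4096, 1 << 20, 65536};

    uint32_t entry[END + 1];
    entry[0] = 0;
    for (int i = 1; i < END + 1; i++) entry[i] = entry[i - 1] + nbyte[i - 1];

    for (int i = 0; i < END; i++)
        std::printf("%-6s  at byte %8u, %8u bytes\n", name[i], entry[i], nbyte[i]);
    std::printf("total archive size: %u bytes (= entry[END])\n", entry[END]);
}
```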
+#include "hf/hf_struct.h" + +#define DEFINE_ARRAY(VAR, TYPE) \ + TYPE* d_##VAR{nullptr}; \ + TYPE* h_##VAR{nullptr}; + +namespace cusz { + +template +class LosslessCodec +// : CodecInterface +{ + public: + using Origin = T; + using Encoded = H; + using MetadataT = M; + using FreqT = uint32_t; + using BYTE = uint8_t; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~LosslessCodec(); // dtor + LosslessCodec(); // ctor + LosslessCodec(const LosslessCodec&); // copy ctor + LosslessCodec& operator=(const LosslessCodec&); // copy assign + LosslessCodec(LosslessCodec&&); // move ctor + LosslessCodec& operator=(LosslessCodec&&); // move assign + + void init(size_t const, int const, int const, bool dbg_print = false); + void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); + void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); + void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + float get_time_elapsed() const; + float get_time_book() const; + float get_time_lossless() const; +}; + +template +class LosslessCodec::impl { + public: + using Origin = T; + using Encoded = H; + using MetadataT = M; + using FreqT = uint32_t; + using BYTE = uint8_t; + + private: + using BOOK = H; + using SYM = T; + + // TODO shared header + struct alignas(128) Header { + static const int HEADER = 0; + static const int REVBOOK = 1; + static const int PAR_NBIT = 2; + static const int PAR_ENTRY = 3; + static const int BITSTREAM = 4; + static const int END = 5; + + int self_bytes : 16; + int booklen : 16; + int sublen; + int pardeg; + size_t uncompressed_len; + size_t total_nbit; + size_t total_ncell; // TODO change to uint32_t + MetadataT entry[END + 1]; + + MetadataT subfile_size() const { return entry[END]; } + }; + + struct runtime_encode_helper { + static const int TMP = 0; + static const int FREQ = 1; + static const int BOOK = 2; + static const int REVBOOK = 3; + static const int PAR_NBIT = 4; + static const int PAR_NCELL = 5; + static const int PAR_ENTRY = 6; + static const int BITSTREAM = 7; + static const int END = 8; + + uint32_t nbyte[END]; + }; + + using RTE = runtime_encode_helper; + using Header = struct Header; + + private: + // array + DEFINE_ARRAY(tmp, H); + DEFINE_ARRAY(compressed, BYTE); // alias in address + DEFINE_ARRAY(book, H); + DEFINE_ARRAY(revbook, BYTE); + + DEFINE_ARRAY(par_metadata, M); + DEFINE_ARRAY(par_nbit, M); + DEFINE_ARRAY(par_ncell, M); + DEFINE_ARRAY(par_entry, M); + + DEFINE_ARRAY(bitstream, H); + // helper + RTE rte; + // memory + static const int CELL_BITWIDTH = sizeof(H) * 8; + // timer + float milliseconds{0.0}; + float time_hist{0.0}, time_book{0.0}, time_lossless{0.0}; + + hf_book* book_desc; + hf_chunk* chunk_desc_d; + hf_chunk* chunk_desc_h; + hf_bitstream* bitstream_desc; + + public: + ~impl(); // dtor + impl(); // ctor + + // getter + float get_time_elapsed() const; + float get_time_book() const; + float get_time_lossless() const; + size_t get_workspace_nbyte(size_t) const; + size_t get_max_output_nbyte(size_t len) const; + static size_t get_revbook_nbyte(int); + // getter for internal array + H* expose_book() const; + BYTE* expose_revbook() const; + // compile-time + constexpr bool can_overlap_input_and_firstphase_encode(); + // public methods + void init(size_t const, int const, int const, bool dbg_print = false); + void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); + void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); + void decode(BYTE*, T*, cudaStream_t 
= nullptr, bool = true); + void clear_buffer(); + + private: + void subfile_collect(Header&, size_t const, int const, int const, int const, cudaStream_t stream = nullptr); + void dbg_println(const std::string, void*, int); +}; + +} // namespace cusz + +#undef DEFINE_ARRAY + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf_bookg.hh b/qtensor/compression/cusz/include/hf/hf_bookg.hh new file mode 100644 index 00000000..f6187164 --- /dev/null +++ b/qtensor/compression/cusz/include/hf/hf_bookg.hh @@ -0,0 +1,45 @@ +/** + * @file huffman_parbook.cuh + * @author Cody Rivera (cjrivera1@crimson.ua.edu) + * @brief Parallel Huffman Construction to generates canonical forward codebook (header). + * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) + * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". + * @version 0.1 + * @date 2020-09-20 + * Created on: 2020-06 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef PAR_HUFFMAN_H +#define PAR_HUFFMAN_H + +// Parallel huffman global memory and kernels +namespace asz { + +/** + * @brief get codebook and reverse codebook in parallel + * + * @tparam T input type + * @tparam H codebook type + * @param freq input device array; frequency + * @param codebook output device array; codebook for encoding + * @param dict_size dictionary size; len of freq or codebook + * @param reverse_codebook output device array; reverse codebook for decoding + * @param time_book the returned time + */ +template +void hf_buildbook_g( + uint32_t* freq, + int const booksize, + H* codebook, + uint8_t* reverse_codebook, + int const revbook_nbyte, + float* time_book, + cudaStream_t = nullptr); + +} // namespace asz + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf_codecg.hh b/qtensor/compression/cusz/include/hf/hf_codecg.hh new file mode 100644 index 00000000..faad837a --- /dev/null +++ b/qtensor/compression/cusz/include/hf/hf_codecg.hh @@ -0,0 +1,82 @@ +/** + * @file launch_lossless.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-06-13 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef ABAACE49_2C9E_4E3C_AEFF_B016276142E1 +#define ABAACE49_2C9E_4E3C_AEFF_B016276142E1 + +#include +#include + +#include "hf_struct.h" + +template +struct PackedWordByWidth; + +template <> +struct PackedWordByWidth<4> { + uint32_t word : 24; + uint32_t bits : 8; +}; + +template <> +struct PackedWordByWidth<8> { + uint64_t word : 56; + uint64_t bits : 8; +}; + +namespace asz { + +template +void hf_encode_coarse( + T* uncompressed, + H* d_internal_coded, + size_t const len, + uint32_t* d_freq, + H* d_book, + int const booklen, + H* d_bitstream, + M* d_par_metadata, + M* h_par_metadata, + int const sublen, + int const pardeg, + int numSMs, + uint8_t*& out_compressed, + size_t& out_compressed_len, + float& time_lossless, + cudaStream_t stream); + +template +void hf_encode_coarse_rev1( + T* uncompressed, + size_t const len, + hf_book* book_desc, + hf_bitstream* bitstream_desc, + uint8_t*& out_compressed, // 22-10-12 buggy + size_t& out_compressed_len, // 22-10-12 buggy + float& time_lossless, + cudaStream_t stream); + +template +void hf_decode_coarse( + H* d_bitstream, + uint8_t* d_revbook, + int const revbook_nbyte, + M* d_par_nbit, + M* d_par_entry, + int const sublen, + int const pardeg, + T* out_decompressed, + float& time_lossless, + 
cudaStream_t stream); + +} // namespace asz + +#endif /* ABAACE49_2C9E_4E3C_AEFF_B016276142E1 */ diff --git a/qtensor/compression/cusz/include/hf/hf_struct.h b/qtensor/compression/cusz/include/hf/hf_struct.h new file mode 100644 index 00000000..20ccf206 --- /dev/null +++ b/qtensor/compression/cusz/include/hf/hf_struct.h @@ -0,0 +1,53 @@ +/** + * @file hf_struct.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-14 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef DA6883A3_A70F_4690_A4FA_56644987725A +#define DA6883A3_A70F_4690_A4FA_56644987725A + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +// raw pointer array; regardless of being on host or device +typedef struct hf_book { + uint32_t* freq; + // undertermined on definition; could be uint32_t* and uint64_t* + void* book; + int booklen; +} hf_book; + +// typedef struct hf_revbook { +// } hf_revbook; + +typedef struct hf_chunk { + void* bits; // how many bits each chunk + void* cells; // how many cells each chunk + void* entries; // jump to the chunk +} hf_chunk; + +typedef struct hf_bitstream { + void* buffer; + void* bitstream; + hf_chunk* d_metadata; + hf_chunk* h_metadata; + int sublen; // data chunksize + int pardeg; // runtime paralleism degree + int numSMs; // number of streaming multiprocessor +} hf_bitstream; + +#ifdef __cplusplus +} +#endif + +#endif /* DA6883A3_A70F_4690_A4FA_56644987725A */ diff --git a/qtensor/compression/cusz/include/kernel/claunch_cuda.h b/qtensor/compression/cusz/include/kernel/claunch_cuda.h new file mode 100644 index 00000000..f160b5a3 --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/claunch_cuda.h @@ -0,0 +1,49 @@ +/** + * @file claunch_cuda.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-24 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef KERNEL_CUDA_H +#define KERNEL_CUDA_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include "../cusz/type.h" +// #include "../hf/hf_struct.h" + +#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream); \ + \ + cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream); + +C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +C_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef C_SPLINE3 + +#undef C_COARSE_HUFFMAN_DECODE + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh b/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh new file mode 100644 index 00000000..7d35d59e --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh @@ -0,0 +1,51 @@ +/** + * @file cpplaunch_cuda.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-27 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef COMPONENT_CALL_KERNEL_HH +#define 
COMPONENT_CALL_KERNEL_HH + +#include "../cusz/type.h" +#include "../hf/hf_struct.h" + +namespace cusz { + +// 22-10-27 revise later +template +cusz_error_status cpplaunch_construct_Spline3( + bool NO_R_SEPARATE, + T* data, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* eq, + dim3 const ec_len3, + double const eb, + int const radius, + float* time_elapsed, + cudaStream_t stream); + +// 22-10-27 revise later +template +cusz_error_status cpplaunch_reconstruct_Spline3( + T* xdata, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* eq, + dim3 const ec_len3, + double const eb, + int const radius, + float* time_elapsed, + cudaStream_t stream); + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/kernel/dryrun.cuh b/qtensor/compression/cusz/include/kernel/dryrun.cuh new file mode 100644 index 00000000..d32800c1 --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/dryrun.cuh @@ -0,0 +1,47 @@ +/** + * @file dryrun.cuh + * @author Jiannan Tian + * @brief cuSZ dryrun mode, checking data quality from lossy compression. + * @version 0.3 + * @date 2020-09-20 + * (create) 2020-05-14, (release) 2020-09-20, (rev1) 2021-01-25, (rev2) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_DRYRUN_CUH +#define CUSZ_KERNEL_DRYRUN_CUH + +namespace cusz { + +template +// template +__global__ void dualquant_dryrun_kernel(Data* in_data, Data* out_xdata, size_t len, FP ebx2_r, FP ebx2) +{ + { + constexpr auto NTHREAD = BLOCK / SEQ; + __shared__ Data shmem[BLOCK]; + auto id_base = blockIdx.x * BLOCK; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < len) { + shmem[threadIdx.x + i * NTHREAD] = round(in_data[id] * ebx2_r) * ebx2; + out_xdata[id] = shmem[threadIdx.x + i * NTHREAD]; + } + } + } + + // simplistic + // { + // auto id = blockIdx.x * blockDim.x + threadIdx.x; + // if (id < len) out_xdata[id] = round(in_data[id] * ebx2_r) * ebx2; + // } +} + +} // namespace cusz + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/kernel/launch_prototype.cuh b/qtensor/compression/cusz/include/kernel/launch_prototype.cuh new file mode 100644 index 00000000..e69de29b diff --git a/qtensor/compression/cusz/include/kernel/launch_spm.cuh b/qtensor/compression/cusz/include/kernel/launch_spm.cuh new file mode 100644 index 00000000..4f0bcdd9 --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/launch_spm.cuh @@ -0,0 +1,348 @@ +/** + * @file launch_sparse_method.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-06-13 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_LAUNCH_SPARSE_METHOD_CUH +#define CUSZ_LAUNCH_SPARSE_METHOD_CUH + +#include +#include + +#include "../common.hh" +#include "../utils.hh" +#include "../utils/cusparse_err.cuh" + +// #if CUDART_VERSION >= 11020 + +template +void launch_cusparse_gather_cuda11200_onward( + cusparseHandle_t handle, + T* in_dense, + uint32_t const num_rows, + uint32_t const num_cols, + cusparseDnMatDescr_t dnmat, + cusparseSpMatDescr_t spmat, + void* d_buffer, + size_t& d_buffer_size, + M* d_rowptr, + M* d_colidx, + T* d_val, + int64_t& nnz, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto gather11_init_mat = [&]() { + // create dense matrix wrapper + CHECK_CUSPARSE( + cusparseCreateDnMat(&dnmat, 
num_rows, num_cols, ld, in_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); + + // create CSR wrapper + CHECK_CUSPARSE(cusparseCreateCsr( + &spmat, num_rows, num_cols, 0, d_rowptr, nullptr, nullptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); + }; + + auto gather11_init_buffer = [&]() { + { // allocate an external buffer if needed + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseDenseToSparse_bufferSize( + handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, &d_buffer_size)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + + CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); + } + }; + + auto gather11_analysis = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseDenseToSparse_analysis(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + int64_t num_rows_tmp, num_cols_tmp; + + auto gather11_get_nnz = [&]() { + // get number of non-zero elements + CHECK_CUSPARSE(cusparseSpMatGetSize(spmat, &num_rows_tmp, &num_cols_tmp, &nnz)); + }; + + auto gather11_get_rowptr = [&]() { + // reset offsets, column indices, and values pointers + CHECK_CUSPARSE(cusparseCsrSetPointers(spmat, d_rowptr, d_colidx, d_val)); + }; + + auto gather11_dn2csr = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseDenseToSparse_convert(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + /********************************************************************************/ + milliseconds = 0; + + CHECK_CUSPARSE(cusparseCreate(&handle)); + if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // TODO move out + + gather11_init_mat(); + gather11_init_buffer(); + gather11_analysis(); + gather11_get_nnz(); + gather11_get_rowptr(); + gather11_dn2csr(); + + // destroy matrix/vector descriptors + CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); + CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); + CHECK_CUSPARSE(cusparseDestroy(handle)); +} + +// void SpcodecCSR::impl::scatter_CUDA_11020(BYTE* in_csr, T* out_dense, cudaStream_t stream, bool +// header_on_device) + +template +void launch_cusparse_scatter_cuda11200_onward( + cusparseHandle_t handle, + int* d_rowptr, + int* d_colidx, + T* d_val, + int const num_rows, + int const num_cols, + int const nnz, + cusparseDnMatDescr_t dnmat, + cusparseSpMatDescr_t spmat, + void* d_buffer, + size_t& d_buffer_size, + T* out_dense, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto scatter11_init_mat = [&]() { + CHECK_CUSPARSE(cusparseCreateCsr( + &spmat, num_rows, num_cols, nnz, d_rowptr, d_colidx, d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); + + CHECK_CUSPARSE( + cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, out_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); + }; + + auto scatter11_init_buffer = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + // allocate an external buffer if needed + CHECK_CUSPARSE( + cusparseSparseToDense_bufferSize(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, &d_buffer_size)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + + CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); + }; + + auto scatter11_csr2dn = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSparseToDense(handle, spmat, dnmat, 
CUSPARSE_SPARSETODENSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + /******************************************************************************/ + milliseconds = 0; + + CHECK_CUSPARSE(cusparseCreate(&handle)); + if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); + + scatter11_init_mat(); + scatter11_init_buffer(); + scatter11_csr2dn(); + + // destroy matrix/vector descriptors + CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); + CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); + CHECK_CUSPARSE(cusparseDestroy(handle)); +} + +// #elif CUDART_VERSION >= 10000 + +template +void launch_cusparse_gather_before_cuda11200( + cusparseHandle_t handle, + T* in_dense, + uint32_t const num_rows, + uint32_t const num_cols, + cusparseMatDescr_t mat_desc, + void* d_work, + size_t& lwork_in_bytes, + M* d_rowptr, + M* d_colidx, + T* d_val, + int& nnz, // int is for compatibility; cuSPARSE of CUDA 11 changed data type + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + float threshold{0}; + auto has_ext_stream{false}; + + /******************************************************************************/ + + auto gather10_init_and_probe = [&]() { + { // init + + CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. create rte.mat_desc + CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based + CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type + } + + { // probe + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csr_bufferSizeExt( + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, + &lwork_in_bytes)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + } + + if (nullptr != d_work) cudaFree(d_work); + CHECK_CUDA(cudaMalloc((void**)&d_work, lwork_in_bytes)); // TODO where to release d_work? + }; + + auto gather10_compute_rowptr_and_nnz = [&]() { // step 4 + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csrNnz( + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_rowptr, &nnz, d_work)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + }; + + auto gather10_compute_colidx_and_val = [&]() { // step 5 + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csr( // + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, d_work)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + }; + + /********************************************************************************/ + milliseconds = 0; + + if (stream) + has_ext_stream = true; + else + CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // 1. create stream + CHECK_CUSPARSE(cusparseCreate(&handle)); // 2. create handle + CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // 3. bind stream + + gather10_init_and_probe(); + gather10_compute_rowptr_and_nnz(); + if (nnz == 0) { return; } + gather10_compute_colidx_and_val(); + + // TODO no need to destroy? 
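
Both cuSPARSE code paths above end up with the same artifact: the dense outlier buffer pruned to CSR form (`rowptr`, `colidx`, `val`, plus `nnz`). The sketch below performs the equivalent conversion on the CPU for a tiny row-major matrix, treating exact zeros as the pruned entries; it is illustrative only, since the real launchers delegate this to `cusparseDenseToSparse_*` (CUDA 11.2+) or `cusparseSpruneDense2csr*` (older toolkits) as shown above.

```cpp
// CPU sketch of dense -> CSR pruning (rowptr / colidx / val), illustrative only.
#include <cstdio>
#include <vector>

int main()
{
    const int num_rows = 3, num_cols = 4;
    // hypothetical "outlier" matrix: mostly zeros
    float dense[num_rows][num_cols] = {{0, 2.5f, 0, 0}, {0, 0, 0, 0}, {7.f, 0, 0, -1.f}};

    std::vector<int>   rowptr{0};
    std::vector<int>   colidx;
    std::vector<float> val;

    for (int r = 0; r < num_rows; r++) {
        for (int c = 0; c < num_cols; c++)
            if (dense[r][c] != 0) { colidx.push_back(c); val.push_back(dense[r][c]); }
        rowptr.push_back((int)val.size());  // rowptr[r+1] = nonzeros seen so far
    }

    std::printf("nnz = %d\nrowptr:", rowptr.back());
    for (int x : rowptr) std::printf(" %d", x);
    std::printf("\ncolidx/val:");
    for (size_t i = 0; i < val.size(); i++) std::printf(" (%d, %.1f)", colidx[i], val[i]);
    std::printf("\n");
}
```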
+ if (handle) cusparseDestroy(handle); + if (mat_desc) cusparseDestroyMatDescr(mat_desc); + if ((not has_ext_stream) and stream) cudaStreamDestroy(stream); + /********************************************************************************/ +} + +// #endif + +template +void launch_cusparse_scatter_before_cuda11200( + cusparseHandle_t handle, + int* d_rowptr, + int* d_colidx, + T* d_val, + int const num_rows, + int const num_cols, + int const nnz, + cusparseMatDescr_t mat_desc, + void* d_buffer, + size_t& d_buffer_size, + T* out_dense, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto has_external_stream = false; + + /******************************************************************************/ + + auto scatter10_init = [&]() { + CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. create descr + CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based + CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type + }; + + auto scatter10_sparse2dense = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseScsr2dense(handle, num_rows, num_cols, mat_desc, d_val, d_rowptr, d_colidx, out_dense, ld)); + + t.timer_end(); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + }; + + /******************************************************************************/ + if (stream) + has_external_stream = true; + else + CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CHECK_CUSPARSE(cusparseCreate(&handle)); + CHECK_CUSPARSE(cusparseSetStream(handle, stream)); + + scatter10_init(); + scatter10_sparse2dense(); + + if (handle) cusparseDestroy(handle); + if (mat_desc) cusparseDestroyMatDescr(mat_desc); + if ((not has_external_stream) and stream) cudaStreamDestroy(stream); + /******************************************************************************/ +} + +#endif diff --git a/qtensor/compression/cusz/include/kernel/lorenzo_all.h b/qtensor/compression/cusz/include/kernel/lorenzo_all.h new file mode 100644 index 00000000..de9f087e --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/lorenzo_all.h @@ -0,0 +1,44 @@ +/** + * @file kernel_cuda.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef BD8A19DE_E881_4A26_9464_C51DAC6B14E1 +#define BD8A19DE_E881_4A26_9464_C51DAC6B14E1 + +#ifdef __cplusplus +extern "C" { +#endif + +#include "cusz/type.h" + +#define C_LORENZOI(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ + cudaStream_t stream); \ + cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream); + +C_LORENZOI(fp32, ui8, fp32, float, uint8_t, float); +C_LORENZOI(fp32, ui16, fp32, float, uint16_t, float); +C_LORENZOI(fp32, ui32, fp32, float, uint32_t, float); +C_LORENZOI(fp32, fp32, fp32, float, float, float); + +C_LORENZOI(fp64, ui8, fp64, double, uint8_t, double); +C_LORENZOI(fp64, ui16, fp64, double, uint16_t, double); +C_LORENZOI(fp64, ui32, fp64, double, uint32_t, double); +C_LORENZOI(fp64, fp32, fp64, double, float, double); + +#undef C_LORENZOI + +#ifdef __cplusplus +} +#endif + 
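
`lorenzo_all.h` above (and the C++ templates in `lorenzo_all.hh` that follow) expose the Lorenzo-I predictor pair: compression quantizes prediction errors under an error bound `eb` with a clamp `radius`, and decompression integrates the quantized deltas back. Below is a deliberately simplified 1-D, host-only sketch of that idea, using the same prequantization step as the dryrun kernel (`round(v * ebx2_r) * ebx2`) and left-neighbor prediction; outlier handling is omitted and this is not the CUDA implementation from the diff.

```cpp
// 1-D Lorenzo-I sketch: prequantize to multiples of 2*eb, delta against the
// left neighbor, shift by radius; reconstruction integrates and rescales,
// staying within eb of the input. Illustrative only.
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const double eb = 1e-2, ebx2 = 2 * eb, ebx2_r = 1 / ebx2;
    const int    radius = 512;

    std::vector<double> data{1.00, 1.02, 1.05, 1.04, 0.99};

    // compress: left-neighbor prediction on prequantized values
    std::vector<int> eq;
    long             prev = 0;
    for (double v : data) {
        long q     = std::lround(v * ebx2_r);  // prequantization
        long delta = q - prev;                 // Lorenzo-I prediction error
        prev       = q;
        eq.push_back((int)(delta + radius));   // assume |delta| < radius (no outlier)
    }

    // decompress: undo the shift, integrate, rescale
    prev = 0;
    for (size_t i = 0; i < eq.size(); i++) {
        prev += eq[i] - radius;
        double xdata = prev * ebx2;
        std::printf("x=%.4f  (orig %.4f, |err| <= eb=%.2g)\n", xdata, data[i], eb);
    }
}
```

In the real kernels the prediction is multi-dimensional and runs on the GPU, and deltas that fall outside the radius are routed to the sparse outlier path, which is what the `outlier`/`outlier_idx` arguments in the declarations above are for.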
+#endif /* BD8A19DE_E881_4A26_9464_C51DAC6B14E1 */ diff --git a/qtensor/compression/cusz/include/kernel/lorenzo_all.hh b/qtensor/compression/cusz/include/kernel/lorenzo_all.hh new file mode 100644 index 00000000..d87baffa --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/lorenzo_all.hh @@ -0,0 +1,96 @@ +/** + * @file kernel_cuda.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-01 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C8C37773_7EF2_439B_B0EF_14D0058DC714 +#define C8C37773_7EF2_439B_B0EF_14D0058DC714 + +#include +#include "cusz/type.h" + +template +cusz_error_status compress_predict_lorenzo_i( + T* const data, // input + dim3 const len3, // + double const eb, // input (config) + int const radius, // + EQ* const eq, // output + T* outlier, // + uint32_t* outlier_idx, // + uint32_t* num_outliers, // + float* time_elapsed, // optional + cudaStream_t stream); // + +template +cusz_error_status decompress_predict_lorenzo_i( + EQ* eq, // input + dim3 const len3, // + T* outlier, // + uint32_t* outlier_idx, // + uint32_t const num_outliers, // + double const eb, // input (config) + int const radius, // + T* xdata, // output + float* time_elapsed, // optional + cudaStream_t stream); + +namespace asz { +namespace experimental { + +template +cusz_error_status compress_predict_lorenzo_ivar( + T* data, + dim3 const len3, + double const eb, + DeltaT* delta, + bool* signum, + float* time_elapsed, + cudaStream_t stream); + +template +cusz_error_status decompress_predict_lorenzo_ivar( + DeltaT* delta, + bool* signum, + dim3 const len3, + double const eb, + T* xdata, + float* time_elapsed, + cudaStream_t stream); + +} // namespace experimental +} // namespace asz + +template +cusz_error_status compress_predict_lorenzo_iproto( + T* const data, // input + dim3 const len3, // + double const eb, // input (config) + int const radius, // + EQ* const eq, // output + T* outlier, // + uint32_t* outlier_idx, // + uint32_t* num_outliers, // + float* time_elapsed, // optional + cudaStream_t stream); // + +template +cusz_error_status decompress_predict_lorenzo_iproto( + EQ* eq, // input + dim3 const len3, // + T* outlier, // + uint32_t* outlier_idx, // + uint32_t const num_outliers, // + double const eb, // input (config) + int const radius, // + T* xdata, // output + float* time_elapsed, // optional + cudaStream_t stream); + +#endif /* C8C37773_7EF2_439B_B0EF_14D0058DC714 */ diff --git a/qtensor/compression/cusz/include/kernel/spv_gpu.h b/qtensor/compression/cusz/include/kernel/spv_gpu.h new file mode 100644 index 00000000..496dd4eb --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/spv_gpu.h @@ -0,0 +1,42 @@ +/** + * @file spv_gpu.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 +#define B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#define SPV(Tliteral, Mliteral, T, M) \ + void spv_gather_T##Tliteral##_M##Mliteral( \ + T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream); \ + \ + void spv_scatter_T##Tliteral##_M##Mliteral( \ + T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, 
uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV + +#ifdef __cplusplus +} +#endif + +#endif /* B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 */ diff --git a/qtensor/compression/cusz/include/kernel/spv_gpu.hh b/qtensor/compression/cusz/include/kernel/spv_gpu.hh new file mode 100644 index 00000000..c2f021df --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/spv_gpu.hh @@ -0,0 +1,33 @@ +/** + * @file spv_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef A54D2009_1D4F_4113_9E26_9695A3669224 +#define A54D2009_1D4F_4113_9E26_9695A3669224 +#include + +namespace psz { + +template +void spv_gather( + T* in, + size_t const in_len, + T* d_val, + uint32_t* d_idx, + int* nnz, + float* milliseconds, + cudaStream_t stream); + +template +void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); + +} // namespace psz + +#endif /* A54D2009_1D4F_4113_9E26_9695A3669224 */ diff --git a/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh b/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh new file mode 100644 index 00000000..7c8d4ce0 --- /dev/null +++ b/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh @@ -0,0 +1,32 @@ +/** + * @file v2_lorenzo.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CD52BDA6_9376_43FF_BFDA_693204FA8762 +#define CD52BDA6_9376_43FF_BFDA_693204FA8762 + +#include "compaction.hh" +#include "cusz/type.h" + +template +cusz_error_status v2_compress_predict_lorenzo_i( + T* const data, // input + dim3 const data_len3, // + double const eb, // input (config) + int const radius, // + E* const eq, // output + dim3 const eq_len3, // + T* const anchor, // + dim3 const anchor_len3, // + CompactionDRAM outlier, // + float* time_elapsed, // optional + cudaStream_t stream); // + +#endif /* CD52BDA6_9376_43FF_BFDA_693204FA8762 */ diff --git a/qtensor/compression/cusz/include/pipeline/compaction_g.inl b/qtensor/compression/cusz/include/pipeline/compaction_g.inl new file mode 100644 index 00000000..7a854101 --- /dev/null +++ b/qtensor/compression/cusz/include/pipeline/compaction_g.inl @@ -0,0 +1,73 @@ +/** + * @file compaction_g.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F712F74C_7488_4445_83EE_EE7F88A64BBA +#define F712F74C_7488_4445_83EE_EE7F88A64BBA + +#include +#include +#include "compaction.hh" + +#include +#include + +// TODO filename -> `compaction` +template +struct CompactionDRAM { + using type = T; + T* val; + uint32_t* idx; + uint32_t* count; + uint32_t* h_count; + + void allocate(size_t len, bool device = true) + { + if (device) { + cudaMalloc(&idx, sizeof(uint32_t) * len); + cudaMalloc(&val, sizeof(T) * len); + cudaMalloc(&count, sizeof(T) * 1); + cudaMallocHost(&h_count, sizeof(T) * 1); + } + else { + cudaMallocHost(&idx, sizeof(uint32_t) * len); + cudaMallocHost(&val, sizeof(T) * len); + cudaMallocHost(&count, sizeof(T) * 1); + + memset(count, 0x0, sizeof(T) * 1); + } + } + + void make_count_host_accessible(cudaStream_t stream) + { + cudaMemcpyAsync(h_count, count, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream); + } + + uint32_t access_count_on_host() { return *h_count; } + + void allocate_managed(size_t len) + { + 
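+        // Annotation: the managed variant mirrors allocate() -- val/idx hold the
+        // compacted elements and their linear indices, and count is a one-element
+        // running counter. cudaMallocManaged lets host and device read these without
+        // the explicit h_count mirror used by the device-memory path; note that the
+        // uint32_t counter is sized with sizeof(T), exactly as in allocate().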
cudaMallocManaged(&idx, sizeof(uint32_t) * len); + cudaMallocManaged(&val, sizeof(T) * len); + cudaMallocManaged(&count, sizeof(T) * 1); + + cudaMemset(count, 0x0, sizeof(T) * 1); + } + + void destroy() + { + if (h_count) cudaFreeHost(h_count); + cudaFree(idx); + cudaFree(val); + cudaFree(count); + } +}; + +#endif /* F712F74C_7488_4445_83EE_EE7F88A64BBA */ diff --git a/qtensor/compression/cusz/include/pipeline/v2_compressor.hh b/qtensor/compression/cusz/include/pipeline/v2_compressor.hh new file mode 100644 index 00000000..5e0c8a83 --- /dev/null +++ b/qtensor/compression/cusz/include/pipeline/v2_compressor.hh @@ -0,0 +1,146 @@ +/** + * @file v2_compressor.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-29 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include + +#include "common/type_traits.hh" +#include "compaction.hh" +#include "component.hh" +#include "context.hh" +#include "header.h" + +// TODO move outward +#include "compaction_g.inl" + +using Context = cusz::Context; + +namespace psz { + +template +class v2_Compressor { + public: + using BYTE = uint8_t; + + using T = typename CONFIG::Predictor::Origin; + using FP = typename CONFIG::Predictor::Precision; + using E = typename CONFIG::Predictor::ErrCtrl; + using H = typename CONFIG::Codec::Encoded; + using M = typename CONFIG::Codec::MetadataT; + using H_FB = typename CONFIG::FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~v2_Compressor(); + v2_Compressor(); + v2_Compressor(const v2_Compressor&); + v2_Compressor& operator=(const v2_Compressor&); + v2_Compressor(v2_Compressor&&); + v2_Compressor& operator=(v2_Compressor&&); + + // methods + void init(Context*); + void init(v2_header*); + void destroy(); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + // getter + void export_header(v2_header&); + void export_header(v2_header*); + void export_timerecord(TimeRecord*); +}; + +template +class v2_Compressor::impl { + public: + using Codec = typename CONFIG::Codec; + using BYTE = uint8_t; + using T = typename CONFIG::Predictor::Origin; + using FP = typename CONFIG::Predictor::Precision; + using EQ = uint32_t; + using H = typename CONFIG::Codec::Encoded; + using M = uint32_t; + using IDX = uint32_t; + using H_FB = typename CONFIG::FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + // state + // bool use_fallback_codec{false}; + // bool fallback_codec_allocated{false}; + + BYTE* d_reserved_for_archive{nullptr}; + + // profiling + // TimeRecord timerecord; + // header + v2_header header; + // components + + Codec* codec; + + // arrays + T* d_anchor; + uint32_t* d_errctrl; + uint32_t* d_freq; + CompactionDRAM outlier; + + int sp_factor{20}; + + struct { + float construct, hist, encode; + } comp_time; + + struct { + float scatter, decode, reconstruct; + } decomp_time; + + dim3 data_len3; + size_t data_len; + + public: + ~impl(); + impl(); + + // public methods + void init(Context* config); + void init(v2_header* config); + + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + + // getter + void export_header(v2_header&); + void export_header(v2_header*); + 
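+    // Annotation: the enclosing wrapper class forwards to this impl. A minimal
+    // round trip through the public v2_Compressor interface, assuming a populated
+    // cusz::Context* `ctx`, a device input buffer `d_data`, a device output buffer
+    // `d_xdata`, and a CUDA `stream` (all names here are illustrative):
+    //
+    //   psz::v2_Compressor<SomeConfig> comp;   // SomeConfig: a framework type list
+    //   comp.init(ctx);
+    //   uint8_t* archive; size_t archive_len;
+    //   comp.compress(ctx, d_data, archive, archive_len, stream);
+    //   v2_header header;
+    //   comp.export_header(header);
+    //   comp.decompress(&header, archive, d_xdata, stream);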
// void export_timerecord(TimeRecord*); + BYTE* var_archive() { return d_reserved_for_archive; }; + + private: + // helper + template + void __init(ContextOrHeader*); + + // void collect_compress_timerecord(); + // void collect_decompress_timerecord(); + void destroy(); + // getter +}; + +} // namespace psz diff --git a/qtensor/compression/cusz/include/stat/compare.h b/qtensor/compression/cusz/include/stat/compare.h new file mode 100644 index 00000000..bc60fb0b --- /dev/null +++ b/qtensor/compression/cusz/include/stat/compare.h @@ -0,0 +1,57 @@ +/** + * @file compare.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CE05A256_23CB_4243_8839_B1FDA9C540D2 +#define CE05A256_23CB_4243_8839_B1FDA9C540D2 + +#ifdef __cplus_plus +extern "C" { +#endif + +#include +#include +#include "../cusz/type.h" + +#define DESCRIPTION(Tliteral, T) void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]); + +#define COMPARE_LOSSLESS(Tliteral, T) \ + bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len); \ + bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len); + +#define COMPARE_LOSSY(Tliteral, T) \ + bool cppstd_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ + void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); \ + \ + bool thrustgpu_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); + +DESCRIPTION(ui8, uint8_t) +DESCRIPTION(ui16, uint16_t) +DESCRIPTION(ui32, uint32_t) +DESCRIPTION(fp32, float) +DESCRIPTION(fp64, double) + +COMPARE_LOSSLESS(fp32, float) +COMPARE_LOSSLESS(fp64, double) +COMPARE_LOSSLESS(ui8, uint8_t) +COMPARE_LOSSLESS(ui16, uint16_t) +COMPARE_LOSSLESS(ui32, uint32_t) + +COMPARE_LOSSY(fp32, float) +COMPARE_LOSSY(fp64, double) + +#undef CPPSTD_COMPARE + +#ifdef __cplus_plus +} +#endif + +#endif /* CE05A256_23CB_4243_8839_B1FDA9C540D2 */ diff --git a/qtensor/compression/cusz/include/stat/compare_cpu.hh b/qtensor/compression/cusz/include/stat/compare_cpu.hh new file mode 100644 index 00000000..3cd6c421 --- /dev/null +++ b/qtensor/compression/cusz/include/stat/compare_cpu.hh @@ -0,0 +1,62 @@ +/** + * @file compare_cpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C93C3857_8821_4988_B6F0_4E885060F642 +#define C93C3857_8821_4988_B6F0_4E885060F642 + +#include "compare.h" + +namespace psz { + +template +bool cppstd_identical(T* d1, T* d2, size_t const len); + +template +bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); + +template +void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); + +} // namespace psz + +#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ + template <> \ + bool psz::cppstd_identical(T * d1, T * d2, size_t const len) \ + { \ + return cppstd_identical_T##Tliteral(d1, d2, len); \ + } + +#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ + template <> \ + bool psz::cppstd_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ + { \ + return cppstd_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ + } \ + \ + template <> \ + void 
psz::cppstd_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + cppstd_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +CPPSTD_COMPARE_LOSSLESS(fp32, float) +CPPSTD_COMPARE_LOSSLESS(fp64, double) +CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) +CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) +CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) + +CPPSTD_COMPARE_LOSSY(fp32, float); +CPPSTD_COMPARE_LOSSY(fp64, double); + +#undef CPPSTD_COMPARE_LOSSLESS +#undef CPPSTD_COMPARE_LOSSY + +#endif /* C93C3857_8821_4988_B6F0_4E885060F642 */ diff --git a/qtensor/compression/cusz/include/stat/compare_gpu.hh b/qtensor/compression/cusz/include/stat/compare_gpu.hh new file mode 100644 index 00000000..78013ca7 --- /dev/null +++ b/qtensor/compression/cusz/include/stat/compare_gpu.hh @@ -0,0 +1,33 @@ +/** + * @file compare_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B0EE0E82_B3AA_4946_A589_A3A6A83DD862 +#define B0EE0E82_B3AA_4946_A589_A3A6A83DD862 + +#include "compare.h" + +namespace psz { + +template +void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]); + +template +bool thrustgpu_identical(T* d1, T* d2, size_t const len); + +template +bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); + +template +void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); + +} // namespace psz + +#endif /* B0EE0E82_B3AA_4946_A589_A3A6A83DD862 */ diff --git a/qtensor/compression/cusz/include/stat/stat.h b/qtensor/compression/cusz/include/stat/stat.h new file mode 100644 index 00000000..ade8deea --- /dev/null +++ b/qtensor/compression/cusz/include/stat/stat.h @@ -0,0 +1,29 @@ +/** + * @file stat.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef BBBB5712_FF60_4262_B927_85B113FD26BA +#define BBBB5712_FF60_4262_B927_85B113FD26BA + +#include "cusz/type.h" + +#define HIST_C(Tname, T) \ + cusz_error_status histogram_T##Tname( \ + T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ + cudaStream_t stream); + +HIST_C(ui8, uint8_t) +HIST_C(ui16, uint16_t) +HIST_C(ui32, uint32_t) +HIST_C(ui64, uint64_t) + +#undef HIST_C + +#endif /* BBBB5712_FF60_4262_B927_85B113FD26BA */ diff --git a/qtensor/compression/cusz/include/stat/stat.hh b/qtensor/compression/cusz/include/stat/stat.hh new file mode 100644 index 00000000..fedf6417 --- /dev/null +++ b/qtensor/compression/cusz/include/stat/stat.hh @@ -0,0 +1,15 @@ +/** + * @file stat.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B005D07B_D92D_4DF0_90D0_87A7B7C310C9 +#define B005D07B_D92D_4DF0_90D0_87A7B7C310C9 + +#endif /* B005D07B_D92D_4DF0_90D0_87A7B7C310C9 */ diff --git a/qtensor/compression/cusz/include/stat/stat_g.hh b/qtensor/compression/cusz/include/stat/stat_g.hh new file mode 100644 index 00000000..45f2f84d --- /dev/null +++ b/qtensor/compression/cusz/include/stat/stat_g.hh @@ -0,0 +1,44 @@ +/** + * @file stat_g.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 +#define D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 + +#include +#include "cusz/type.h" 
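+// Annotation: usage sketch for the histogram wrapper declared below; `d_codes`,
+// `len`, `nbin`, and `stream` are illustrative caller-side names, not part of the
+// library:
+//
+//   uint32_t* d_freq;
+//   cudaMalloc(&d_freq, nbin * sizeof(uint32_t));
+//   cudaMemset(d_freq, 0, nbin * sizeof(uint32_t));   // start from empty buckets
+//   float ms = 0;
+//   asz::stat::histogram<uint16_t>(d_codes, len, d_freq, nbin, &ms, stream);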
+ +namespace asz { +namespace stat { + +/** + * @brief Get frequency: a kernel wrapper + * + * @tparam T input type + * @param in_data input device array + * @param in_len input host var; len of in_data + * @param out_freq output device array + * @param nbin input host var; len of out_freq + * @param milliseconds output time elapsed + * @param stream optional stream + */ +template +cusz_error_status histogram( + T* in_data, + size_t const in_len, + uint32_t* out_freq, + int const nbin, + float* milliseconds, + cudaStream_t stream = nullptr); + +} // namespace stat +} // namespace asz + +#endif /* D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 */ diff --git a/qtensor/compression/cusz/include/utils.hh b/qtensor/compression/cusz/include/utils.hh new file mode 100644 index 00000000..fd15517c --- /dev/null +++ b/qtensor/compression/cusz/include/utils.hh @@ -0,0 +1,21 @@ +/** + * @file utils.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-07-12 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef UTILS_HH +#define UTILS_HH + +#include "utils/cuda_err.cuh" +#include "utils/cuda_mem.cuh" +#include "utils/format.hh" +#include "utils/io.hh" +#include "utils/strhelper.hh" + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/utils/cuda_err.cuh b/qtensor/compression/cusz/include/utils/cuda_err.cuh new file mode 100644 index 00000000..5b80b04b --- /dev/null +++ b/qtensor/compression/cusz/include/utils/cuda_err.cuh @@ -0,0 +1,185 @@ +#ifndef CUDA_ERR_CUH +#define CUDA_ERR_CUH + +/** + * @file cuda_err.cuh + * @author Jiannan Tian + * @brief CUDA runtime error handling macros. + * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-10-08 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include + +struct cusz_cuda_exception : public std::exception { + cusz_cuda_exception(const char* err, int err_code, const char* file, int line) { + std::stringstream ss; + ss << "CUDA API failed at \e[31m\e[1m" << file << ':' << line << "\e[0m with error: " << err << '(' << err_code << ')'; + err_msg = ss.str(); + } + const char* what() const noexcept { + return err_msg.c_str(); + } + std::string err_msg; +}; + +// back compatibility start +static void HandleError(cudaError_t err, const char* file, int line) +{ + if (err != cudaSuccess) { + throw cusz_cuda_exception(cudaGetErrorString(err), err, file, line); + } +} +#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) +// back compatibility end + +static void check_cuda_error(cudaError_t status, const char* file, int line) +{ + if (cudaSuccess != status) { + /* + printf("\nCUDA error/status reference (as of CUDA 11):\n"); + printf("cudaSuccess -> %d\n", cudaSuccess); + printf("cudaErrorInvalidValue -> %d\n", cudaErrorInvalidValue); + printf("cudaErrorMemoryAllocation -> %d\n", cudaErrorMemoryAllocation); + printf("cudaErrorInitializationError -> %d\n", cudaErrorInitializationError); + printf("cudaErrorCudartUnloading -> %d\n", cudaErrorCudartUnloading); + printf("cudaErrorProfilerDisabled -> %d\n", cudaErrorProfilerDisabled); + printf("cudaErrorProfilerNotInitialized (Deprecated)-> %d\n", cudaErrorProfilerNotInitialized); + printf("cudaErrorProfilerAlreadyStarted (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStarted); + printf("cudaErrorProfilerAlreadyStopped (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStopped); + 
printf("cudaErrorInvalidConfiguration -> %d\n", cudaErrorInvalidConfiguration); + printf("cudaErrorInvalidPitchValue -> %d\n", cudaErrorInvalidPitchValue); + printf("cudaErrorInvalidSymbol -> %d\n", cudaErrorInvalidSymbol); + printf("cudaErrorInvalidHostPointer (Deprecated)-> %d\n", cudaErrorInvalidHostPointer); + printf("cudaErrorInvalidDevicePointer (Deprecated)-> %d\n", cudaErrorInvalidDevicePointer); + printf("cudaErrorInvalidTexture -> %d\n", cudaErrorInvalidTexture); + printf("cudaErrorInvalidTextureBinding -> %d\n", cudaErrorInvalidTextureBinding); + printf("cudaErrorInvalidChannelDescriptor -> %d\n", cudaErrorInvalidChannelDescriptor); + printf("cudaErrorInvalidMemcpyDirection -> %d\n", cudaErrorInvalidMemcpyDirection); + printf("cudaErrorAddressOfConstant (Deprecated)-> %d\n", cudaErrorAddressOfConstant); + printf("cudaErrorTextureFetchFailed (Deprecated)-> %d\n", cudaErrorTextureFetchFailed); + printf("cudaErrorTextureNotBound (Deprecated)-> %d\n", cudaErrorTextureNotBound); + printf("cudaErrorSynchronizationError (Deprecated)-> %d\n", cudaErrorSynchronizationError); + printf("cudaErrorInvalidFilterSetting -> %d\n", cudaErrorInvalidFilterSetting); + printf("cudaErrorInvalidNormSetting -> %d\n", cudaErrorInvalidNormSetting); + printf("cudaErrorMixedDeviceExecution (Deprecated)-> %d\n", cudaErrorMixedDeviceExecution); + printf("cudaErrorNotYetImplemented (Deprecated)-> %d\n", cudaErrorNotYetImplemented); + printf("cudaErrorMemoryValueTooLarge (Deprecated)-> %d\n", cudaErrorMemoryValueTooLarge); + printf("cudaErrorInsufficientDriver -> %d\n", cudaErrorInsufficientDriver); + printf("cudaErrorInvalidSurface -> %d\n", cudaErrorInvalidSurface); + printf("cudaErrorDuplicateVariableName -> %d\n", cudaErrorDuplicateVariableName); + printf("cudaErrorDuplicateTextureName -> %d\n", cudaErrorDuplicateTextureName); + printf("cudaErrorDuplicateSurfaceName -> %d\n", cudaErrorDuplicateSurfaceName); + printf("cudaErrorDevicesUnavailable -> %d\n", cudaErrorDevicesUnavailable); + printf("cudaErrorIncompatibleDriverContext -> %d\n", cudaErrorIncompatibleDriverContext); + printf("cudaErrorMissingConfiguration -> %d\n", cudaErrorMissingConfiguration); + printf("cudaErrorPriorLaunchFailure (Deprecated)-> %d\n", cudaErrorPriorLaunchFailure); + printf("cudaErrorLaunchMaxDepthExceeded -> %d\n", cudaErrorLaunchMaxDepthExceeded); + printf("cudaErrorLaunchFileScopedTex -> %d\n", cudaErrorLaunchFileScopedTex); + printf("cudaErrorLaunchFileScopedSurf -> %d\n", cudaErrorLaunchFileScopedSurf); + printf("cudaErrorSyncDepthExceeded -> %d\n", cudaErrorSyncDepthExceeded); + printf("cudaErrorLaunchPendingCountExceeded -> %d\n", cudaErrorLaunchPendingCountExceeded); + printf("cudaErrorInvalidDeviceFunction -> %d\n", cudaErrorInvalidDeviceFunction); + printf("cudaErrorNoDevice -> %d\n", cudaErrorNoDevice); + printf("cudaErrorInvalidDevice -> %d\n", cudaErrorInvalidDevice); + printf("cudaErrorStartupFailure -> %d\n", cudaErrorStartupFailure); + printf("cudaErrorInvalidKernelImage -> %d\n", cudaErrorInvalidKernelImage); + #if (CUDART_VERSION == 1100) + printf("cudaErrorDeviceUninitialized -> %d\n", cudaErrorDeviceUninitialized); + #endif + printf("cudaErrorMapBufferObjectFailed -> %d\n", cudaErrorMapBufferObjectFailed); + printf("cudaErrorUnmapBufferObjectFailed -> %d\n", cudaErrorUnmapBufferObjectFailed); + #if (CUDART_VERSION == 1010) + printf("cudaErrorArrayIsMapped -> %d\n", cudaErrorArrayIsMapped); + printf("cudaErrorAlreadyMapped -> %d\n", cudaErrorAlreadyMapped); + #endif + printf("cudaErrorNoKernelImageForDevice -> 
%d\n", cudaErrorNoKernelImageForDevice); + #if (CUDART_VERSION == 1010) + printf("cudaErrorAlreadyAcquired -> %d\n", cudaErrorAlreadyAcquired); + printf("cudaErrorNotMapped -> %d\n", cudaErrorNotMapped); + printf("cudaErrorNotMappedAsArray -> %d\n", cudaErrorNotMappedAsArray); + printf("cudaErrorNotMappedAsPointer -> %d\n", cudaErrorNotMappedAsPointer); + #endif + printf("cudaErrorECCUncorrectable -> %d\n", cudaErrorECCUncorrectable); + printf("cudaErrorUnsupportedLimit -> %d\n", cudaErrorUnsupportedLimit); + printf("cudaErrorDeviceAlreadyInUse -> %d\n", cudaErrorDeviceAlreadyInUse); + printf("cudaErrorPeerAccessUnsupported -> %d\n", cudaErrorPeerAccessUnsupported); + printf("cudaErrorInvalidPtx -> %d\n", cudaErrorInvalidPtx); + printf("cudaErrorInvalidGraphicsContext -> %d\n", cudaErrorInvalidGraphicsContext); + printf("cudaErrorNvlinkUncorrectable -> %d\n", cudaErrorNvlinkUncorrectable); + printf("cudaErrorJitCompilerNotFound -> %d\n", cudaErrorJitCompilerNotFound); + #if (CUDART_VERSION == 1010) + printf("cudaErrorInvalidSource -> %d\n", cudaErrorInvalidSource); + printf("cudaErrorFileNotFound -> %d\n", cudaErrorFileNotFound); + #endif + printf("cudaErrorSharedObjectSymbolNotFound -> %d\n", cudaErrorSharedObjectSymbolNotFound); + printf("cudaErrorSharedObjectInitFailed -> %d\n", cudaErrorSharedObjectInitFailed); + printf("cudaErrorOperatingSystem -> %d\n", cudaErrorOperatingSystem); + printf("cudaErrorInvalidResourceHandle -> %d\n", cudaErrorInvalidResourceHandle); + #if (CUDART_VERSION == 1010) + printf("cudaErrorIllegalState -> %d\n", cudaErrorIllegalState); + printf("cudaErrorSymbolNotFound -> %d\n", cudaErrorSymbolNotFound); + #endif + printf("cudaErrorNotReady -> %d\n", cudaErrorNotReady); + printf("cudaErrorIllegalAddress -> %d\n", cudaErrorIllegalAddress); + printf("cudaErrorLaunchOutOfResources -> %d\n", cudaErrorLaunchOutOfResources); + printf("cudaErrorLaunchTimeout -> %d\n", cudaErrorLaunchTimeout); + #if (CUDART_VERSION == 1010) + printf("cudaErrorLaunchIncompatibleTexturing-> %d\n", cudaErrorLaunchIncompatibleTexturing); + #endif + printf("cudaErrorPeerAccessAlreadyEnabled -> %d\n", cudaErrorPeerAccessAlreadyEnabled); + printf("cudaErrorPeerAccessNotEnabled -> %d\n", cudaErrorPeerAccessNotEnabled); + printf("cudaErrorSetOnActiveProcess -> %d\n", cudaErrorSetOnActiveProcess); + #if (CUDART_VERSION == 1010) + printf("cudaErrorContextIsDestroyed -> %d\n", cudaErrorContextIsDestroyed); + #endif + printf("cudaErrorAssert -> %d\n", cudaErrorAssert); + printf("cudaErrorTooManyPeers -> %d\n", cudaErrorTooManyPeers); + printf("cudaErrorHostMemoryAlreadyRegistered-> %d\n", cudaErrorHostMemoryAlreadyRegistered); + printf("cudaErrorHostMemoryNotRegistered -> %d\n", cudaErrorHostMemoryNotRegistered); + printf("cudaErrorHardwareStackError -> %d\n", cudaErrorHardwareStackError); + printf("cudaErrorIllegalInstruction -> %d\n", cudaErrorIllegalInstruction); + printf("cudaErrorMisalignedAddress -> %d\n", cudaErrorMisalignedAddress); + printf("cudaErrorInvalidAddressSpace -> %d\n", cudaErrorInvalidAddressSpace); + printf("cudaErrorInvalidPc -> %d\n", cudaErrorInvalidPc); + printf("cudaErrorLaunchFailure -> %d\n", cudaErrorLaunchFailure); + printf("cudaErrorCooperativeLaunchTooLarge -> %d\n", cudaErrorCooperativeLaunchTooLarge); + printf("cudaErrorNotPermitted -> %d\n", cudaErrorNotPermitted); + printf("cudaErrorNotSupported -> %d\n", cudaErrorNotSupported); + #if (CUDART_VERSION == 1010) + printf("cudaErrorSystemNotReady -> %d\n", cudaErrorSystemNotReady); + 
printf("cudaErrorSystemDriverMismatch -> %d\n", cudaErrorSystemDriverMismatch); + printf("cudaErrorCompatNotSupportedOnDevice -> %d\n", cudaErrorCompatNotSupportedOnDevice); + printf("cudaErrorStreamCaptureUnsupported -> %d\n", cudaErrorStreamCaptureUnsupported); + printf("cudaErrorStreamCaptureInvalidated -> %d\n", cudaErrorStreamCaptureInvalidated); + printf("cudaErrorStreamCaptureMerge -> %d\n", cudaErrorStreamCaptureMerge); + printf("cudaErrorStreamCaptureUnmatched -> %d\n", cudaErrorStreamCaptureUnmatched); + printf("cudaErrorStreamCaptureUnjoined -> %d\n", cudaErrorStreamCaptureUnjoined); + printf("cudaErrorStreamCaptureIsolation -> %d\n", cudaErrorStreamCaptureIsolation); + printf("cudaErrorStreamCaptureImplicit -> %d\n", cudaErrorStreamCaptureImplicit); + printf("cudaErrorCapturedEvent -> %d\n", cudaErrorCapturedEvent); + printf("cudaErrorStreamCaptureWrongThread -> %d\n", cudaErrorStreamCaptureWrongThread); + #endif + #if (CUDART_VERSION == 1100) + printf("cudaErrorTimeout -> %d\n", cudaErrorTimeout); + printf("cudaErrorGraphExecUpdateFailure -> %d\n", cudaErrorGraphExecUpdateFailure); + #endif + printf("cudaErrorUnknown -> %d\n", cudaErrorUnknown); + printf("cudaErrorApiFailureBase (Deprecated)-> %d\n", cudaErrorApiFailureBase); + */ + throw cusz_cuda_exception(cudaGetErrorString(status), status, file, line); + } +} + +#define CHECK_CUDA(err) (check_cuda_error(err, __FILE__, __LINE__)) + +#endif diff --git a/qtensor/compression/cusz/include/utils/cuda_mem.cuh b/qtensor/compression/cusz/include/utils/cuda_mem.cuh new file mode 100644 index 00000000..46e52e33 --- /dev/null +++ b/qtensor/compression/cusz/include/utils/cuda_mem.cuh @@ -0,0 +1,100 @@ +#ifndef UTILS_CUDA_MEM_CUH +#define UTILS_CUDA_MEM_CUH + +/** + * @file cuda_mem.cuh + * @author Jiannan Tian + * @brief CUDA memory operation wrappers. 
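+ *
+ * Annotation -- a usage sketch for the copy helpers declared later in this header
+ * (h_x and len are illustrative caller-side names):
+ *
+ *   float* d_x = mem::create_devspace_memcpy_h2d(h_x, len);  // device copy of h_x
+ *   float* h_y = mem::create_devspace_memcpy_d2h(d_x, len);  // pinned host copy
+ *   cudaFree(d_x);
+ *   cudaFreeHost(h_y);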
+ * @version 0.2 + * @date 2020-09-20 + * Created on 2020-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include + +template +static inline bool __is_aligned_at(const void* ptr) +{ // + return reinterpret_cast(ptr) % NUM == 0; +}; + +template +static size_t __cusz_get_alignable_len(size_t len) +{ + return ((sizeof(T) * len - 1) / NUM + 1) * NUM; +} + +static const int CUSZ_ALIGN_NUM = 128; + +/** + * @brief when using memory pool, alignment at 128 is necessary + * + * @tparam SRC + * @tparam DST + * @param src + * @return DST* + */ +template +DST* designate(SRC* src) +{ + // TODO check alignment + auto aligned = __is_aligned_at(src); + if (not aligned) throw std::runtime_error("not aligned at " + std::to_string(CUSZ_ALIGN_NUM) + " bytes"); + + return reinterpret_cast(src); +} + +template +DST* free_repurpose(SRC* src) +{ + // aligning at 4 byte; does not raise misalignment + // may not result in optimal performance considering coalescing + auto aligned = __is_aligned_at<4>(src); + if (not aligned) throw std::runtime_error("not aligned at 4 bytes"); + + return reinterpret_cast(src); +} + +namespace mem { + +enum MemcpyDirection { h2d, d2h }; + +template +inline T* create_CUDA_space(size_t len, uint8_t filling_val = 0x00) +{ + T* d_var; + cudaMalloc(&d_var, len * sizeof(T)); + cudaMemset(d_var, filling_val, len * sizeof(T)); + return d_var; +} + +template +inline T* create_devspace_memcpy_h2d(T* var, size_t l) +{ + T* d_var; + cudaMalloc(&d_var, l * sizeof(T)); + cudaMemcpy(d_var, var, l * sizeof(T), cudaMemcpyHostToDevice); + return d_var; +} +template +inline T* create_devspace_memcpy_d2h(T* d_var, size_t l) +{ + // auto var = new T[l]; + T* var; + cudaMallocHost(&var, l * sizeof(T)); + cudaMemcpy(var, d_var, l * sizeof(T), cudaMemcpyDeviceToHost); + return var; +} + +} // namespace mem + +#endif diff --git a/qtensor/compression/cusz/include/utils/cusparse_err.cuh b/qtensor/compression/cusz/include/utils/cusparse_err.cuh new file mode 100644 index 00000000..e2f77bb6 --- /dev/null +++ b/qtensor/compression/cusz/include/utils/cusparse_err.cuh @@ -0,0 +1,60 @@ +#ifndef UTILS_CUSPARSE_ERR_CUH +#define UTILS_CUSPARSE_ERR_CUH + +/** + * @file cuda_err.cuh + * @author Jiannan Tian + * @brief CUDA runtime error handling macros. 
+ * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-10-08 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include + +// block cusparse for generic testing +#ifndef noCUSPARSE + +static void check_cusparse_error(cusparseStatus_t status, const char* file, int line) +{ + if (CUSPARSE_STATUS_SUCCESS != status) { + printf("\nCUSPARSE status reference (as of CUDA 11):\n"); + printf("CUSPARSE_STATUS_SUCCESS -> %d\n", CUSPARSE_STATUS_SUCCESS); + printf("CUSPARSE_STATUS_NOT_INITIALIZED -> %d\n", CUSPARSE_STATUS_NOT_INITIALIZED); + printf("CUSPARSE_STATUS_ALLOC_FAILED -> %d\n", CUSPARSE_STATUS_ALLOC_FAILED); + printf("CUSPARSE_STATUS_INVALID_VALUE -> %d\n", CUSPARSE_STATUS_INVALID_VALUE); + printf("CUSPARSE_STATUS_ARCH_MISMATCH -> %d\n", CUSPARSE_STATUS_ARCH_MISMATCH); + printf("CUSPARSE_STATUS_EXECUTION_FAILED -> %d\n", CUSPARSE_STATUS_EXECUTION_FAILED); + printf("CUSPARSE_STATUS_INTERNAL_ERROR -> %d\n", CUSPARSE_STATUS_INTERNAL_ERROR); + printf("CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); +#if (CUDART_VERSION == 1010) + printf("CUSPARSE_STATUS_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_NOT_SUPPORTED); +#endif +#if (CUDART_VERSION == 1100) + printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#endif +#if (CUDART_VERSION == 1100) + printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#endif + printf("\n"); + +#if (CUDART_VERSION >= 1010) + printf( + "CUSPARSE API failed at \e[31m\e[1m%s:%d\e[0m with error: %s (%d)\n", file, line, + cusparseGetErrorString(status), status); +#endif + exit(EXIT_FAILURE); + } +} + +#define CHECK_CUSPARSE(err) (check_cusparse_error(err, __FILE__, __LINE__)) + +#endif + +#endif diff --git a/qtensor/compression/cusz/include/utils/format.hh b/qtensor/compression/cusz/include/utils/format.hh new file mode 100644 index 00000000..ae1d6079 --- /dev/null +++ b/qtensor/compression/cusz/include/utils/format.hh @@ -0,0 +1,57 @@ +#ifndef UTILS_FORMAT_HH +#define UTILS_FORMAT_HH + +/** + * @file format.hh + * @author Jiannan Tian + * @brief Formatting for log print (header). + * @version 0.2 + * @date 2020-09-20 + * Created on 2020-04-27 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include + + +const std::string LOG_NULL = " "; +const std::string LOG_INFO = " :: "; +const std::string LOG_ERR = " ERR "; +const std::string LOG_WARN = "WARN "; +const std::string LOG_DBG = " dbg "; +const std::string LOG_EXCEPTION = " !! "; + +// https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 +template +void build(std::ostream& o, T t) +{ + o << t << " "; +} + +template +void build(std::ostream& o, T t, Args... args) // recursive variadic function +{ + build(o, t); + build(o, args...); +} + +template +void LOGGING(const std::string& log_head, Args... 
args) +{ + std::ostringstream oss; + oss << log_head; + build(oss, args...); + + oss.seekp(0, std::ios::end); + std::stringstream::pos_type offset = oss.tellp(); + if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg + std::cout << oss.str() << std::endl; // print content + if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg +} + +#endif // FORMAT_HH diff --git a/qtensor/compression/cusz/include/utils/io.hh b/qtensor/compression/cusz/include/utils/io.hh new file mode 100644 index 00000000..574432ef --- /dev/null +++ b/qtensor/compression/cusz/include/utils/io.hh @@ -0,0 +1,59 @@ +#ifndef UTILS_IO_HH +#define UTILS_IO_HH + +/** + * @file io.hh + * @author Jiannan Tian + * @brief Read and write binary. + * @version 0.2 + * @date 2020-09-20 + * Created on 2019-08-27 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include + +namespace io { + +template +T* read_binary_to_new_array(const std::string& fname, size_t dtype_len) +{ + std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + exit(1); + } + auto _a = new T[dtype_len](); + ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ifs.close(); + return _a; +} + +template +void read_binary_to_array(const std::string& fname, T* _a, size_t dtype_len) +{ + std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + exit(1); + } + ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ifs.close(); +} + +template +void write_array_to_binary(const std::string& fname, T* const _a, size_t const dtype_len) +{ + std::ofstream ofs(fname.c_str(), std::ios::binary | std::ios::out); + if (not ofs.is_open()) return; + ofs.write(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ofs.close(); +} + +} // namespace io + +#endif // IO_HH diff --git a/qtensor/compression/cusz/include/utils/print_gpu.h b/qtensor/compression/cusz/include/utils/print_gpu.h new file mode 100644 index 00000000..d4cded5e --- /dev/null +++ b/qtensor/compression/cusz/include/utils/print_gpu.h @@ -0,0 +1,45 @@ +/** + * @file print.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-28 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E02AE628_9C8A_4100_8C73_A3B74B7128F6 +#define E02AE628_9C8A_4100_8C73_A3B74B7128F6 + +#ifdef __cplusplus +extern "C" { +#endif + +#define PRINT_INT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); + +PRINT_INT_LESS_THAN_64(i8, int8_t) +PRINT_INT_LESS_THAN_64(i16, int16_t) +PRINT_INT_LESS_THAN_64(i32, int32_t) + +void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset); + +#define PRINT_UINT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); + +PRINT_UINT_LESS_THAN_64(ui8, uint8_t) +PRINT_UINT_LESS_THAN_64(ui16, uint16_t) +PRINT_UINT_LESS_THAN_64(ui32, uint32_t) + +void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset); + +void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset); +void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset); + +#undef PRINT_INT_LESS_THAN_64 +#undef PRINT_UINT_LESS_THAN_64 + +#ifdef __cplusplus +} +#endif + +#endif /* 
E02AE628_9C8A_4100_8C73_A3B74B7128F6 */ diff --git a/qtensor/compression/cusz/include/utils/print_gpu.hh b/qtensor/compression/cusz/include/utils/print_gpu.hh new file mode 100644 index 00000000..c3236f62 --- /dev/null +++ b/qtensor/compression/cusz/include/utils/print_gpu.hh @@ -0,0 +1,21 @@ +/** + * @file print_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "print_gpu.h" + +namespace psz { + +template +void peek_device_data(T* d_arr, size_t num, size_t offset = 0); + +} // namespace psz + +#undef PEEK_DEVICE_DATA diff --git a/qtensor/compression/cusz/include/utils/strhelper.hh b/qtensor/compression/cusz/include/utils/strhelper.hh new file mode 100644 index 00000000..a95dc96f --- /dev/null +++ b/qtensor/compression/cusz/include/utils/strhelper.hh @@ -0,0 +1,144 @@ +/** + * @file strhelper.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-19 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_UTILS_STRHELPER_HH +#define CUSZ_UTILS_STRHELPER_HH + +#include +#include +#include +#include +#include +#include +#include "format.hh" + +using std::cerr; +using std::endl; + +using ss_t = std::stringstream; +using map_t = std::unordered_map; +using str_list = std::vector; + +struct StrHelper { + static unsigned int str2int(const char* s) + { + char* end; + auto res = std::strtol(s, &end, 10); + if (*end) { + const char* notif = "invalid option value, non-convertible part: "; + cerr << LOG_ERR << notif << "\e[1m" << s << "\e[0m" << endl; + } + return res; + } + + static unsigned int str2int(std::string s) { return str2int(s.c_str()); } + + static double str2fp(const char* s) + { + char* end; + auto res = std::strtod(s, &end); + if (*end) { + const char* notif = "invalid option value, non-convertible part: "; + cerr << LOG_ERR << notif << "\e[1m" << end << "\e[0m" << endl; + } + return res; + } + + static double str2fp(std::string s) { return str2fp(s.c_str()); } + + static bool is_kv_pair(std::string s) { return s.find("=") != std::string::npos; } + + static std::pair separate_kv(std::string& s) + { + std::string delimiter = "="; + + if (s.find(delimiter) == std::string::npos) + throw std::runtime_error("\e[1mnot a correct key-value syntax, must be \"opt=value\"\e[0m"); + + std::string k = s.substr(0, s.find(delimiter)); + std::string v = s.substr(s.find(delimiter) + delimiter.length(), std::string::npos); + + return std::make_pair(k, v); + } + + static void parse_strlist_as_kv(const char* in_str, map_t& kv_list) + { + ss_t ss(in_str); + while (ss.good()) { + std::string tmp; + std::getline(ss, tmp, ','); + kv_list.insert(separate_kv(tmp)); + } + } + + static void parse_strlist(const char* in_str, str_list& list) + { + ss_t ss(in_str); + while (ss.good()) { + std::string tmp; + std::getline(ss, tmp, ','); + list.push_back(tmp); + } + } + + static std::pair parse_kv_onoff(std::string in_str) + { + auto kv_literal = "(.*?)=(on|ON|off|OFF)"; + std::regex kv_pattern(kv_literal); + std::regex onoff_pattern("on|ON|off|OFF"); + + bool onoff = false; + std::string k, v; + + std::smatch kv_match; + if (std::regex_match(in_str, kv_match, kv_pattern)) { + // the 1st match: whole string + // the 2nd: k, the 3rd: v + if (kv_match.size() == 3) { + k = kv_match[1].str(), v = kv_match[2].str(); + + std::smatch v_match; + if (std::regex_match(v, v_match, onoff_pattern)) { // + onoff = (v == "on") or (v == "ON"); + } + else 
{ + throw std::runtime_error("not legal (k=v)-syntax"); + } + } + } + return std::make_pair(k, onoff); + } + + static std::string doc_format(const std::string& s) + { + std::regex gray("%(.*?)%"); + std::string gray_text("\e[37m$1\e[0m"); + + std::regex bful("@(.*?)@"); + std::string bful_text("\e[1m\e[4m$1\e[0m"); + std::regex bf("\\*(.*?)\\*"); + std::string bf_text("\e[1m$1\e[0m"); + std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); + std::string ul_text("\e[4m$1\e[0m"); + std::regex red(R"(\^\^(.*?)\^\^)"); + std::string red_text("\e[31m$1\e[0m"); + + auto a = std::regex_replace(s, bful, bful_text); + auto b = std::regex_replace(a, bf, bf_text); + auto c = std::regex_replace(b, ul, ul_text); + auto d = std::regex_replace(c, red, red_text); + auto e = std::regex_replace(d, gray, gray_text); + + return e; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/utils/timer.h b/qtensor/compression/cusz/include/utils/timer.h new file mode 100644 index 00000000..41efb730 --- /dev/null +++ b/qtensor/compression/cusz/include/utils/timer.h @@ -0,0 +1,92 @@ +/** + * @file timer.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 +#define B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cusz/type.h" + +struct asz_timer; +typedef struct asz_timer asz_timer; +typedef struct asz_timer asz_cputimer; + +struct asz_cudatimer; +typedef struct asz_cudatimer asz_cudatimer; + +// top-level/dispatcher +// asz_timer* asz_timer_create(asz_policy const p, void* stream); +// void asz_timer_destroy(asz_timer* t); +// void asz_timer_start(asz_timer* t); +// void asz_timer_end(asz_timer* t); +// double asz_time_elapsed(asz_timer* t); + +asz_timer* asz_cputimer_create(); +void asz_cputimer_destroy(asz_timer* t); +void asz_cputimer_start(asz_timer* t); +void asz_cputimer_end(asz_timer* t); +double asz_cputime_elapsed(asz_timer* t); + +// 22-11-01 adding wrapper incurs unexpeted overhead in timing +asz_cudatimer* asz_cudatimer_create(); +void asz_cudatimer_destroy(asz_cudatimer* t); +void asz_cudatimer_start(asz_cudatimer* t); +void asz_cudatimer_end(asz_cudatimer* t); +double asz_cudatime_elapsed(asz_cudatimer* t); + +asz_cudatimer* asz_cudastreamtimer_create(void* stream); +void asz_cudastreamtimer_destroy(asz_cudatimer* t); +void asz_cudastreamtimer_start(asz_cudatimer* t); +void asz_cudastreamtimer_end(asz_cudatimer* t); +double asz_cudastreamtime_elapsed(asz_cudatimer* t); + +// 22-11-01 CUDA timing snippet instead +#define CREATE_CUDAEVENT_PAIR \ + cudaEvent_t a, b; \ + cudaEventCreate(&a); \ + cudaEventCreate(&b); + +#define DESTROY_CUDAEVENT_PAIR \ + cudaEventDestroy(a); \ + cudaEventDestroy(b); + +#define START_CUDAEVENT_RECORDING(STREAM) cudaEventRecord(a, STREAM); +#define STOP_CUDAEVENT_RECORDING(STREAM) \ + cudaEventRecord(b, STREAM); \ + cudaEventSynchronize(b); + +#define TIME_ELAPSED_CUDAEVENT(PTR_MILLISEC) cudaEventElapsedTime(PTR_MILLISEC, a, b); + +// 22-11-01 HIP timing snippet instead +#define CREATE_HIPEVENT_PAIR \ + hipEvent_t a, b; \ + hipEventCreate(&a); \ + hipEventCreate(&b); + +#define DESTROY_HIPEVENT_PAIR \ + hipEventDestroy(a); \ + hipEventDestroy(b); + +#define START_HIPEVENT_RECORDING(STREAM) hipEventRecord(a, STREAM); +#define STOP_HIPEVENT_RECORDING(STREAM) \ + hipEventRecord(b, STREAM); \ + hipEventSynchronize(b); + +#define TIME_ELAPSED_HIPEVENT(PTR_MILLISEC) 
hipEventElapsedTime(PTR_MILLISEC, a, b); + +#ifdef __cplusplus +} +#endif + +#endif /* B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 */ diff --git a/qtensor/compression/cusz/include/utils/timer.hh b/qtensor/compression/cusz/include/utils/timer.hh new file mode 100644 index 00000000..c820d451 --- /dev/null +++ b/qtensor/compression/cusz/include/utils/timer.hh @@ -0,0 +1,153 @@ +/** + * @file timer.hh + * @author Jiannan Tian + * @brief High-resolution timer wrapper from and util functions for timing both CPU and CUDA function + * @version 0.2 + * @date 2021-01-05 + * (created) 2019-08-26 (rev) 2021-12-23 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef UTILS_TIMER_HH +#define UTILS_TIMER_HH + +#include +#include + +using hires = std::chrono::high_resolution_clock; +using duration_t = std::chrono::duration; +using hires_clock_t = std::chrono::time_point; + +typedef struct Timer { + hires_clock_t start, end; + + void timer_start() { start = hires::now(); } + void timer_end() { end = hires::now(); } + double get_time_elapsed() { return static_cast(end - start).count(); } + +} host_timer_t; + +#ifdef __CUDACC__ + +/** + * @brief CUDA event based timer. Synopsis: + * cuda_timer_t t; + * t.timer_start(); + * kernel<<>>(...); + * t.timer_end(); + * cudaStreamSynchronize(stream); + * auto ms = t.get_time_elapsed(); + * + */ +typedef struct CUDATimer { + cudaEvent_t start, stop; + float milliseconds; + + // stream not involved + void timer_start() + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + } + + void timer_end() + { + cudaEventRecord(stop); + cudaEventSynchronize(stop); + } + + // stream involved + void timer_start(cudaStream_t stream) + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start, stream); // set event as not occurred + } + + void timer_end(cudaStream_t stream) + { + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); // block host until `stream` meets `stop` + } + + // get time + float get_time_elapsed() + { + cudaEventElapsedTime(&milliseconds, start, stop); + return milliseconds; + } + +} cuda_timer_t; + +#endif + +// TODO handle return; testing +/** + * @brief A timer wrapper for arbitrary function (no handling return for now); + * Adapted from https://stackoverflow.com/a/33900479/8740097 (CC BY-SA 3.0) + * + * @tparam F auto function type + * @tparam Args variadic function argument type + * @param func non-return function to be timed + * @param args variadic function arguments + * @return double time in seconds + */ +template +double TimeThisRoutine(F func, Args&&... args) +{ + auto t0 = hires::now(); + func(std::forward(args)...); + return static_cast(hires::now() - t0).count(); +} + +#ifdef __CUDACC__ +typedef struct CUDAKernelConfig { + dim3 dim_grid; + dim3 dim_block; + size_t shmem_nbyte{0}; + cudaStream_t stream; + +} kernelcfg; + +// TODO use cudaEvent +/** + * @brief A timer wrapper for arbitrary CUDA function + * + * @tparam F auto function type + * @tparam Args variadic function argument type + * @param func CUDA kernel function to be time + * @param cfg CUDA kernel config + * @param args variadic function arguments + * @return double time in seconds + */ +template +float TimeThisCUDARoutine(F func, kernelcfg cfg, Args&&... args) +{ + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + func<<>>( // + args... 
+ // std::forward(args)... // also works + ); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaStreamSynchronize(cfg.stream); + + float milliseconds; + cudaEventElapsedTime(&milliseconds, start, stop); + + return milliseconds; +} + +#endif + +#endif // UTILS_TIMER_HH diff --git a/qtensor/compression/cusz/src/cli/cli.cu b/qtensor/compression/cusz/src/cli/cli.cu new file mode 100644 index 00000000..64084cba --- /dev/null +++ b/qtensor/compression/cusz/src/cli/cli.cu @@ -0,0 +1,14 @@ +/** + * @file cli.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-07 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "cli.cuh" + +template class cusz::CLI; diff --git a/qtensor/compression/cusz/src/cli/cli.cuh b/qtensor/compression/cusz/src/cli/cli.cuh new file mode 100644 index 00000000..14a9103d --- /dev/null +++ b/qtensor/compression/cusz/src/cli/cli.cuh @@ -0,0 +1,195 @@ +/** + * @file cli.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-02-20 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CLI_CUH +#define CLI_CUH + +#include +#include + +#include "cli/analyzer.hh" +#include "cli/dryrun_part.cuh" +#include "cli/query.hh" +#include "cli/timerecord_viewer.hh" +#include "cusz.h" +#include "framework.hh" + +namespace cusz { + +template +class CLI { + private: + using Header = cuszHEADER; + using T = Data; + + const static auto HOST = cusz::LOC::HOST; + const static auto DEVICE = cusz::LOC::DEVICE; + const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; + + using context_t = cuszCTX*; + using header_t = cuszHEADER*; + + public: + CLI() = default; + + template + static void cli_dryrun(context_t ctx, bool dualquant = true) + { + BaseCompressor analysis; + + uint3 xyz{ctx->x, ctx->y, ctx->z}; + cudaStream_t stream; + cudaStreamCreate(&stream); + + if (not dualquant) { + analysis.init_dualquant_dryrun(xyz); + analysis.dualquant_dryrun(ctx->fname.fname, ctx->eb, ctx->mode == "r2r", stream); + analysis.destroy_dualquant_dryrun(); + } + else { + analysis.init_generic_dryrun(xyz); + analysis.generic_dryrun(ctx->fname.fname, ctx->eb, 512, ctx->mode == "r2r", stream); + analysis.destroy_generic_dryrun(); + } + cudaStreamDestroy(stream); + } + + private: + void write_compressed_to_disk(std::string compressed_name, BYTE* compressed, size_t compressed_len) + { + Capsule file("cusza"); + file.set_len(compressed_len) + .set_dptr(compressed) + .mallochost() + .device2host() + .tofile(compressed_name) + .freehost() + .free(); + } + + void try_write_decompressed_to_disk(Capsule& xdata, std::string basename, bool skip_write) + { + if (not skip_write) xdata.device2host().tofile(basename + ".cuszx"); + } + + // template + void cli_construct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) + { + Capsule input("uncompressed"); + BYTE* compressed; + size_t compressed_len; + Header header; + auto len = ctx->get_len(); + auto basename = ctx->fname.fname; + + auto load_uncompressed = [&](std::string fname) { + input + .set_len(len) // + .mallochost() + .malloc() + .fromfile(fname) + .host2device(); + }; + + auto adjust_eb = [&]() { + if (ctx->mode == "r2r") ctx->eb *= input.prescan().get_rng(); + }; + + /******************************************************************************/ + + load_uncompressed(basename); + adjust_eb(); + + TimeRecord timerecord; + + cusz_config* config = new cusz_config{.eb = ctx->eb, .mode = Rel}; + cusz_len uncomp_len = 
cusz_len{ctx->x, ctx->y, ctx->z, 1}; + + cusz_compress( + compressor, config, input.dptr(), uncomp_len, &compressed, &compressed_len, &header, (void*)&timerecord, + stream); + + if (ctx->report.time) TimeRecordViewer::view_compression(&timerecord, input.nbyte(), compressed_len); + write_compressed_to_disk(basename + ".cusza", compressed, compressed_len); + } + + // template + void cli_reconstruct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) + { + Capsule compressed("compressed"); + Capsule decompressed("decompressed"), original("cmp"); + auto header = new Header; + auto basename = (*ctx).fname.fname; + + auto load_compressed = [&](std::string compressed_name) { + auto compressed_len = ConfigHelper::get_filesize(compressed_name); + compressed + .set_len(compressed_len) // + .mallochost() + .malloc() + .fromfile(compressed_name) + .host2device(); + }; + + /******************************************************************************/ + + load_compressed(basename + ".cusza"); + memcpy(header, compressed.hptr(), sizeof(Header)); + auto len = ConfigHelper::get_uncompressed_len(header); + + decompressed // + .set_len(len) + .mallochost() + .malloc(); + original.set_len(len); + + TimeRecord timerecord; + + cusz_len decomp_len = cusz_len{header->x, header->y, header->z, 1}; + + cusz_decompress( + compressor, header, compressed.dptr(), ConfigHelper::get_filesize(header), decompressed.dptr(), decomp_len, + (void*)&timerecord, stream); + + if (ctx->report.time) TimeRecordViewer::view_decompression(&timerecord, decompressed.nbyte()); + QualityViewer::view(header, decompressed, original, (*ctx).fname.origin_cmp); + try_write_decompressed_to_disk(decompressed, basename, (*ctx).skip.write2disk); + + decompressed.freehost().free(); + } + + public: + // TODO determine dtype & predictor in here + void dispatch(context_t ctx) + { + // TODO disable predictor selection; to specify in another way + // auto predictor = (*ctx).predictor; + + cusz_framework* framework = cusz_default_framework(); + cusz_compressor* compressor = cusz_create(framework, FP32); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + // TODO hardcoded predictor type + if ((*ctx).cli_task.dryrun) cli_dryrun::Predictor>(ctx); + + if ((*ctx).cli_task.construct) cli_construct(ctx, compressor, stream); + + if ((*ctx).cli_task.reconstruct) cli_reconstruct(ctx, compressor, stream); + + if (stream) cudaStreamDestroy(stream); + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/src/cli/dryrun_part.cu b/qtensor/compression/cusz/src/cli/dryrun_part.cu new file mode 100644 index 00000000..c3a8a1c4 --- /dev/null +++ b/qtensor/compression/cusz/src/cli/dryrun_part.cu @@ -0,0 +1,17 @@ +/** + * @file base_compressor.cu + * @author Jiannan Tian + * @brief Predictor-only Base Compressor; can also be used for dryrun. + * @version 0.3 + * @date 2021-10-05 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include "dryrun_part.cuh" + +template class cusz::BaseCompressor::type, + ErrCtrlTrait<2>::type, + FastLowPrecisionTrait::type>>; diff --git a/qtensor/compression/cusz/src/cli/dryrun_part.cuh b/qtensor/compression/cusz/src/cli/dryrun_part.cuh new file mode 100644 index 00000000..0013e790 --- /dev/null +++ b/qtensor/compression/cusz/src/cli/dryrun_part.cuh @@ -0,0 +1,196 @@ +/** + * @file base_compressor.cuh + * @author Jiannan Tian + * @brief Predictor-only Base Compressor; can also be used for dryrun. 
+ * @version 0.3 + * @date 2021-10-05 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef BASE_COMPRESSOR_CUH +#define BASE_COMPRESSOR_CUH + +#include "cli/analyzer.hh" +#include "cli/quality_viewer.hh" +#include "cli/verify.hh" +#include "common.hh" +#include "component.hh" +#include "context.hh" +#include "kernel/dryrun.cuh" +#include "stat/compare_gpu.hh" +#include "utils.hh" + +/** + * @brief bare metal, can run predictor to check data quality and compressibility + * + * @tparam T for data type + * @tparam E for error control type + */ + +namespace cusz { + +template +class BaseCompressor { + public: + using BYTE = uint8_t; + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + + private: + struct NonCritical { + Predictor* p; + Capsule original; + Capsule errctrl; // TODO change to 4-byte + Capsule outlier; + Capsule anchor; + Capsule reconst; + + NonCritical(dim3 size) { p = new Predictor; } + }; + + struct NonCritical* nc; + + protected: + cuszCTX* ctx; + + int dict_size; + double eb; + + dim3 xyz; + + public: + /** + * @brief Generic dryrun; performing predictor.construct() and .reconstruct() + * + * @param fname filename + * @param eb (host variable) error bound; future: absolute error bound only + * @param radius (host variable) limiting radius + * @param r2r if relative-to-value-range + * @param stream CUDA stream + * @return BaseCompressor& this object instance + */ + BaseCompressor& generic_dryrun(const std::string fname, double eb, int radius, bool r2r, cudaStream_t stream) + { + if (not nc) throw std::runtime_error("NonCritical struct has no instance."); + + // LOGGING(LOG_INFO, "invoke dry-run"); + + nc->original.fromfile(fname).host2device_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + if (r2r) { + double max, min, rng; + nc->original.prescan(max, min, rng); + eb *= rng; + } + + auto xyz = dim3(ctx->x, ctx->y, ctx->z); + + // nc->p->construct( + // LorenzoI, xyz, nc->original.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->outlier.dptr, eb, radius, + // stream); + // nc->p->reconstruct( + // LorenzoI, xyz, nc->outlier.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->reconst.dptr, eb, radius, + // stream); + + nc->reconst.device2host_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + cusz_stats stat; + psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), nc->original.hptr(), nc->p->get_len_data()); + cusz::QualityViewer::print_metrics_cross(&stat, 0, true); + + return *this; + } + + /** + * @brief Dual-quant dryrun; performing integerization & its reverse procedure + * + * @param eb (host variable) error bound; future: absolute error bound only + * @param r2r if relative-to-value-range + * @param stream CUDA stream + * @return BaseCompressor& this object instance + */ + BaseCompressor& dualquant_dryrun(const std::string fname, double eb, bool r2r, cudaStream_t stream) + { + auto len = nc->original.len(); + + nc->original.fromfile(fname).host2device_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + if (r2r) { + double max, min, rng; + nc->original.prescan(max, min, rng); + eb *= rng; + } + + auto ebx2_r = 1 / (eb * 2); + auto ebx2 = eb * 2; + + cusz::dualquant_dryrun_kernel // + <<>> // + (nc->original.dptr(), nc->reconst.dptr(), len, ebx2_r, ebx2); + + nc->reconst.device2host_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + cusz_stats stat; + psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), 
nc->original.hptr(), len); + cusz::QualityViewer::print_metrics_cross(&stat, 0, true); + + return *this; + } + + public: + BaseCompressor() = default; + + ~BaseCompressor() {} + + public: + // dry run + void init_generic_dryrun(dim3 size) + { // + auto len = size.x * size.y * size.z; + nc = new struct NonCritical(size); + + nc->original.set_len(len).mallochost().malloc(); + nc->outlier.set_len(len).mallochost().malloc(); + nc->errctrl.set_len(len).mallochost().malloc(); + nc->anchor.set_len(nc->p->get_len_anchor()).mallochost().malloc(); + nc->reconst.set_len(len).mallochost().malloc(); + } + + void destroy_generic_dryrun() + { + delete nc->p; + nc->original.freehost().free(); + nc->outlier.freehost().free(); + nc->errctrl.freehost().free(); + nc->anchor.freehost().free(); + nc->reconst.freehost().free(); + delete nc; + } + + void init_dualquant_dryrun(dim3 size) + { + auto len = size.x * size.y * size.z; + nc = new struct NonCritical(size); + nc->original.set_len(len).mallochost().malloc(); + nc->reconst.set_len(len).mallochost().malloc(); + } + + void destroy_dualquant_dryrun() + { + nc->original.freehost().free(); + nc->reconst.freehost().free(); + + delete nc; + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/src/cli_bin.cu b/qtensor/compression/cusz/src/cli_bin.cu new file mode 100644 index 00000000..c59c00f9 --- /dev/null +++ b/qtensor/compression/cusz/src/cli_bin.cu @@ -0,0 +1,27 @@ +/** + * @file cusz-cli.cu + * @author Jiannan Tian + * @brief Driver program of cuSZ. + * @version 0.1 + * @date 2020-09-20 + * (created) 2019-12-30 (rev) 2022-02-20 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "cli/cli.cuh" + +int main(int argc, char** argv) +{ + auto ctx = new cuszCTX(argc, argv); + + if (ctx->verbose) { + Diagnostics::GetMachineProperties(); + GpuDiagnostics::GetDeviceProperty(); + } + + cusz::CLI cusz_cli; + cusz_cli.dispatch(ctx); +} diff --git a/qtensor/compression/cusz/src/compressor.cc b/qtensor/compression/cusz/src/compressor.cc new file mode 100644 index 00000000..7482293b --- /dev/null +++ b/qtensor/compression/cusz/src/compressor.cc @@ -0,0 +1,149 @@ +/** + * @file compressor.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "compressor.hh" +#include "common/configs.hh" +#include "framework.hh" + +namespace cusz { + +template +Compressor::~Compressor() +{ + pimpl.reset(); +} + +template +Compressor::Compressor() : pimpl{std::make_unique()} +{ +} + +template +Compressor::Compressor(const Compressor& old) : pimpl{std::make_unique(*old.pimpl)} +{ +} + +template +Compressor& Compressor::operator=(const Compressor& old) +{ + *pimpl = *old.pimpl; + return *this; +} + +template +Compressor::Compressor(Compressor&&) = default; + +template +Compressor& Compressor::operator=(Compressor&&) = default; + +//------------------------------------------------------------------------------ + +template +void Compressor::init(Context* config, bool dbg_print) +{ + pimpl->init(config, dbg_print); +} + +template +void Compressor::init(Header* config, bool dbg_print) +{ + pimpl->init(config, dbg_print); +} + +template +void Compressor::compress( + Context* config, + Compressor::T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->compress(config, 
uncompressed, compressed, compressed_len, stream, dbg_print); +} + +template +void Compressor::decompress( + Header* config, + BYTE* compressed, + Compressor::T* decompressed, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->decompress(config, compressed, decompressed, stream, dbg_print); +} + +template +void Compressor::clear_buffer() +{ + pimpl->clear_buffer(); +} + +// getter + +template +void Compressor::export_header(Header& header) +{ + pimpl->export_header(header); +} + +template +void Compressor::export_header(Header* header) +{ + pimpl->export_header(header); +} + +template +void Compressor::export_timerecord(TimeRecord* ext_timerecord) +{ + pimpl->export_timerecord(ext_timerecord); +} + +} // namespace cusz + +// extra helper +namespace cusz { + +int CompressorHelper::autotune_coarse_parvle(Context* ctx) +{ + auto tune_coarse_huffman_sublen = [](size_t len) { + int current_dev = 0; + cudaSetDevice(current_dev); + cudaDeviceProp dev_prop{}; + cudaGetDeviceProperties(&dev_prop, current_dev); + + auto nSM = dev_prop.multiProcessorCount; + auto allowed_block_dim = dev_prop.maxThreadsPerBlock; + auto deflate_nthread = allowed_block_dim * nSM / HuffmanHelper::DEFLATE_CONSTANT; + auto optimal_sublen = ConfigHelper::get_npart(len, deflate_nthread); + optimal_sublen = ConfigHelper::get_npart(optimal_sublen, HuffmanHelper::BLOCK_DIM_DEFLATE) * + HuffmanHelper::BLOCK_DIM_DEFLATE; + + return optimal_sublen; + }; + + auto get_coarse_pardeg = [&](size_t len, int& sublen, int& pardeg) { + sublen = tune_coarse_huffman_sublen(len); + pardeg = ConfigHelper::get_npart(len, sublen); + }; + + // TODO should be move to somewhere else, e.g., cusz::par_optmizer + if (ctx->use.autotune_vle_pardeg) + get_coarse_pardeg(ctx->data_len, ctx->vle_sublen, ctx->vle_pardeg); + else + ctx->vle_pardeg = ConfigHelper::get_npart(ctx->data_len, ctx->vle_sublen); + + return ctx->vle_pardeg; +} + +} // namespace cusz + +template class cusz::Compressor>; diff --git a/qtensor/compression/cusz/src/context.cc b/qtensor/compression/cusz/src/context.cc new file mode 100644 index 00000000..3356323b --- /dev/null +++ b/qtensor/compression/cusz/src/context.cc @@ -0,0 +1,493 @@ +/** + * @file argparse.cc + * @author Jiannan Tian + * @brief Argument parser. 
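+ * @details Parses command-line options and comma-separated `key=value` config strings into a
+ *          cuszCTX instance used by the compressor.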
+ * @version 0.1 + * @date 2020-09-20 + * Created on: 20-04-24 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include + +#include "cli/document.hh" +#include "context.hh" + +namespace cusz { +const char* VERSION_TEXT = "2023-01-23 (unstable; pre-0.4)"; +const int VERSION = 20230123; +const int COMPATIBILITY = 0; +} // namespace cusz + +namespace { + +void set_preprocess(cusz::context_t ctx, const char* in_str) +{ + str_list opts; + StrHelper::parse_strlist(in_str, opts); + + for (auto k : opts) { + // TODO + } +} + +void set_report(cusz::context_t ctx, const char* in_str) +{ + str_list opts; + StrHelper::parse_strlist(in_str, opts); + + for (auto o : opts) { + if (StrHelper::is_kv_pair(o)) { + auto kv = StrHelper::parse_kv_onoff(o); + + if (kv.first == "cr") + ctx->report.cr = kv.second; + else if (kv.first == "compressibility") + ctx->report.compressibility = kv.second; + else if (kv.first == "time") + ctx->report.time = kv.second; + } + else { + if (o == "cr") + ctx->report.cr = true; + else if (o == "compressibility") + ctx->report.compressibility = true; + else if (o == "time") + ctx->report.time = true; + } + } +} + +void set_config(cusz::context_t ctx, const char* in_str, bool dbg_print = false) +{ + map_t opts; + StrHelper::parse_strlist_as_kv(in_str, opts); + + if (dbg_print) { + for (auto kv : opts) printf("%-*s %-s\n", 10, kv.first.c_str(), kv.second.c_str()); + std::cout << "\n"; + } + + std::string k, v; + char* end; + + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(k, vs); }; + auto is_enabled = [&](auto& v) -> bool { return v == "on" or v == "ON"; }; + + for (auto kv : opts) { + k = kv.first; + v = kv.second; + + if (optmatch({"type", "dtype"})) { + ConfigHelper::check_dtype(v, false); + ctx->dtype = v; + } + else if (optmatch({"eb", "errorbound"})) { + ctx->eb = StrHelper::str2fp(v); + } + else if (optmatch({"mode"})) { + ConfigHelper::check_cuszmode(v, true); + ctx->mode = v; + } + else if (optmatch({"len", "length"})) { + cuszCTX::parse_input_length(v.c_str(), ctx); + } + else if (optmatch({"alloclen"})) { + ctx->alloclen.len = StrHelper::str2int(v); + } + else if (optmatch({"demo"})) { + ctx->use.predefined_demo = true; + ctx->demo_dataset = std::string(v); + ctx->load_demo_sizes(); + } + else if (optmatch({"cap", "booklen", "dictsize"})) { + ctx->dict_size = StrHelper::str2int(v); + ctx->radius = ctx->dict_size / 2; + } + else if (optmatch({"radius"})) { + ctx->radius = StrHelper::str2int(v); + ctx->dict_size = ctx->radius * 2; + } + else if (optmatch({"huffbyte"})) { + ctx->huff_bytewidth = StrHelper::str2int(v); + ctx->codecs_in_use = ctx->codec_force_fallback() ? 
0b11 /*use both*/ : 0b01 /*use 4-byte*/; + } + else if (optmatch({"huffchunk"})) { + ctx->vle_sublen = StrHelper::str2int(v); + ctx->use.autotune_vle_pardeg = false; + } + else if (optmatch({"predictor"})) { + ctx->predictor = std::string(v); + } + else if (optmatch({"codec"})) { + // placeholder + } + else if (optmatch({"spcodec"})) { + // placeholder + } + else if (optmatch({"anchor"}) and is_enabled(v)) { + ctx->use.anchor = true; + } + else if (optmatch({"nondestructive"}) and is_enabled(v)) { + // placeholder + } + else if (optmatch({"failfast"}) and is_enabled(v)) { + // placeholder + } + else if (optmatch({"releaseinput"}) and is_enabled(v)) { + ctx->use.release_input = true; + } + else if (optmatch({"pipeline"})) { + ctx->pipeline = v; + } + else if (optmatch({"density"})) { // refer to `SparseMethodSetup` in `config.hh` + ctx->nz_density = StrHelper::str2fp(v); + ctx->nz_density_factor = 1 / ctx->nz_density; + } + else if (optmatch({"densityfactor"})) { // refer to `SparseMethodSetup` in `config.hh` + ctx->nz_density_factor = StrHelper::str2fp(v); + ctx->nz_density = 1 / ctx->nz_density_factor; + } + else if (optmatch({"gpuverify"}) and is_enabled(v)) { + ctx->use.gpu_verify = true; + } + + // when to enable anchor + if (ctx->predictor == "spline3") { + // unconditionally use anchor when it is spline3 + ctx->use.anchor = true; + } + } +} + +void set_from_cli_input(cusz::context_t ctx, int const argc, char** const argv) +{ + int i = 1; + + auto check_next = [&]() { + if (i + 1 >= argc) throw std::runtime_error("out-of-range at" + std::string(argv[i])); + }; + + std::string opt; + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; + + while (i < argc) { + if (argv[i][0] == '-') { + opt = std::string(argv[i]); + + if (optmatch({"-c", "--config"})) { + check_next(); + set_config(ctx, argv[++i]); + } + else if (optmatch({"-R", "--report"})) { + check_next(); + set_report(ctx, argv[++i]); + } + else if (optmatch({"-h", "--help"})) { + cusz::Context::print_doc(true); + exit(0); + } + else if (optmatch({"-v", "--version"})) { + std::cout << ">>>> cusz build: " << cusz::VERSION_TEXT << "\n"; + exit(0); + } + else if (optmatch({"-m", "--mode"})) { + check_next(); + ctx->mode = std::string(argv[++i]); + if (ctx->mode == "r2r") ctx->preprocess.prescan = true; + } + else if (optmatch({"-e", "--eb", "--error-bound"})) { + check_next(); + char* end; + ctx->eb = std::strtod(argv[++i], &end); + } + else if (optmatch({"-p", "--predictor"})) { + check_next(); + ctx->predictor = std::string(argv[++i]); + } + else if (optmatch({"-c", "--codec"})) { + check_next(); + // placeholder + } + else if (optmatch({"-s", "--spcodec"})) { + check_next(); + // placeholder + } + else if (optmatch({"-t", "--type", "--dtype"})) { + check_next(); + std::string s = std::string(std::string(argv[++i])); + if (s == "f32" or s == "fp4") + ctx->dtype = "f32"; + else if (s == "f64" or s == "fp8") + ctx->dtype = "f64"; + } + else if (optmatch({"-i", "--input"})) { + check_next(); + ctx->fname.fname = std::string(argv[++i]); + } + else if (optmatch({"-l", "--len"})) { + check_next(); + cusz::Context::parse_input_length(argv[++i], ctx); + } + else if (optmatch({"-L", "--allocation-len"})) { + check_next(); + // placeholder + } + else if (optmatch({"-z", "--zip", "--compress"})) { + ctx->cli_task.construct = true; + } + else if (optmatch({"-x", "--unzip", "--decompress"})) { + ctx->cli_task.reconstruct = true; + } + else if (optmatch({"-r", "--dry-run"})) { + ctx->cli_task.dryrun = 
true; + } + else if (optmatch({"--anchor"})) { + ctx->use.anchor = true; + } + else if (optmatch({"--nondestructive", "--input-nondestructive"})) { + // placeholder + } + else if (optmatch({"--failfast"})) { + // placeholder + } + else if (optmatch({"-P", "--pre", "--preprocess"})) { + check_next(); + std::string pre(argv[++i]); + if (pre.find("binning") != std::string::npos) { ctx->preprocess.binning = true; } + } + else if (optmatch({"-T", "--post", "--postprocess"})) { + check_next(); + std::string post(argv[++i]); + if (post.find("gzip") != std::string::npos) { ctx->postcompress.cpu_gzip = true; } + if (post.find("nvcomp") != std::string::npos) { ctx->postcompress.gpu_nvcomp_cascade = true; } + } + else if (optmatch({"-V", "--verbose"})) { + ctx->verbose = true; + } + else if (optmatch({"--pipeline"})) { + check_next(); + ctx->pipeline = std::string(argv[++i]); + } + else if (optmatch({"--demo"})) { + check_next(); + ctx->use.predefined_demo = true; + ctx->demo_dataset = std::string(argv[++i]); + ctx->load_demo_sizes(); + } + else if (optmatch({"-S", "-X", "--skip", "--exclude"})) { + check_next(); + std::string exclude(argv[++i]); + if (exclude.find("huffman") != std::string::npos) { ctx->skip.huffman = true; } + if (exclude.find("write2disk") != std::string::npos) { ctx->skip.write2disk = true; } + } + else if (optmatch({"--opath"})) { + check_next(); + ctx->opath = std::string(argv[++i]); + } + else if (optmatch({"--origin", "--compare"})) { + check_next(); + ctx->fname.origin_cmp = std::string(argv[++i]); + } + else { + const char* notif_prefix = "invalid option value at position "; + char* notif; + int size = asprintf(¬if, "%d: %s", i, argv[i]); + cerr << LOG_ERR << notif_prefix << "\e[1m" << notif << "\e[0m" + << "\n"; + cerr << std::string(LOG_NULL.length() + strlen(notif_prefix), ' '); + cerr << "\e[1m"; + cerr << std::string(strlen(notif), '~'); + cerr << "\e[0m\n"; + + ctx->trap(-1); + } + } + else { + const char* notif_prefix = "invalid option at position "; + char* notif; + int size = asprintf(¬if, "%d: %s", i, argv[i]); + cerr << LOG_ERR << notif_prefix << "\e[1m" << notif + << "\e[0m" + "\n" + << std::string(LOG_NULL.length() + strlen(notif_prefix), ' ') // + << "\e[1m" // + << std::string(strlen(notif), '~') // + << "\e[0m\n"; + + ctx->trap(-1); + } + i++; + } +} + +} // namespace + +cuszCTX& cuszCTX::set_control_string(const char* in_str) +{ + set_config(this, in_str); + return *this; +} + +void cuszCTX::load_demo_sizes() +{ + const std::unordered_map> dataset_entries = { + {std::string("hacc"), {280953867, 1, 1, 1, 1}}, {std::string("hacc1b"), {1073726487, 1, 1, 1, 1}}, + {std::string("cesm"), {3600, 1800, 1, 1, 2}}, {std::string("hurricane"), {500, 500, 100, 1, 3}}, + {std::string("nyx-s"), {512, 512, 512, 1, 3}}, {std::string("nyx-m"), {1024, 1024, 1024, 1, 3}}, + {std::string("qmc"), {288, 69, 7935, 1, 3}}, {std::string("qmcpre"), {69, 69, 33120, 1, 3}}, + {std::string("exafel"), {388, 59200, 1, 1, 2}}, {std::string("rtm"), {235, 849, 849, 1, 3}}, + {std::string("parihaka"), {1168, 1126, 922, 1, 3}}}; + + if (not demo_dataset.empty()) { + auto f = dataset_entries.find(demo_dataset); + if (f == dataset_entries.end()) throw std::runtime_error("no such dataset as" + demo_dataset); + auto demo_xyzw = f->second; + + x = demo_xyzw[0], y = demo_xyzw[1], z = demo_xyzw[2], w = demo_xyzw[3]; + ndim = demo_xyzw[4]; + } + data_len = x * y * z * w; +} + +void cuszCTX::trap(int _status) { this->read_args_status = _status; } + +void cuszCTX::validate() +{ + bool to_abort = false; 
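+    // The checks below accumulate problems into `to_abort` so that every argument error is
+    // reported before the final exit.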
+ if (fname.fname.empty()) { + cerr << LOG_ERR << "must specify input file" << endl; + to_abort = true; + } + + if (data_len == 1 and not use.predefined_demo) { + if (cli_task.construct or cli_task.dryrun) { + cerr << LOG_ERR << "wrong input size" << endl; + to_abort = true; + } + } + if (not cli_task.construct and not cli_task.reconstruct and not cli_task.dryrun) { + cerr << LOG_ERR << "select compress (-z), decompress (-x) or dry-run (-r)" << endl; + to_abort = true; + } + if (false == ConfigHelper::check_dtype(dtype, false)) { + if (cli_task.construct or cli_task.dryrun) { + std::cout << dtype << endl; + cerr << LOG_ERR << "must specify data type" << endl; + to_abort = true; + } + } + + if (quant_bytewidth == 1) + assert(dict_size <= 256); + else if (quant_bytewidth == 2) + assert(dict_size <= 65536); + + if (cli_task.dryrun and cli_task.construct and cli_task.reconstruct) { + cerr << LOG_WARN << "no need to dry-run, compress and decompress at the same time" << endl; + cerr << LOG_WARN << "dryrun only" << endl << endl; + cli_task.construct = false; + cli_task.reconstruct = false; + } + else if (cli_task.dryrun and cli_task.construct) { + cerr << LOG_WARN << "no need to dry-run and compress at the same time" << endl; + cerr << LOG_WARN << "dryrun only" << endl << endl; + cli_task.construct = false; + } + else if (cli_task.dryrun and cli_task.reconstruct) { + cerr << LOG_WARN << "no need to dry-run and decompress at the same time" << endl; + cerr << LOG_WARN << "will dryrun only" << endl << endl; + cli_task.reconstruct = false; + } + + if (to_abort) { + print_doc(); + exit(-1); + } +} + +cuszCTX::cuszCTX(int argc, char** const argv) +{ + std::string opt; + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; + + if (argc == 1) { + print_doc(); + exit(0); + } + + /******************************************************************************/ + /* phase 0: parse */ + set_from_cli_input(this, argc, argv); + + // special treatment + if (predictor == "spline3") { + // unconditionally use anchor when it is spline3 + use.anchor = true; + } + + /******************************************************************************/ + /* phase 1: check syntax */ + if (read_args_status != 0) { + std::cout << LOG_INFO << "Exiting..." 
<< endl; + // after printing ALL argument errors + exit(-1); + } + + /******************************************************************************/ + /* phase 2: check if legal */ + validate(); + + /******************************************************************************/ + /* phase 3: sort out filenames */ + derive_fnames(); +} + +cuszCTX::cuszCTX(const char* in_str, bool dbg_print) +{ + /** + ** >>> syntax + ** comma-separated key-pairs + ** "key1=val1,key2=val2[,...]" + ** + ** >>> example + ** "predictor=lorenzo,size=3600x1800" + ** + **/ + + set_config(this, in_str, dbg_print); +} + +void cuszCTX::print_doc(bool full) +{ + std::cout << "\n>>>> cusz build: " << cusz::VERSION_TEXT << "\n"; + + if (full) + std::cout << StrHelper::doc_format(cusz_full_doc) << std::endl; + else + std::cout << cusz_short_doc << std::endl; +} + +void cuszCTX::derive_fnames() +{ + // (1) "fname" -> "", "fname" + // (2) "./fname" -> "./" "fname" + // (3) "/path/to/fname" -> "/path/to", "fname" + auto input_path = fname.fname.substr(0, fname.fname.rfind('/') + 1); + if (not cli_task.construct and cli_task.reconstruct) fname.fname = fname.fname.substr(0, fname.fname.rfind('.')); + fname.basename = fname.fname.substr(fname.fname.rfind('/') + 1); + + if (opath.empty()) opath = input_path.empty() ? opath = "" : opath = input_path; + opath += "/"; + + fname.path_basename = opath + fname.basename; + fname.compress_output = fname.path_basename + ".cusza"; +} diff --git a/qtensor/compression/cusz/src/cusz/custom.cc b/qtensor/compression/cusz/src/cusz/custom.cc new file mode 100644 index 00000000..6717e842 --- /dev/null +++ b/qtensor/compression/cusz/src/cusz/custom.cc @@ -0,0 +1,34 @@ +/** + * @file custom.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "cusz/custom.h" + +extern "C" { + +cusz_custom_predictor cusz_default_predictor() { return {LorenzoI, false, false}; } +cusz_custom_quantization cusz_default_quantization() { return {512, false}; } +cusz_custom_codec cusz_default_codec() { return {Huffman, true, 0.5}; } +cusz_custom_huffman_codec cusz_default_huffman_codec() { return {Canonical, Device, Coarse, 1024, 768}; } +cusz_custom_spcodec cusz_default_spcodec() { return {SparseMat, 0.2}; } +cusz_custom_framework* cusz_default_framework() +{ + return new cusz_custom_framework{ + FP32, // placeholder; set in another function call + Auto, cusz_default_predictor(), cusz_default_quantization(), cusz_default_codec(), + // cusz_default_spcodec(), + cusz_default_huffman_codec()}; +} + +void cusz_set_datatype(cusz_custom_framework* config, cusz_datatype datatype) { config->datatype = datatype; } +void cusz_set_pipelinetype(cusz_custom_framework* config, cusz_pipelinetype pipeline) { config->pipeline = pipeline; } + +// end of extern C +} diff --git a/qtensor/compression/cusz/src/cusz_lib.cc b/qtensor/compression/cusz/src/cusz_lib.cc new file mode 100644 index 00000000..d6bad3c6 --- /dev/null +++ b/qtensor/compression/cusz/src/cusz_lib.cc @@ -0,0 +1,115 @@ +/** + * @file cusz_lib.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-05-01 + * (rev.1) 2023-01-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include + +#include +#include + +#include "component.hh" +#include "compressor.hh" +#include "context.hh" +#include "cusz.h" +#include "cusz/custom.h" +#include "cusz/type.h" +#include "framework.hh" + +cusz_compressor* 
cusz_create(cusz_framework* _framework, cusz_datatype _type) +{ + auto comp = new cusz_compressor{.framework = _framework, .type = _type}; + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + comp->compressor = new Compressor(); + } + else { + throw std::runtime_error("Type is not supported."); + } + + return comp; +} + +cusz_error_status cusz_release(cusz_compressor* comp) +{ + delete comp; + return CUSZ_SUCCESS; +} + +cusz_error_status cusz_compress( + cusz_compressor* comp, + cusz_config* config, + void* uncompressed, + cusz_len const uncomp_len, + uint8_t** compressed, + size_t* comp_bytes, + cusz_header* header, + void* record, + cudaStream_t stream) +{ + // cusz::TimeRecord cpp_record; + + auto context = new cusz_context(); + (*context) + .set_len(uncomp_len.x, uncomp_len.y, uncomp_len.z, uncomp_len.w) + .set_eb(config->eb) + .set_control_string(config->eb == Rel ? "mode=r2r" : "mode=abs"); + + // Be cautious of autotuning! The default value of pardeg is not robust. + cusz::CompressorHelper::autotune_coarse_parvle(static_cast(context)); + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + // TODO add memlen & datalen comparison + static_cast(comp->compressor)->init(context); + static_cast(comp->compressor) + ->compress(context, static_cast(uncompressed), *compressed, *comp_bytes, stream); + static_cast(comp->compressor)->export_header(*header); + static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); + } + else { + throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); + } + + return CUSZ_SUCCESS; +} + +cusz_error_status cusz_decompress( + cusz_compressor* comp, + cusz_header* header, + uint8_t* compressed, + size_t const comp_len, + void* decompressed, + cusz_len const decomp_len, + void* record, + cudaStream_t stream) +{ + // cusz::TimeRecord cpp_record; + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + static_cast(comp->compressor)->init(header); + static_cast(comp->compressor) + ->decompress(header, compressed, static_cast(decompressed), stream); + static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); + } + else { + throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); + } + + return CUSZ_SUCCESS; +} \ No newline at end of file diff --git a/qtensor/compression/cusz/src/cusz_version.h.in b/qtensor/compression/cusz/src/cusz_version.h.in new file mode 100644 index 00000000..09a2d3d7 --- /dev/null +++ b/qtensor/compression/cusz/src/cusz_version.h.in @@ -0,0 +1,3 @@ +#define CUSZ_MAJOR_VERSION @PROJECT_VERSION_MAJOR@ +#define CUSZ_MINOR_VERSION @PROJECT_VERSION_MINOR@ +#define CUSZ_PATCH_VERSION @PROJECT_VERSION_PATCH@ diff --git a/qtensor/compression/cusz/src/cusz_wrapper.cu b/qtensor/compression/cusz/src/cusz_wrapper.cu new file mode 100644 index 00000000..a9b1f760 --- /dev/null +++ b/qtensor/compression/cusz/src/cusz_wrapper.cu @@ -0,0 +1,154 @@ +//#include "cuszx_entry.h" +//#include "szx_defines.h" +//#include "szx_BytesToolkit.h" +//#include "szx_TypeManager.h" +//#include "timingGPU.h" + +#include "cusz.h" +#include "cli/quality_viewer.hh" +#include "cli/timerecord_viewer.hh" +#include "utils/io.hh" +#include "utils/print_gpu.hh" + +// template +extern "C"{ +unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) +{ + /* For demo, we use 3600x1800 CESM data. 
*/ + + cusz_header header; + uint8_t* exposed_compressed; + uint8_t* compressed; + size_t compressed_len; + + float *d_uncompressed, *h_uncompressed; + float *d_decompressed, *h_decompressed; + + d_uncompressed = data; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + // using default + // cusz_framework* framework = cusz_default_framework(); + // alternatively + cusz_framework fw = cusz_framework{ + .pipeline = Auto, + .predictor = cusz_custom_predictor{.type = LorenzoI}, + .quantization = cusz_custom_quantization{.radius = 512}, + .codec = cusz_custom_codec{.type = Huffman}}; + cusz_framework* framework = &fw; + + // Brace initializing a struct pointer is not supported by all host compilers + // when nvcc forwards. + // cusz_framework* framework = new cusz_framework{ + // .pipeline = Auto, + // .predictor = cusz_custom_predictor{.type = LorenzoI}, + // .quantization = cusz_custom_quantization{.radius = 512}, + // .codec = cusz_custom_codec{.type = Huffman}}; + + + cusz_compressor* comp = cusz_create(framework, FP32); + cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; + cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w + cusz_len decomp_len = uncomp_len; + + cusz::TimeRecord compress_timerecord; + + + { + cusz_compress( + comp, config, d_uncompressed, uncomp_len, &exposed_compressed, &compressed_len, &header, + (void*)&compress_timerecord, stream); + + /* User can interpret the collected time information in other ways. */ + cusz::TimeRecordViewer::view_compression(&compress_timerecord, len * sizeof(float), compressed_len); + + /* verify header */ + printf("header.%-*s : %x\n", 12, "(addr)", &header); + printf("header.%-*s : %lu, %lu, %lu\n", 12, "{x,y,z}", header.x, header.y, header.z); + printf("header.%-*s : %lu\n", 12, "filesize", ConfigHelper::get_filesize(&header)); + } + + /* If needed, User should perform a memcopy to transfer `exposed_compressed` before `compressor` is destroyed. 
*/ + cudaMalloc(&compressed, compressed_len); + cudaMemcpy(compressed, exposed_compressed, compressed_len, cudaMemcpyDeviceToDevice); + cudaFree(exposed_compressed); + cudaStreamDestroy(stream); + *outSize = compressed_len; + return compressed; +} + +float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ + cusz::TimeRecord decompress_timerecord; + cudaStream_t stream; + cusz_header header; + float* d_decompressed; + cudaMalloc(&d_decompressed, sizeof(float) * len); + + cusz_framework fw = cusz_framework{ + .pipeline = Auto, + .predictor = cusz_custom_predictor{.type = LorenzoI}, + .quantization = cusz_custom_quantization{.radius = 512}, + .codec = cusz_custom_codec{.type = Huffman}}; + cusz_framework* framework = &fw; + + cusz_compressor* comp = cusz_create(framework, FP32); + cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; + cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w + cusz_len decomp_len = uncomp_len; + + + cudaStreamCreate(&stream); + { + cusz_decompress( + comp, &header, cmpbytes, compressed_len, d_decompressed, decomp_len, + (void*)&decompress_timerecord, stream); + + cusz::TimeRecordViewer::view_decompression(&decompress_timerecord, len * sizeof(float)); + } + + + cusz_release(comp); + + // cudaFree(cmpbytes); + cudaStreamDestroy(stream); + return d_decompressed; +} + + + // unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize){ + // float max,min; + // unsigned char* bytes; + // max = data[0]; + // min = data[0]; + // for (size_t i = 0; i < nbEle; i++) + // { + // if(data[i] > max) max = data[i]; + // if(data[i] < min) min = data[i]; + // } + + // float threshold = r2r_threshold*(max-min); + // float errBound = r2r_err*(max-min); + // bytes = cuSZx_fast_compress_args_unpredictable_blocked_float(data, outSize, errBound, nbEle, blockSize, threshold); + // // printf("outSize %p\n", bytes); + // return bytes; + // } + + // float* cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle){ + // // printf("test\n"); + // float**data; + // cuSZx_fast_decompress_args_unpredictable_blocked_float(data, nbEle, bytes); + // return *data; + // } + + // unsigned char* cuSZx_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold){ + // return device_ptr_cuSZx_compress_float(oriData, outSize, absErrBound, nbEle, blockSize, threshold); + // } + + // float* cuSZx_device_decompress(size_t nbEle, unsigned char* cmpBytes){ + // return device_ptr_cuSZx_decompress_float(nbEle, cmpBytes); + // } + + +} diff --git a/qtensor/compression/cusz/src/cusz_wrapper.py b/qtensor/compression/cusz/src/cusz_wrapper.py new file mode 100644 index 00000000..682bd3e6 --- /dev/null +++ b/qtensor/compression/cusz/src/cusz_wrapper.py @@ -0,0 +1,173 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libcusz_wrapper.so') +CUSZ_PATH = str(Path(__file__).parent/'libcusz.so') +# unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) + +# unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) + +def get_device_compress(): + dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, 
mode=ctypes.RTLD_GLOBAL) + func = dll.cusz_device_compress + # Returns: unsigned char *bytes + # Needs: float *data, float r2r_error,size_t len,size_t *outSize + func.argtypes = [POINTER(c_float), c_float, c_size_t, POINTER(c_size_t)] + func.restype = POINTER(c_ubyte) + return func + +# float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ + +def get_device_decompress(): + + dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cusz_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [POINTER(c_ubyte), c_size_t, c_size_t, c_float] + func.restype = POINTER(c_float) + return func + + +def cusz_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + #nbEle = len(oriData) + sample = oriData[::2] + #print(nbEle) + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) + d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real + # absErrBound = absErrBound*(d) + threshold = threshold*(d) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") + # float *data, float r2r_error,size_t len,size_t *outSize + o_bytes = __cuszx_device_compress(oriData_p,np.float32(absErrBound), np.ulonglong(nbEle), outSize) + + + return (o_bytes,outSize.contents.value, absErrBound), outSize + + +def cusz_device_decompress(nbEle, cmpBytes, owner, dtype): + __cuszx_device_decompress=get_device_decompress() + (cmpBytes, cmpsize, err_bound) = cmpBytes + + nbEle_p = ctypes.c_size_t(nbEle) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + newData = __cuszx_device_decompress(cmpBytes,nbEle_p, ctypes.c_size_t(cmpsize), np.float32(err_bound)) + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + + # res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + # cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, 
ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + free_compressed(o_bytes[0]) + cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/cusz/src/detail/compare_cpu.inl b/qtensor/compression/cusz/src/detail/compare_cpu.inl new file mode 100644 index 00000000..b09eb558 --- /dev/null +++ b/qtensor/compression/cusz/src/detail/compare_cpu.inl @@ -0,0 +1,109 @@ +/** + * @file _compare.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-08 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C0E747B4_066F_4B04_A3D2_00E1A3B7D682 +#define C0E747B4_066F_4B04_A3D2_00E1A3B7D682 + +#include +#include +#include +#include +#include "cusz/type.h" + +namespace psz { +namespace detail { + +template +bool cppstd_identical(T* d1, T* d2, size_t const len) +{ + return std::equal(d1, d1 + len, d2); +} + +template +bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) +{ + // debugging + + bool eb_ed = true; + for (size_t i = 0; i < len; i++) { + if (fabs(a[i] - b[i]) > 1.001 * eb) { + if (first_faulty_idx) *first_faulty_idx = i; + return false; + } + } + return true; +} + +template +void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len) +{ + double max_odata = odata[0], min_odata = odata[0]; + double max_xdata = xdata[0], min_xdata = xdata[0]; + double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); + + double sum_0 = 0, sum_x = 0; + for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; + + double mean_odata = sum_0 / len, mean_xdata = sum_x / len; + double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; + + double max_pwrrel_abserr = 0; + size_t max_abserr_index = 0; + for (size_t i = 0; i < len; i++) { + max_odata = max_odata < odata[i] ? odata[i] : max_odata; + min_odata = min_odata > odata[i] ? 
odata[i] : min_odata; + + max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; + min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; + + float abserr = fabs(xdata[i] - odata[i]); + if (odata[i] != 0) { + rel_abserr = abserr / fabs(odata[i]); + max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; + } + max_abserr_index = max_abserr < abserr ? i : max_abserr_index; + max_abserr = max_abserr < abserr ? abserr : max_abserr; + sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); + sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); + sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); + sum_err2 += abserr * abserr; + } + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + s->len = len; + + s->odata.max = max_odata; + s->odata.min = min_odata; + s->odata.rng = max_odata - min_odata; + s->odata.std = std_odata; + + s->xdata.max = max_xdata; + s->xdata.min = min_xdata; + s->xdata.rng = max_xdata - min_xdata; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = max_pwrrel_abserr; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace detail +} // namespace psz + +#endif /* C0E747B4_066F_4B04_A3D2_00E1A3B7D682 */ diff --git a/qtensor/compression/cusz/src/detail/compare_gpu.inl b/qtensor/compression/cusz/src/detail/compare_gpu.inl new file mode 100644 index 00000000..851fc4a2 --- /dev/null +++ b/qtensor/compression/cusz/src/detail/compare_gpu.inl @@ -0,0 +1,193 @@ +/** + * @file _compare.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-08 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 +#define F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cusz/type.h" + +namespace psz { +namespace detail { + +static const int MINVAL = 0; +static const int MAXVAL = 1; +static const int AVGVAL = 2; +static const int RNG = 3; + +template +bool thrustgpu_identical(T* d1, T* d2, size_t const len) +{ + return thrust::equal(thrust::device, d1, d1 + len, d2); +} + +template +bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double eb, size_t* first_faulty_idx = nullptr) +{ + thrust::device_ptr a_ = thrust::device_pointer_cast(a); + thrust::device_ptr b_ = thrust::device_pointer_cast(b); + thrust::constant_iterator eb_(eb); + using tup = thrust::tuple; + + auto ab_begin = thrust::make_zip_iterator(thrust::make_tuple(a_, b_, eb_)); + auto ab_end = thrust::make_zip_iterator(thrust::make_tuple(a_ + len, b_ + len, eb_)); + + // Let compiler figure out the type. 
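+    // find_if returns the first (a, b, eb) tuple whose absolute difference exceeds the bound;
+    // the 1.001 factor leaves a small tolerance for floating-point round-off.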
+ auto iter = thrust::find_if(thrust::device, ab_begin, ab_end, [] __device__(tup t) { + // debug use + // if (fabs(thrust::get<1>(t) - thrust::get<0>(t)) > thrust::get<2>(t)) + // printf("a: %f\tb: %f\teb: %lf\n", (float)thrust::get<1>(t), (float)thrust::get<0>(t), thrust::get<2>(t)); + + return fabs(thrust::get<1>(t) - thrust::get<0>(t)) > 1.001 * thrust::get<2>(t); + }); + + if (iter == ab_end) { return true; } + else { + // *first_faulty_idx = iter - ab_begin; + return false; + } +} + +template +void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) +{ + thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_ptr); + + auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; + auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; + res[MINVAL] = *(g_ptr + minel); + res[MAXVAL] = *(g_ptr + maxel); + res[RNG] = res[MAXVAL] - res[MINVAL]; + + auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); + res[AVGVAL] = sum / len; +} + +template +void thrustgpu_get_extrema(thrust::device_ptr g_ptr, size_t len, T res[4]) +{ + auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; + auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; + res[MINVAL] = *(g_ptr + minel); + res[MAXVAL] = *(g_ptr + maxel); + res[RNG] = res[MAXVAL] - res[MINVAL]; + + auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); + res[AVGVAL] = sum / len; +} + +template +void thrustgpu_get_maxerr( + T* reconstructed, // in + T* original, // in + size_t len, // in + T& maximum_val, // out + size_t& maximum_loc, // out + bool destructive = false) +{ + T* diff; + + if (destructive) { + diff = original; // aliasing + } + else { + cudaMalloc(&diff, sizeof(T) * len); + } + + auto expr = [=] __device__(T rel, T oel) { return rel - oel; }; + + // typesafe (also with exec-policy binding) + thrust::device_ptr r(reconstructed); + thrust::device_ptr o(original); + thrust::device_ptr d(diff); + + thrust::transform(r, r + len, o, d, expr); + + auto maximum_ptr = thrust::max_element(d, d + len); + maximum_val = *maximum_ptr; + maximum_loc = maximum_ptr - d; + + if (not destructive) { cudaFree(diff); } +} + +template +void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t len) +{ + using tup = thrust::tuple; + + thrust::device_ptr p_odata = thrust::device_pointer_cast(odata); // origin + thrust::device_ptr p_xdata = thrust::device_pointer_cast(xdata); + + T odata_res[4], xdata_res[4]; + + thrustgpu_get_extrema(p_odata, len, odata_res); + thrustgpu_get_extrema(p_xdata, len, xdata_res); + + auto begin = thrust::make_zip_iterator(thrust::make_tuple(p_odata, p_xdata)); + auto end = thrust::make_zip_iterator(thrust::make_tuple(p_odata + len, p_xdata + len)); + + // clang-format off + auto corr = [=] __host__ __device__(tup t) { return (thrust::get<0>(t) - odata[AVGVAL]) * (thrust::get<1>(t) - xdata[AVGVAL]); }; + auto err2 = [] __host__ __device__(tup t) { T f = thrust::get<0>(t) - thrust::get<1>(t); return f * f; }; + auto var_odata = [=] __host__ __device__(T a) { T f = a - odata[AVGVAL]; return f * f; }; + auto var_xdata = [=] __host__ __device__(T a) { T f = a - xdata[AVGVAL]; return f * f; }; + + auto sum_err2 = thrust::transform_reduce(begin, end, err2, 0.0f, thrust::plus()); + auto sum_corr = thrust::transform_reduce(begin, end, corr, 0.0f, thrust::plus()); + auto sum_var_odata = thrust::transform_reduce(p_odata, p_odata + len, var_odata, 0.0f, thrust::plus()); + auto sum_var_xdata = thrust::transform_reduce(p_xdata, p_xdata + len, var_xdata, 0.0f, 
thrust::plus()); + // clang-format on + + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + // ----------------------------------------------------------------------------- + T max_abserr{0}; + size_t max_abserr_index{0}; + thrustgpu_get_maxerr(xdata, odata, len, max_abserr, max_abserr_index, false); + // ----------------------------------------------------------------------------- + + s->len = len; + + s->odata.max = odata_res[MAXVAL]; + s->odata.min = odata_res[MINVAL]; + s->odata.rng = odata_res[MAXVAL] - odata_res[MINVAL]; + s->odata.std = std_odata; + + s->xdata.max = xdata_res[MAXVAL]; + s->xdata.min = xdata_res[MINVAL]; + s->xdata.rng = xdata_res[MAXVAL] - xdata_res[MINVAL]; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = NAN; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace detail +} // namespace psz + +#endif /* F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 */ diff --git a/qtensor/compression/cusz/src/detail/compressor_impl.cu b/qtensor/compression/cusz/src/detail/compressor_impl.cu new file mode 100644 index 00000000..3974e15b --- /dev/null +++ b/qtensor/compression/cusz/src/detail/compressor_impl.cu @@ -0,0 +1,18 @@ +/** + * @file compressor.cu + * @author Jiannan Tian + * @brief cuSZ compressor of the default path + * @version 0.3 + * @date 2021-10-05 + * (create) 2020-02-12; (release) 2020-09-20; + * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "compressor_impl.inl" +#include "framework.hh" + +template class cusz::Compressor>::impl; diff --git a/qtensor/compression/cusz/src/detail/compressor_impl.inl b/qtensor/compression/cusz/src/detail/compressor_impl.inl new file mode 100644 index 00000000..46704ba6 --- /dev/null +++ b/qtensor/compression/cusz/src/detail/compressor_impl.inl @@ -0,0 +1,479 @@ +/** + * @file compressor_impl.cuh + * @author Jiannan Tian + * @brief cuSZ compressor of the default path + * @version 0.3 + * @date 2021-10-05 + * (create) 2020-02-12; (release) 2020-09-20; + * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_DEFAULT_PATH_CUH +#define CUSZ_DEFAULT_PATH_CUH + +#include +#include +#include +#include + +#include "component.hh" +#include "compressor.hh" +#include "header.h" +#include "kernel/cpplaunch_cuda.hh" +#include "stat/stat_g.hh" +#include "utils/cuda_err.cuh" + +#define DEFINE_DEV(VAR, TYPE) TYPE* d_##VAR{nullptr}; +#define DEFINE_HOST(VAR, TYPE) TYPE* h_##VAR{nullptr}; +#define FREEDEV(VAR) CHECK_CUDA(cudaFree(d_##VAR)); +#define FREEHOST(VAR) CHECK_CUDA(cudaFreeHost(h_##VAR)); + +#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); + +#define DEVICE2DEVICE_COPY(VAR, FIELD) \ + if (nbyte[Header::FIELD] != 0 and VAR != nullptr) { \ + auto dst = d_reserved_compressed + header.entry[Header::FIELD]; \ + auto src = 
reinterpret_cast(VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[Header::SYM]) + +namespace cusz { + +constexpr auto kHOST = cusz::LOC::HOST; +constexpr auto kDEVICE = cusz::LOC::DEVICE; +constexpr auto kHOST_DEVICE = cusz::LOC::HOST_DEVICE; + +#define TEMPLATE_TYPE template +#define IMPL Compressor::impl + +TEMPLATE_TYPE +uint32_t IMPL::get_len_data() { return data_len3.x * data_len3.y * data_len3.z; } + +TEMPLATE_TYPE +IMPL::impl() +{ + predictor = new Predictor; + + spcodec = new Spcodec; + codec = new Codec; + fb_codec = new FallbackCodec; +} + +TEMPLATE_TYPE +void IMPL::destroy() +{ + if (spcodec) delete spcodec; + if (codec) delete codec; + if (fb_codec) delete codec; + if (predictor) delete predictor; +} + +TEMPLATE_TYPE +IMPL::~impl() { destroy(); } + +//------------------------------------------------------------------------------ + +// TODO +TEMPLATE_TYPE +void IMPL::init(Context* config, bool dbg_print) { init_detail(config, dbg_print); } + +TEMPLATE_TYPE +void IMPL::init(Header* config, bool dbg_print) { init_detail(config, dbg_print); } + +template +void peek_devdata(T* d_arr, size_t num = 20) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__ __host__(const T i) { printf("%u\t", i); }); + printf("\n"); +} + +TEMPLATE_TYPE +void IMPL::compress( + Context* config, + T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + auto const eb = config->eb; + auto const radius = config->radius; + auto const pardeg = config->vle_pardeg; + auto const codecs_in_use = config->codecs_in_use; + auto const nz_density_factor = config->nz_density_factor; + + if (dbg_print) { + std::cout << "eb\t" << eb << endl; + std::cout << "radius\t" << radius << endl; + std::cout << "pardeg\t" << pardeg << endl; + std::cout << "codecs_in_use\t" << codecs_in_use << endl; + std::cout << "nz_density_factor\t" << nz_density_factor << endl; + } + + data_len3 = dim3(config->x, config->y, config->z); + auto codec_force_fallback = config->codec_force_fallback(); + + header.codecs_in_use = codecs_in_use; + header.nz_density_factor = nz_density_factor; + + T* d_anchor{nullptr}; // predictor out1 + E* d_errctrl{nullptr}; // predictor out2 + T* d_outlier{nullptr}; // predictor out3 + BYTE* d_spfmt{nullptr}; + size_t spfmt_outlen{0}; + + BYTE* d_codec_out{nullptr}; + size_t codec_outlen{0}; + + size_t data_len, errctrl_len, sublen, spcodec_inlen; + auto booklen = radius * 2; + + auto derive_lengths_after_prediction = [&]() { + data_len = predictor->get_len_data(); + errctrl_len = data_len; + spcodec_inlen = data_len; + sublen = ConfigHelper::get_npart(data_len, pardeg); + + // std::cout << "datalen\t" << data_len << '\n'; + // std::cout << "errctrl_len\t" << errctrl_len << '\n'; + // std::cout << "spcodec_inlen\t" << spcodec_inlen << '\n'; + // std::cout << "sublen\t" << sublen << '\n'; + }; + + auto update_header = [&]() { + header.x = data_len3.x; + header.y = data_len3.y; + header.z = data_len3.z; + header.w = 1; // placeholder + header.radius = radius; + header.vle_pardeg = pardeg; + header.eb = eb; + header.byte_vle = use_fallback_codec ? 8 : 4; + }; + + /******************************************************************************/ + + // Prediction is the dependency of the rest procedures. 
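+    // Pipeline: predict & quantize, histogram the error-control codes, Huffman-encode them
+    // (falling back to the 8-byte codec on failure), sparse-encode the outliers, then collect
+    // header, anchor, codec and sparse streams into one compressed buffer.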
+ predictor->construct(LorenzoI, data_len3, uncompressed, &d_anchor, &d_errctrl, &d_outlier, eb, radius, stream); + // peek_devdata(d_errctrl); + + derive_lengths_after_prediction(); + /******************************************************************************/ + + asz::stat::histogram(d_errctrl, errctrl_len, d_freq, booklen, &time_hist, stream); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + // TODO remove duplicate get_frequency inside encode_with_exception() + encode_with_exception( + d_errctrl, errctrl_len, // input + d_freq, booklen, sublen, pardeg, codec_force_fallback, // config + d_codec_out, codec_outlen, // output + stream, dbg_print); + + (*spcodec).encode(d_outlier, spcodec_inlen, d_spfmt, spfmt_outlen, stream, dbg_print); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + /******************************************************************************/ + + update_header(); + subfile_collect( + d_anchor, (*predictor).get_len_anchor(), // + d_codec_out, codec_outlen, // + d_spfmt, spfmt_outlen, // + stream, dbg_print); + + // output + compressed_len = ConfigHelper::get_filesize(&header); + compressed = d_reserved_compressed; + + collect_compress_timerecord(); + + // considering that codec can be consecutively in use, and can compress data of different huff-byte + use_fallback_codec = false; +} + +TEMPLATE_TYPE +void IMPL::clear_buffer() +{ // + (*predictor).clear_buffer(); + (*codec).clear_buffer(); + (*spcodec).clear_buffer(); +} + +TEMPLATE_TYPE +void IMPL::decompress(Header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) +{ + // TODO host having copy of header when compressing + if (not header) { + header = new Header; + CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(Header), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + data_len3 = dim3(header->x, header->y, header->z); + + use_fallback_codec = header->byte_vle == 8; + double const eb = header->eb; + int const radius = header->radius; + auto const vle_pardeg = header->vle_pardeg; + + // The inputs of components are from `compressed`. 
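+    // ACCESSOR offsets into the compressed buffer via the header entries; decompression then
+    // runs sparse-decode (outliers), Huffman-decode (4-byte codec or 8-byte fallback, per
+    // header->byte_vle), and predictor reconstruction, in that order.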
+ auto d_anchor = ACCESSOR(ANCHOR, T); + auto d_vle = ACCESSOR(VLE, BYTE); + auto d_sp = ACCESSOR(SPFMT, BYTE); + + // wire the workspace + auto d_errctrl = (*predictor).expose_quant(); // reuse space + + // wire and aliasing + auto d_outlier = out_decompressed; + auto d_outlier_xdata = out_decompressed; + + auto spcodec_do = [&]() { (*spcodec).decode(d_sp, d_outlier, stream); }; + auto decode_with_exception = [&]() { + if (not use_fallback_codec) { // + (*codec).decode(d_vle, d_errctrl); + } + else { + if (not fallback_codec_allocated) { + (*fb_codec).init((*predictor).get_len_quant(), radius * 2, vle_pardeg, /*dbg print*/ false); + fallback_codec_allocated = true; + } + (*fb_codec).decode(d_vle, d_errctrl); + } + }; + auto predictor_do = [&]() { + (*predictor).reconstruct(LorenzoI, data_len3, d_outlier_xdata, d_anchor, d_errctrl, eb, radius, stream); + }; + + // process + spcodec_do(), decode_with_exception(), predictor_do(); + + collect_decompress_timerecord(); + + // clear state for the next decompression after reporting + use_fallback_codec = false; +} + +// public getter +TEMPLATE_TYPE +void IMPL::export_header(Header& ext_header) { ext_header = header; } + +TEMPLATE_TYPE +void IMPL::export_header(Header* ext_header) { *ext_header = header; } + +TEMPLATE_TYPE +void IMPL::export_timerecord(TimeRecord* ext_timerecord) +{ + if (ext_timerecord) *ext_timerecord = timerecord; +} + +// helper +TEMPLATE_TYPE +void IMPL::init_codec(size_t codec_in_len, unsigned int codec_config, int max_booklen, int pardeg, bool dbg_print) +{ + if (codec_config == 0b00) throw std::runtime_error("Argument codec_config must have set bit(s)."); + if (codec_config bitand 0b01) { + if (dbg_print) LOGGING(LOG_INFO, "allocated 4-byte codec"); + (*codec).init(codec_in_len, max_booklen, pardeg, dbg_print); + } + if (codec_config bitand 0b10) { + if (dbg_print) LOGGING(LOG_INFO, "allocated 8-byte (fallback) codec"); + (*fb_codec).init(codec_in_len, max_booklen, pardeg, dbg_print); + fallback_codec_allocated = true; + } +}; + +TEMPLATE_TYPE +template +void IMPL::init_detail(CONFIG* config, bool dbg_print) +{ + const auto cfg_radius = config->radius; + const auto cfg_pardeg = config->vle_pardeg; + const auto density_factor = config->nz_density_factor; + const auto codec_config = config->codecs_in_use; + const auto cfg_max_booklen = cfg_radius * 2; + const auto x = config->x; + const auto y = config->y; + const auto z = config->z; + + size_t spcodec_in_len, codec_in_len; + + (*predictor).init(LorenzoI, x, y, z, dbg_print); + + spcodec_in_len = (*predictor).get_alloclen_data(); + codec_in_len = (*predictor).get_alloclen_quant(); + + (*spcodec).init(spcodec_in_len, density_factor, dbg_print); + + { + auto bytes = sizeof(cusz::FREQ) * cfg_max_booklen; + cudaMalloc(&d_freq, bytes); + cudaMemset(d_freq, 0x0, bytes); + + // cudaMalloc(&d_freq_another, bytes); + // cudaMemset(d_freq_another, 0x0, bytes); + } + + init_codec(codec_in_len, codec_config, cfg_max_booklen, cfg_pardeg, dbg_print); + + CHECK_CUDA(cudaMalloc(&d_reserved_compressed, (*predictor).get_alloclen_data() * sizeof(T) / 2)); +} + +TEMPLATE_TYPE +void IMPL::collect_compress_timerecord() +{ +#define COLLECT_TIME(NAME, TIME) timerecord.push_back({const_cast(NAME), TIME}); + + if (not timerecord.empty()) timerecord.clear(); + + COLLECT_TIME("predict", (*predictor).get_time_elapsed()); + COLLECT_TIME("histogram", time_hist); + + if (not use_fallback_codec) { + COLLECT_TIME("book", (*codec).get_time_book()); + COLLECT_TIME("huff-enc", (*codec).get_time_lossless()); + } 
+ else { + COLLECT_TIME("book", (*fb_codec).get_time_book()); + COLLECT_TIME("huff-enc", (*fb_codec).get_time_lossless()); + } + + COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); +} + +TEMPLATE_TYPE +void IMPL::collect_decompress_timerecord() +{ + if (not timerecord.empty()) timerecord.clear(); + + COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); + + if (not use_fallback_codec) { // + COLLECT_TIME("huff-dec", (*codec).get_time_lossless()); + } + else { // + COLLECT_TIME("huff-dec", (*fb_codec).get_time_lossless()); + } + + COLLECT_TIME("predict", (*predictor).get_time_elapsed()); +} + +TEMPLATE_TYPE +void IMPL::encode_with_exception( + E* d_in, + size_t inlen, + cusz::FREQ* d_freq, + int booklen, + int sublen, + int pardeg, + bool codec_force_fallback, + BYTE*& d_out, + size_t& outlen, + cudaStream_t stream, + bool dbg_print) +{ + auto build_codebook_using = [&](auto encoder) { encoder->build_codebook(d_freq, booklen, stream); }; + auto encode_with = [&](auto encoder) { encoder->encode(d_in, inlen, d_out, outlen, stream); }; + + auto try_fallback_alloc = [&]() { + use_fallback_codec = true; + if (not fallback_codec_allocated) { + LOGGING(LOG_EXCEPTION, "online allocate fallback (8-byte) codec"); + fb_codec->init(inlen, booklen, pardeg, dbg_print); + fallback_codec_allocated = true; + } + }; + + /******************************************************************************/ + if (not codec_force_fallback) { + try { + build_codebook_using(codec); + encode_with(codec); + } + catch (const std::runtime_error& e) { + LOGGING(LOG_EXCEPTION, "switch to fallback codec"); + try_fallback_alloc(); + + build_codebook_using(fb_codec); + encode_with(fb_codec); + } + } + else { + LOGGING(LOG_INFO, "force switch to fallback codec"); + try_fallback_alloc(); + + build_codebook_using(fb_codec); + encode_with(fb_codec); + } +} + +TEMPLATE_TYPE +void IMPL::subfile_collect( + T* d_anchor, + size_t anchor_len, + BYTE* d_codec_out, + size_t codec_outlen, + BYTE* d_spfmt_out, + size_t spfmt_outlen, + cudaStream_t stream, + bool dbg_print) +{ + header.self_bytes = sizeof(Header); + uint32_t nbyte[Header::END]; + nbyte[Header::HEADER] = sizeof(Header); + nbyte[Header::ANCHOR] = sizeof(T) * anchor_len; + nbyte[Header::VLE] = sizeof(BYTE) * codec_outlen; + nbyte[Header::SPFMT] = sizeof(BYTE) * spfmt_outlen; + + header.entry[0] = 0; + // *.END + 1; need to know the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + auto debug_header_entry = [&]() { + printf("\nsubfile collect in compressor:\n"); + printf(" ENTRIES\n"); + + PRINT_ENTRY(HEADER); + PRINT_ENTRY(ANCHOR); + PRINT_ENTRY(VLE); + PRINT_ENTRY(SPFMT); + PRINT_ENTRY(END); + printf("\n"); + }; + + if (dbg_print) debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_reserved_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + DEVICE2DEVICE_COPY(d_anchor, ANCHOR) + DEVICE2DEVICE_COPY(d_codec_out, VLE) + DEVICE2DEVICE_COPY(d_spfmt_out, SPFMT) + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); +} + +} // namespace cusz + +#undef FREEDEV +#undef FREEHOST +#undef DEFINE_DEV +#undef DEFINE_HOST +#undef DEVICE2DEVICE_COPY +#undef PRINT_ENTRY +#undef ACCESSOR +#undef COLLECT_TIME + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif diff --git a/qtensor/compression/cusz/src/detail/spmat.cu b/qtensor/compression/cusz/src/detail/spmat.cu new file mode 100644 index 00000000..b6a95bb2 --- /dev/null +++ 
b/qtensor/compression/cusz/src/detail/spmat.cu @@ -0,0 +1,14 @@ +/** + * @file spmat.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-28 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/spmat.cuh" + +template struct cusz::SpcodecCSR::impl; diff --git a/qtensor/compression/cusz/src/detail/spv_gpu.inl b/qtensor/compression/cusz/src/detail/spv_gpu.inl new file mode 100644 index 00000000..4c724bd5 --- /dev/null +++ b/qtensor/compression/cusz/src/detail/spv_gpu.inl @@ -0,0 +1,77 @@ +/** + * @file spv_gpu.inl + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-08-22 + * (update) 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 +#define F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 + +#include +#include +#include +#include +#include +#include + +#include "utils/timer.h" + +namespace psz { +namespace detail { + +template +void spv_gather( + T* in, + size_t const in_len, + T* d_val, + uint32_t* d_idx, + int* nnz, + float* milliseconds, + cudaStream_t stream) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + // find out the indices + *nnz = thrust::copy_if(thrust::device, zero, zero + in_len, in, d_idx, _1 != 0) - d_idx; + + // fetch corresponding values + thrust::copy( + thrust::device, thrust::make_permutation_iterator(in, d_idx), + thrust::make_permutation_iterator(in + *nnz, d_idx + *nnz), d_val); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; +} + +template +void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) +{ + thrust::cuda::par.on(stream); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + thrust::scatter(thrust::device, d_val, d_val + nnz, d_idx, decoded); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; +} + +} // namespace detail +} // namespace psz + +#endif /* F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 */ diff --git a/qtensor/compression/cusz/src/detail/spvec.cu b/qtensor/compression/cusz/src/detail/spvec.cu new file mode 100644 index 00000000..7ed562db --- /dev/null +++ b/qtensor/compression/cusz/src/detail/spvec.cu @@ -0,0 +1,18 @@ +/** + * @file spvec.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-01 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/spvec.cuh" + +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +// template struct cusz::SpcodecVec::impl; diff --git a/qtensor/compression/cusz/src/experimental/Makefile b/qtensor/compression/cusz/src/experimental/Makefile new file mode 100644 index 00000000..22807665 --- /dev/null +++ b/qtensor/compression/cusz/src/experimental/Makefile @@ -0,0 +1,7 @@ +altlorenzo: + nvcc -lineinfo -std=c++17 \ + --extended-lambda \ + -DDPCPP_SHOWCASE \ + ../wrapper/extrap_lorenzo.cu \ + dpcpp_demo_lorenzo.cu \ + -o dpcpp_demo_lorenzo diff --git a/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu b/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu new file mode 100644 index 00000000..6d5123a0 --- /dev/null +++ 
b/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu @@ -0,0 +1,120 @@ +/** + * @file withwrapper_lorenzo.cu + * @author Jiannan Tian + * @brief A temporary test case using high-level component/API. + * @version 0.3 + * @date 2021-06-21 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include +#include +#include +#include +#include +#include +#include "../utils/io.hh" +#include "../utils/verify.hh" + +#pragma message "--extended-lambda causes migration error (nvcc is incapable to be a wellrounded compiler)." +// #include "../utils/verify_gpu.cuh" +#include "../component/extrap_lorenzo.h" + +using std::cout; +using std::endl; + +using Data = float; +using Quant = uint16_t; +using FP = float; + +Data eb; +Data maxval, minval; + +// dim3 stride3; +size_t len1; +int radius = 512; + +namespace { + +#ifndef __CUDACC__ +struct __dim3_compat { + unsigned int x, y, z; + __dim3_compat(unsigned int _x, unsigned int _y, unsigned int _z){}; +}; + +using dim3 = __dim3_compat; +#endif + +auto get_npart = [](auto size, auto subsize) { + static_assert( + std::numeric_limits::is_integer and std::numeric_limits::is_integer, + "[get_npart] must be plain interger types."); + return (size + subsize - 1) / subsize; +}; +auto get_len_from_dim3 = [](dim3 size) { return size.x * size.y * size.z; }; +auto get_stride3 = [](dim3 size) -> dim3 { return dim3(1, size.x, size.x * size.y); }; + +} // namespace + +void test_lorenzo(std::string fname, int ndim, dim3 size3) +{ + cout << "filename: " << fname << '\n'; + + Data* h_data{nullptr}; + Data* d_data{nullptr}; + Data* h2_data{nullptr}; + Quant* d_quant{nullptr}; + + auto len1 = get_len_from_dim3(size3); + cout << "len1 from dim3: " << len1 << endl; + + cudaMallocHost(&h_data, len1 * sizeof(Data)); + io::read_binary_to_array(fname, h_data, len1); + cudaMallocHost(&h2_data, len1 * sizeof(Data)); + memcpy(h2_data, h_data, len1 * sizeof(Data)); + + cudaMalloc(&d_data, len1 * sizeof(Data)); + cudaMemcpy(d_data, h_data, len1 * sizeof(Data), cudaMemcpyHostToDevice); + cudaMalloc(&d_quant, len1 * sizeof(Quant)); + + auto maxval = *std::max_element(h_data, h_data + len1); + auto minval = *std::min_element(h_data, h_data + len1); + eb = 1e-3 * (maxval - minval); + + compress_lorenzo_construct(d_data, d_quant, size3, ndim, eb, radius); + decompress_lorenzo_reconstruct(d_data, d_quant, size3, ndim, eb, radius); + + cudaMemcpy(h_data, d_data, len1 * sizeof(Data), cudaMemcpyDeviceToHost); + + // TODO GPU verification does not print + // { + // Stat stat_gpu; + // verify_data_GPU(&stat_gpu, h_data, h2_data, len1); + // cusz::QualityViewer::print_metrics_cross(&stat_gpu, false, eb, 0, 1, false, true); + // } + { + Stat stat; + cusz::verify_data(&stat, h_data, h2_data, len1); + cusz::QualityViewer::print_metrics_cross(&stat, false, eb, 0, 1, false, false); + } + + // clear up + cudaFree(d_data); + cudaFree(d_quant); + cudaFreeHost(h_data); + cudaFreeHost(h2_data); +} + +int main() +{ + struct passwd* pw = getpwuid(getuid()); + const char* homedir = pw->pw_dir; + + test_lorenzo(std::string(homedir) + "/datafields/vx", 1, dim3(280953867, 1, 1)); + test_lorenzo(std::string(homedir) + "/datafields/CLDHGH", 2, dim3(3600, 1800, 1)); + test_lorenzo(std::string(homedir) + "/datafields/CLOUDf48", 3, dim3(500, 500, 100)); + + return 0; +} diff --git a/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl b/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl new file mode 100644 index 00000000..27890728 --- /dev/null +++ 
b/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl @@ -0,0 +1,742 @@ +/** + * @file huffman_parbook.cu + * @author Cody Rivera (cjrivera1@crimson.ua.edu) + * @brief Parallel Huffman Construction to generates canonical forward codebook. + * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) + * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". + * @version 0.1 + * @date 2020-10-24 + * (created) 2020-05 (rev) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef C883A574_4491_40E8_A083_1B6E8FB56670 +#define C883A574_4491_40E8_A083_1B6E8FB56670 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hh" +#include "hf/hf_bookg.hh" +#include "par_merge.inl" +#include "utils.hh" +#include "utils/timer.h" + +using std::cout; +using std::endl; +namespace cg = cooperative_groups; + +// GenerateCL Locals +__device__ int iNodesFront = 0; +__device__ int iNodesRear = 0; +__device__ int lNodesCur = 0; + +__device__ int iNodesSize = 0; +__device__ int curLeavesNum; + +__device__ int minFreq; + +__device__ int tempLength; + +__device__ int mergeFront; +__device__ int mergeRear; + +__device__ int lNodesIndex; + +// GenerateCW Locals +__device__ int CCL; +__device__ int CDPI; +__device__ int newCDPI; + +// Profiling +__device__ long long int s[10]; +__device__ long long int st[10]; + +// Mathematically correct mod +#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) + +namespace par_huffman { +namespace detail { + +// clang-format off +template __global__ void GPU_FillArraySequence(T*, unsigned int); +template __global__ void GPU_GetFirstNonzeroIndex(T*, unsigned int, unsigned int*); +template __global__ void GPU_ReverseArray(T*, unsigned int); +template __global__ void GPU_ReorderByIndex(H*, T*, unsigned int); +// clang-format on + +} // namespace detail +} // namespace par_huffman + +namespace par_huffman { + +// Codeword length +template +__global__ void GPU_GenerateCL(F*, F*, int, F*, int*, F*, int*, F*, int*, int*, F*, int*, int*, uint32_t*, int, int); + +// Forward Codebook +template +__global__ void GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size); + +} // namespace par_huffman + +// Parallel huffman code generation +// clang-format off +template +__global__ void par_huffman::GPU_GenerateCL( + F* histogram, F* CL, int size, + /* Global Arrays */ + F* lNodesFreq, int* lNodesLeader, + F* iNodesFreq, int* iNodesLeader, + F* tempFreq, int* tempIsLeaf, int* tempIndex, + F* copyFreq, int* copyIsLeaf, int* copyIndex, + uint32_t* diagonal_path_intersections, int mblocks, int mthreads) +{ + // clang-format on + + extern __shared__ int32_t shmem[]; + // Shared variables + int32_t& x_top = shmem[0]; + int32_t& y_top = shmem[1]; + int32_t& x_bottom = shmem[2]; + int32_t& y_bottom = shmem[3]; + int32_t& found = shmem[4]; + int32_t* oneorzero = &shmem[5]; + + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned int i = thread; // Adaptation for easier porting + auto current_grid = cg::this_grid(); + + /* Initialization */ + if (thread < size) { + lNodesLeader[i] = -1; + CL[i] = 0; + } + + if (thread == 0) { + iNodesFront = 0; + iNodesRear = 0; + lNodesCur = 0; + + iNodesSize = 0; + } + current_grid.sync(); + + /* While there is not exactly one internal node */ + while (lNodesCur < size || 
iNodesSize > 1) { + /* Combine two most frequent nodes on same level */ + if (thread == 0) { + F midFreq[4]; + int midIsLeaf[4]; + for (int i = 0; i < 4; ++i) midFreq[i] = UINT_MAX; + + if (lNodesCur < size) { + midFreq[0] = lNodesFreq[lNodesCur]; + midIsLeaf[0] = 1; + } + if (lNodesCur < size - 1) { + midFreq[1] = lNodesFreq[lNodesCur + 1]; + midIsLeaf[1] = 1; + } + if (iNodesSize >= 1) { + midFreq[2] = iNodesFreq[iNodesFront]; + midIsLeaf[2] = 0; + } + if (iNodesSize >= 2) { + midFreq[3] = iNodesFreq[MOD(iNodesFront + 1, size)]; + midIsLeaf[3] = 0; + } + + /* Select the minimum of minimums - 4elt sorting network */ + /* TODO There's likely a good 1-warp faster way to do this */ + { + F tempFreq; + int tempIsLeaf; + if (midFreq[1] > midFreq[3]) { + tempFreq = midFreq[1]; + midFreq[1] = midFreq[3]; + midFreq[3] = tempFreq; + tempIsLeaf = midIsLeaf[1]; + midIsLeaf[1] = midIsLeaf[3]; + midIsLeaf[3] = tempIsLeaf; + } + if (midFreq[0] > midFreq[2]) { + tempFreq = midFreq[0]; + midFreq[0] = midFreq[2]; + midFreq[2] = tempFreq; + tempIsLeaf = midIsLeaf[0]; + midIsLeaf[0] = midIsLeaf[2]; + midIsLeaf[2] = tempIsLeaf; + } + if (midFreq[0] > midFreq[1]) { + tempFreq = midFreq[0]; + midFreq[0] = midFreq[1]; + midFreq[1] = tempFreq; + tempIsLeaf = midIsLeaf[0]; + midIsLeaf[0] = midIsLeaf[1]; + midIsLeaf[1] = tempIsLeaf; + } + if (midFreq[2] > midFreq[3]) { + tempFreq = midFreq[2]; + midFreq[2] = midFreq[3]; + midFreq[3] = tempFreq; + tempIsLeaf = midIsLeaf[2]; + midIsLeaf[2] = midIsLeaf[3]; + midIsLeaf[3] = tempIsLeaf; + } + if (midFreq[1] > midFreq[2]) { + tempFreq = midFreq[1]; + midFreq[1] = midFreq[2]; + midFreq[2] = tempFreq; + tempIsLeaf = midIsLeaf[1]; + midIsLeaf[1] = midIsLeaf[2]; + midIsLeaf[2] = tempIsLeaf; + } + } + + minFreq = midFreq[0]; + if (midFreq[1] < UINT_MAX) { minFreq += midFreq[1]; } + iNodesFreq[iNodesRear] = minFreq; + iNodesLeader[iNodesRear] = -1; + + /* If is leaf */ + if (midIsLeaf[0]) { + lNodesLeader[lNodesCur] = iNodesRear; + ++CL[lNodesCur], ++lNodesCur; + } + else { + iNodesLeader[iNodesFront] = iNodesRear; + iNodesFront = MOD(iNodesFront + 1, size); + } + if (midIsLeaf[1]) { + lNodesLeader[lNodesCur] = iNodesRear; + ++CL[lNodesCur], ++lNodesCur; + } + else { + iNodesLeader[iNodesFront] = iNodesRear; + iNodesFront = MOD(iNodesFront + 1, size); /* ? 
*/ + } + + // iNodesRear = MOD(iNodesRear + 1, size); + + iNodesSize = MOD(iNodesRear - iNodesFront, size); + } + + // int curLeavesNum; + /* Select elements to copy -- parallelized */ + curLeavesNum = 0; + current_grid.sync(); + if (i >= lNodesCur && i < size) { + // Parallel component + int threadCurLeavesNum; + if (lNodesFreq[i] <= minFreq) { + threadCurLeavesNum = i - lNodesCur + 1; + // Atomic max -- Largest valid index + atomicMax(&curLeavesNum, threadCurLeavesNum); + } + + if (i - lNodesCur < curLeavesNum) { + copyFreq[i - lNodesCur] = lNodesFreq[i]; + copyIndex[i - lNodesCur] = i; + copyIsLeaf[i - lNodesCur] = 1; + } + } + + current_grid.sync(); + + /* Updates Iterators */ + if (thread == 0) { + mergeRear = iNodesRear; + mergeFront = iNodesFront; + + if ((curLeavesNum + iNodesSize) % 2 == 0) { iNodesFront = iNodesRear; } + /* Odd number of nodes to merge - leave out one*/ + else if ( + (iNodesSize != 0) // + and (curLeavesNum == 0 // + or (histogram[lNodesCur + curLeavesNum] <= iNodesFreq[MOD(iNodesRear - 1, size)])) // + ) { + mergeRear = MOD(mergeRear - 1, size); + iNodesFront = MOD(iNodesRear - 1, size); + } + else { + iNodesFront = iNodesRear; + --curLeavesNum; + } + + lNodesCur = lNodesCur + curLeavesNum; + iNodesRear = MOD(iNodesRear + 1, size); + } + current_grid.sync(); + + /* Parallelized Merging Phase */ + + /*if (thread == 0) { + merge(copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, + iNodesFreq, mergeFront, mergeRear, size, + tempFreq, tempIndex, tempIsLeaf, tempLength); + }*/ + + parMerge( + copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, // + iNodesFreq, mergeFront, mergeRear, size, // + tempFreq, tempIndex, tempIsLeaf, tempLength, // + diagonal_path_intersections, mblocks, mthreads, // + x_top, y_top, x_bottom, y_bottom, found, oneorzero); + current_grid.sync(); + + /* Melding phase -- New */ + if (thread < tempLength / 2) { + int ind = MOD(iNodesRear + i, size); + iNodesFreq[ind] = tempFreq[(2 * i)] + tempFreq[(2 * i) + 1]; + iNodesLeader[ind] = -1; + + if (tempIsLeaf[(2 * i)]) { + lNodesLeader[tempIndex[(2 * i)]] = ind; + ++CL[tempIndex[(2 * i)]]; + } + else { + iNodesLeader[tempIndex[(2 * i)]] = ind; + } + if (tempIsLeaf[(2 * i) + 1]) { + lNodesLeader[tempIndex[(2 * i) + 1]] = ind; + ++CL[tempIndex[(2 * i) + 1]]; + } + else { + iNodesLeader[tempIndex[(2 * i) + 1]] = ind; + } + } + current_grid.sync(); + + if (thread == 0) { iNodesRear = MOD(iNodesRear + (tempLength / 2), size); } + current_grid.sync(); + + /* Update leaders */ + if (thread < size) { + if (lNodesLeader[i] != -1) { + if (iNodesLeader[lNodesLeader[i]] != -1) { + lNodesLeader[i] = iNodesLeader[lNodesLeader[i]]; + ++CL[i]; + } + } + } + current_grid.sync(); + + if (thread == 0) { iNodesSize = MOD(iNodesRear - iNodesFront, size); } + current_grid.sync(); + } +} + +// Parallelized with atomic writes, but could replace with Jiannan's similar code +template +__global__ void par_huffman::GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned int i = thread; // Porting convenience + auto current_grid = cg::this_grid(); + auto type_bw = sizeof(H) * 8; + + /* Reverse in place - Probably a more CUDA-appropriate way */ + if (thread < size / 2) { + F temp = CL[i]; + CL[i] = CL[size - i - 1]; + CL[size - i - 1] = temp; + } + current_grid.sync(); + + if (thread == 0) { + CCL = CL[0]; + CDPI = 0; + newCDPI = size - 1; + entry[CCL] = 0; + + // Edge case -- only one input symbol + CW[CDPI] = 0; + first[CCL] = CW[CDPI] ^ (((H)1 
<< (H)CL[CDPI]) - 1); + entry[CCL + 1] = 1; + } + current_grid.sync(); + + // Initialize first and entry arrays + if (thread < CCL) { + // Initialization of first to Max ensures that unused code + // lengths are skipped over in decoding. + first[i] = std::numeric_limits::max(); + entry[i] = 0; + } + // Initialize first element of entry + current_grid.sync(); + + while (CDPI < size - 1) { + // CDPI update + if (i < size - 1 && CL[i + 1] > CCL) { atomicMin(&newCDPI, i); } + current_grid.sync(); + + // Last element to update + const int updateEnd = (newCDPI >= size - 1) ? type_bw : CL[newCDPI + 1]; + // Fill base + const int curEntryVal = entry[CCL]; + // Number of elements of length CCL + const int numCCL = (newCDPI - CDPI + 1); + + // Get first codeword + if (i == 0) { + if (CDPI == 0) { CW[newCDPI] = 0; } + else { + CW[newCDPI] = CW[CDPI]; // Pre-stored + } + } + current_grid.sync(); + + if (i < size) { + // Parallel canonical codeword generation + if (i >= CDPI && i < newCDPI) { CW[i] = CW[newCDPI] + (newCDPI - i); } + } + + // Update entry and first arrays in O(1) time + if (thread > CCL && thread < updateEnd) { entry[i] = curEntryVal + numCCL; } + // Add number of entries to next CCL + if (thread == 0) { + if (updateEnd < type_bw) { entry[updateEnd] = curEntryVal + numCCL; } + } + current_grid.sync(); + + // Update first array in O(1) time + if (thread == CCL) { + // Flip least significant CL[CDPI] bits + first[CCL] = CW[CDPI] ^ (((H)1 << (H)CL[CDPI]) - 1); + } + if (thread > CCL && thread < updateEnd) { first[i] = std::numeric_limits::max(); } + current_grid.sync(); + + if (thread == 0) { + if (newCDPI < size - 1) { + int CLDiff = CL[newCDPI + 1] - CL[newCDPI]; + // Add and shift -- Next canonical code + CW[newCDPI + 1] = ((CW[CDPI] + 1) << CLDiff); + CCL = CL[newCDPI + 1]; + + ++newCDPI; + } + + // Update CDPI to newCDPI after codeword length increase + CDPI = newCDPI; + newCDPI = size - 1; + } + current_grid.sync(); + } + + if (thread < size) { + /* Make encoded codeword compatible with CUSZ */ + CW[i] = (CW[i] | (((H)CL[i] & (H)0xffu) << ((sizeof(H) * 8) - 8))) ^ (((H)1 << (H)CL[i]) - 1); + } + current_grid.sync(); + + /* Reverse partial codebook */ + if (thread < size / 2) { + H temp = CW[i]; + CW[i] = CW[size - i - 1]; + CW[size - i - 1] = temp; + } +} + +// TODO forceinilne? +// Helper implementations +template +__global__ void par_huffman::detail::GPU_FillArraySequence(T* array, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread < size) { array[thread] = thread; } +} + +// Precondition -- Result is preset to be equal to size +template +__global__ void par_huffman::detail::GPU_GetFirstNonzeroIndex(T* array, unsigned int size, unsigned int* result) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (array[thread] != 0) { atomicMin(result, thread); } +} + +namespace par_huffman { +namespace detail { +__global__ void GPU_GetMaxCWLength(unsigned int* CL, unsigned int size, unsigned int* result) +{ + (void)size; + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread == 0) { *result = CL[0]; } +} + +} // namespace detail +} // namespace par_huffman + +/** + * @brief Reorders given a set of indices. 
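+ * In effect each thread performs array[index[i]] = array[i], i.e. a scatter by index.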
Programmer must ensure that all index[i] + * are unique or else race conditions may occur + * + * @tparam T + * @tparam Q + * @param array e.g., codebook + * @param index e.g., input data + * @param size + * @return __global__ + */ +template +__global__ void par_huffman::detail::GPU_ReorderByIndex(H* array, T* index, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + H temp; + T newIndex; + if (thread < size) { + temp = array[thread]; + newIndex = index[thread]; + array[(int)newIndex] = temp; + } +} + +// Reverses a given array. +template +__global__ void par_huffman::detail::GPU_ReverseArray(T* array, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread < size / 2) { + T temp = array[thread]; + array[thread] = array[size - thread - 1]; + array[size - thread - 1] = temp; + } +} + +// Parallel codebook generation wrapper +template +void asz::hf_buildbook_g( + uint32_t* freq, + int const dict_size, + H* codebook, + uint8_t* reverse_codebook, + int const revbook_nbyte, + float* time_book, + cudaStream_t stream) +{ + // Metadata + auto type_bw = sizeof(H) * 8; + auto _d_first = reinterpret_cast(reverse_codebook); + auto _d_entry = reinterpret_cast(reverse_codebook + (sizeof(H) * type_bw)); + auto _d_qcode = reinterpret_cast(reverse_codebook + (sizeof(H) * 2 * type_bw)); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + // Sort Qcodes by frequency + int nblocks = (dict_size / 1024) + 1; + par_huffman::detail::GPU_FillArraySequence<<>>(_d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + /** + * Originally from par_huffman_sortbyfreq.cu by Cody Rivera (cjrivera1@crimson.ua.edu) + * Sorts quantization codes by frequency, using a key-value sort. This functionality is placed in a separate + * compilation unit as thrust calls fail in par_huffman.cu. 
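+ * The ascending sort places all zero-frequency symbols first, so the
+ * GPU_GetFirstNonzeroIndex pass below can exclude them when sizing the
+ * non-zero codebook (nz_dict_size).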
+ * + * Resolved by + * 1) inlining function + * 2) using `thrust::device_pointer_cast(var)` instead of `thrust::device_pointer(var)` + */ + auto lambda_sort_by_freq = [] __host__(auto freq, auto len, auto qcode) { + thrust::sort_by_key( + thrust::device_pointer_cast(freq), thrust::device_pointer_cast(freq + len), + thrust::device_pointer_cast(qcode)); + }; + + lambda_sort_by_freq(freq, dict_size, _d_qcode); + cudaStreamSynchronize(stream); + + unsigned int* d_first_nonzero_index; + unsigned int first_nonzero_index = dict_size; + cudaMalloc(&d_first_nonzero_index, sizeof(unsigned int)); + cudaMemcpy(d_first_nonzero_index, &first_nonzero_index, sizeof(unsigned int), cudaMemcpyHostToDevice); + par_huffman::detail::GPU_GetFirstNonzeroIndex + <<>>(freq, dict_size, d_first_nonzero_index); + cudaStreamSynchronize(stream); + cudaMemcpy(&first_nonzero_index, d_first_nonzero_index, sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaFree(d_first_nonzero_index); + + int nz_dict_size = dict_size - first_nonzero_index; + unsigned int* _nz_d_freq = freq + first_nonzero_index; + H* _nz_d_codebook = codebook + first_nonzero_index; + int nz_nblocks = (nz_dict_size / 1024) + 1; + + // Memory Allocation -- Perhaps put in another wrapper + // clang-format off + unsigned int *CL = nullptr; + /*unsigned int* lNodesFreq*/ int *lNodesLeader = nullptr; + unsigned int *iNodesFreq = nullptr; int *iNodesLeader = nullptr; + unsigned int *tempFreq = nullptr; int *tempIsLeaf = nullptr; int *tempIndex = nullptr; + unsigned int *copyFreq = nullptr; int *copyIsLeaf = nullptr; int *copyIndex = nullptr; + cudaMalloc(&CL, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&lNodesLeader, nz_dict_size * sizeof(int) ); + cudaMalloc(&iNodesFreq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&iNodesLeader, nz_dict_size * sizeof(int) ); + cudaMalloc(&tempFreq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&tempIsLeaf, nz_dict_size * sizeof(int) ); + cudaMalloc(&tempIndex, nz_dict_size * sizeof(int) ); + cudaMalloc(©Freq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(©IsLeaf, nz_dict_size * sizeof(int) ); + cudaMalloc(©Index, nz_dict_size * sizeof(int) ); + cudaMemset(CL, 0, nz_dict_size * sizeof(int) ); + // clang-format on + + // Grid configuration for CL -- based on Cooperative Groups + int cg_mblocks; + int cg_blocks_sm; + int device_id; + int mthreads = 32; // 1 warp + cudaDeviceProp deviceProp; + cudaGetDevice(&device_id); + cudaGetDeviceProperties(&deviceProp, device_id); + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &cg_blocks_sm, par_huffman::GPU_GenerateCL, mthreads, 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); + cg_mblocks = deviceProp.multiProcessorCount * cg_blocks_sm; + + int ELTS_PER_SEQ_MERGE = 16; + int mblocks = std::min(cg_mblocks, (nz_dict_size / ELTS_PER_SEQ_MERGE) + 1); + + // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded + int tthreads = mthreads * mblocks; + if (tthreads < nz_dict_size) { + cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size + << " non-zero item codebook" << endl; + cout << LOG_ERR << "Provided parallelism: " << mblocks << " blocks, " << mthreads << " threads, " << tthreads + << " total" << endl + << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." 
<< endl; + throw std::system_error(); + // exit(1); + } + + uint32_t* diagonal_path_intersections; + cudaMalloc(&diagonal_path_intersections, (2 * (mblocks + 1)) * sizeof(uint32_t)); + + // Codebook already init'ed + cudaStreamSynchronize(stream); + + // Call first kernel + // Collect arguments + void* CL_Args[] = {(void*)&_nz_d_freq, (void*)&CL, + (void*)&nz_dict_size, (void*)&_nz_d_freq, + (void*)&lNodesLeader, (void*)&iNodesFreq, + (void*)&iNodesLeader, (void*)&tempFreq, + (void*)&tempIsLeaf, (void*)&tempIndex, + (void*)©Freq, (void*)©IsLeaf, + (void*)©Index, (void*)&diagonal_path_intersections, + (void*)&mblocks, (void*)&mthreads}; + // Cooperative Launch + cudaLaunchCooperativeKernel( + (void*)par_huffman::GPU_GenerateCL, mblocks, mthreads, CL_Args, + 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); + cudaStreamSynchronize(stream); + + // Exits if the highest codeword length is greater than what + // the adaptive representation can handle + // TODO do proper cleanup + + unsigned int* d_max_CL; + unsigned int max_CL; + cudaMalloc(&d_max_CL, sizeof(unsigned int)); + par_huffman::detail::GPU_GetMaxCWLength<<<1, 1>>>(CL, nz_dict_size, d_max_CL); + cudaStreamSynchronize(stream); + cudaMemcpy(&max_CL, d_max_CL, sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaFree(d_max_CL); + + int max_CW_bits = (sizeof(H) * 8) - 8; + if (max_CL > max_CW_bits) { + cout << LOG_ERR << "Cannot store all Huffman codewords in " << max_CW_bits + 8 << "-bit representation" << endl; + cout << LOG_ERR << "Huffman codeword representation requires at least " << max_CL + 8 + << " bits (longest codeword: " << max_CL << " bits)" << endl; + // cout << LOG_ERR << "(Consider running with -H 8 for 8-byte representation)" << endl << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." << endl; + // exit(1); + throw std::runtime_error("Falling back to 8-byte Codec."); + } + + // Configure CW for 1024 threads/block + int cg_cw_mblocks = (cg_mblocks * mthreads) / 1024; + int cw_mblocks = std::min(cg_cw_mblocks, nz_nblocks); + + // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded + int cw_tthreads = cw_mblocks * 1024; + if (cw_tthreads < nz_dict_size) { + cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size + << " non-zero item codebook" << endl; + cout << LOG_ERR << "Provided parallelism: " << cw_mblocks << " blocks, " << 1024 << " threads, " << cw_tthreads + << " total" << endl + << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." 
<< endl; + // exit(1); + throw std::system_error(); + } + + void* CW_Args[] = { + (void*)&CL, // + (void*)&_nz_d_codebook, // + (void*)&_d_first, // + (void*)&_d_entry, // + (void*)&nz_dict_size}; + + // Call second kernel + cudaLaunchCooperativeKernel( + (void*)par_huffman::GPU_GenerateCW, // + cw_mblocks, // + 1024, // + CW_Args); + cudaStreamSynchronize(stream); + +#ifdef D_DEBUG_PRINT + print_codebook<<<1, 32>>>(codebook, dict_size); // PASS + cudaStreamSynchronize(stream); +#endif + + // Reverse _d_qcode and codebook + par_huffman::detail::GPU_ReverseArray<<>>(codebook, (unsigned int)dict_size); + par_huffman::detail::GPU_ReverseArray<<>>(_d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + par_huffman::detail::GPU_ReorderByIndex<<>>(codebook, _d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(time_book); + DESTROY_CUDAEVENT_PAIR; + + // Cleanup + cudaFree(CL); + cudaFree(lNodesLeader); + cudaFree(iNodesFreq); + cudaFree(iNodesLeader); + cudaFree(tempFreq); + cudaFree(tempIsLeaf); + cudaFree(tempIndex); + cudaFree(copyFreq); + cudaFree(copyIsLeaf); + cudaFree(copyIndex); + cudaFree(diagonal_path_intersections); + cudaStreamSynchronize(stream); + +#ifdef D_DEBUG_PRINT + print_codebook<<<1, 32>>>(codebook, dict_size); // PASS + cudaStreamSynchronize(stream); +#endif +} + +#endif /* C883A574_4491_40E8_A083_1B6E8FB56670 */ diff --git a/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl b/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl new file mode 100644 index 00000000..2e8cf159 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl @@ -0,0 +1,296 @@ +/** + * @file codec_huffman.cuh + * @author Jiannan Tian + * @brief Huffman kernel definitions + * @version 0.2 + * @date 2020-02-13 + * (created) 2020-02-02, (rev1) 2021-02-13, (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_CODEC_HUFFMAN_CUH +#define CUSZ_KERNEL_CODEC_HUFFMAN_CUH + +#include +#include +#include +#include +#include +#include + +#include "common.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" +#include "hf/hf_struct.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#define TIX threadIdx.x +#define BIX blockIdx.x +#define BDX blockDim.x + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +using BYTE = uint8_t; + +extern __shared__ char __codec_huffman_uninitialized[]; + +struct __helper { + __device__ __forceinline__ static unsigned int local_tid_1() { return threadIdx.x; } + __device__ __forceinline__ static unsigned int global_tid_1() { return blockIdx.x * blockDim.x + threadIdx.x; } + __device__ __forceinline__ static unsigned int block_stride_1() { return blockDim.x; } + __device__ __forceinline__ static unsigned int grid_stride_1() { return blockDim.x * gridDim.x; } + template + __device__ __forceinline__ static unsigned int global_tid() + { + return blockIdx.x * blockDim.x * SEQ + threadIdx.x; + } + template + __device__ __forceinline__ static unsigned int grid_stride() + { + return blockDim.x * gridDim.x * SEQ; + } +}; + +template +__global__ void hf_decode_kernel( + COMPRESSED* compressed, + uint8_t* revbook, + MetadataT* 
par_nbit, + MetadataT* par_entry, + int const revbook_nbyte, + int const sublen, + int const pardeg, + UNCOMPRESSED* out_uncompressed); + +namespace asz { +namespace detail { + +template +__global__ void hf_encode_phase1_fill( + UNCOMPRESSED* in_uncompressed, + size_t const in_uncompressed_len, + ENCODED* in_book, + int const in_booklen, + ENCODED* out_encoded); + +template +__global__ void hf_encode_phase2_deflate( + COMPRESSED* inout_inplace, + size_t const len, + MetadataT* par_nbit, + MetadataT* par_ncell, + int const sublen, + int const pardeg); + +template +__global__ void +hf_encode_phase4_concatenate(Huff* gapped, Meta* par_entry, Meta* par_ncell, int const cfg_sublen, Huff* non_gapped); + +// TODO change size_t to unsigned int +template +__device__ void +hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook); + +} // namespace detail +} // namespace asz + +// TODO change size_t to unsigned int +template +__device__ void +asz::detail::hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook) +{ + static const auto DTYPE_WIDTH = sizeof(COMPRESSED) * 8; + + int next_bit; + auto idx_bit = 0; + auto idx_byte = 0; + auto idx_out = 0; + + COMPRESSED bufr = input[idx_byte]; + + auto first = reinterpret_cast(revbook); + auto entry = first + DTYPE_WIDTH; + auto keys = reinterpret_cast(revbook + sizeof(COMPRESSED) * (2 * DTYPE_WIDTH)); + COMPRESSED v = (bufr >> (DTYPE_WIDTH - 1)) & 0x1; // get the first bit + auto l = 1; + auto i = 0; + + while (i < total_bw) { + while (v < first[l]) { // append next i_cb bit + ++i; + idx_byte = i / DTYPE_WIDTH; // [1:exclusive] + idx_bit = i % DTYPE_WIDTH; + if (idx_bit == 0) { + // idx_byte += 1; // [1:exclusive] + bufr = input[idx_byte]; + } + + next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); + v = (v << 1) | next_bit; + ++l; + } + out[idx_out++] = keys[entry[l] + v - first[l]]; + { + ++i; + idx_byte = i / DTYPE_WIDTH; // [2:exclusive] + idx_bit = i % DTYPE_WIDTH; + if (idx_bit == 0) { + // idx_byte += 1; // [2:exclusive] + bufr = input[idx_byte]; + } + + next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); + v = 0x0 | next_bit; + } + l = 1; + } +} + +template +__global__ void asz::detail::hf_encode_phase1_fill( + UNCOMPRESSED* in_uncompressed, + size_t const in_uncompressed_len, + ENCODED* in_book, + int const in_booklen, + ENCODED* out_encoded) +{ + auto shmem_cb = reinterpret_cast(__codec_huffman_uninitialized); + + // load from global memory + for (auto idx = __helper::local_tid_1(); // + idx < in_booklen; // + idx += __helper::block_stride_1()) + shmem_cb[idx] = in_book[idx]; + + __syncthreads(); + + for (auto idx = __helper::global_tid_1(); // + idx < in_uncompressed_len; // + idx += __helper::grid_stride_1() // + ) + out_encoded[idx] = shmem_cb[(int)in_uncompressed[idx]]; +} + +template +__global__ void asz::detail::hf_encode_phase2_deflate( + COMPRESSED* inout_inplace, + size_t const len, + MetadataT* par_nbit, + MetadataT* par_ncell, + int const sublen, + int const pardeg) +{ + constexpr int CELL_BITWIDTH = sizeof(COMPRESSED) * 8; + + auto tid = BIX * BDX + TIX; + + if (tid * sublen < len) { + int residue_bits = CELL_BITWIDTH; + int total_bits = 0; + COMPRESSED* ptr = inout_inplace + tid * sublen; + COMPRESSED bufr; + uint8_t word_width; + + auto did = tid * sublen; + for (auto i = 0; i < sublen; i++, did++) { + if (did == len) break; + + COMPRESSED packed_word = inout_inplace[tid * sublen + i]; + auto word_ptr = 
reinterpret_cast*>(&packed_word); + word_width = word_ptr->bits; + word_ptr->bits = (uint8_t)0x0; + + if (residue_bits == CELL_BITWIDTH) { // a new unit of compact format + bufr = 0x0; + } + //////////////////////////////////////////////////////////////// + + if (word_width <= residue_bits) { + residue_bits -= word_width; + bufr |= packed_word << residue_bits; + + if (residue_bits == 0) { + residue_bits = CELL_BITWIDTH; + *(ptr++) = bufr; + } + } + else { + // example: we have 5-bit code 11111 but 3 bits available in (*ptr) + // 11111 for the residue 3 bits in (*ptr); 11111 for 2 bits of (*(++ptr)), starting with MSB + // ^^^ ^^ + auto l_bits = word_width - residue_bits; + auto r_bits = CELL_BITWIDTH - l_bits; + + bufr |= packed_word >> l_bits; + *(ptr++) = bufr; + bufr = packed_word << r_bits; + + residue_bits = r_bits; + } + total_bits += word_width; + } + *ptr = bufr; // manage the last unit + + par_nbit[tid] = total_bits; + par_ncell[tid] = (total_bits + CELL_BITWIDTH - 1) / CELL_BITWIDTH; + } +} + +template +__global__ void asz::detail::hf_encode_phase4_concatenate( + Huff* gapped, + Meta* par_entry, + Meta* par_ncell, + int const cfg_sublen, + Huff* non_gapped) +{ + auto n = par_ncell[blockIdx.x]; + auto src = gapped + cfg_sublen * blockIdx.x; + auto dst = non_gapped + par_entry[blockIdx.x]; + + for (auto i = threadIdx.x; i < n; i += blockDim.x) { // block-stride + dst[i] = src[i]; + } +} + +template +__global__ void hf_decode_kernel( + COMPRESSED* compressed, + uint8_t* revbook, + MetadataT* par_nbit, + MetadataT* par_entry, + int const revbook_nbyte, + int const sublen, + int const pardeg, + UNCOMPRESSED* out_uncompressed) +{ + extern __shared__ uint8_t shmem[]; + constexpr auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + + auto R = (revbook_nbyte - 1 + block_dim) / block_dim; + + for (auto i = 0; i < R; i++) { + if (TIX + i * block_dim < revbook_nbyte) shmem[TIX + i * block_dim] = revbook[TIX + i * block_dim]; + } + __syncthreads(); + + auto gid = BIX * BDX + TIX; + + if (gid < pardeg) { + asz::detail::hf_decode_single_thread_inflate( + compressed + par_entry[gid], out_uncompressed + sublen * gid, par_nbit[gid], shmem); + __syncthreads(); + } +} + +#endif diff --git a/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl b/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl new file mode 100644 index 00000000..4ed9b580 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl @@ -0,0 +1,364 @@ +/** + * @file huffman_coarse.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-12-17 + * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * @copyright (C) 2021 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_COMPONENT_HUFFMAN_COARSE_CUH +#define CUSZ_COMPONENT_HUFFMAN_COARSE_CUH + +#include +// #include +// #include +// #include +// #include +#include +#include +// #include + +using std::cout; + +#include "common/definition.hh" +#include "common/type_traits.hh" +#include "utils.hh" + +#include "hf/hf.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +/****************************************************************************** + macros for shorthand writing + ******************************************************************************/ + +#define EXPORT_NBYTE(FIELD) nbyte[Header::FIELD] = rte.nbyte[RTE::FIELD]; + +#define 
DEVICE2DEVICE_COPY(VAR, FIELD) \ + { \ + constexpr auto D2D = cudaMemcpyDeviceToDevice; \ + auto dst = d_compressed + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(d_##VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], D2D, stream)); \ + } + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header.entry[Header::SYM]) + +#define HC_ALLOCHOST(VAR, SYM) \ + cudaMallocHost(&h_##VAR, rte.nbyte[RTE::SYM]); \ + memset(h_##VAR, 0x0, rte.nbyte[RTE::SYM]); + +#define HC_ALLOCDEV(VAR, SYM) \ + cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM]); \ + cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM]); + +#define HC_FREEHOST(VAR) \ + if (h_##VAR) { \ + cudaFreeHost(h_##VAR); \ + h_##VAR = nullptr; \ + } + +#define HC_FREEDEV(VAR) \ + if (d_##VAR) { \ + cudaFree(d_##VAR); \ + d_##VAR = nullptr; \ + } + +/****************************************************************************** + class definition + ******************************************************************************/ + +#define TEMPLATE_TYPE template +#define IMPL LosslessCodec::impl + +namespace cusz { + +TEMPLATE_TYPE +IMPL::~impl() +{ + HC_FREEDEV(tmp); + HC_FREEDEV(book); + HC_FREEDEV(revbook); + HC_FREEDEV(par_nbit); + HC_FREEDEV(par_ncell); + HC_FREEDEV(par_entry); + HC_FREEDEV(bitstream); + + HC_FREEHOST(book); + HC_FREEHOST(revbook); + HC_FREEHOST(par_nbit); + HC_FREEHOST(par_ncell); + HC_FREEHOST(par_entry); +} + +TEMPLATE_TYPE +IMPL::impl() = default; + +//------------------------------------------------------------------------------ + +TEMPLATE_TYPE +void IMPL::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) +{ + auto max_compressed_bytes = [&]() { return in_uncompressed_len / 2 * sizeof(H); }; + + auto debug = [&]() { + setlocale(LC_NUMERIC, ""); + printf("\nHuffmanCoarse::init() debugging:\n"); + printf("CUdeviceptr nbyte: %d\n", (int)sizeof(CUdeviceptr)); + dbg_println("TMP", d_tmp, RTE::TMP); + dbg_println("BOOK", d_book, RTE::BOOK); + dbg_println("REVBOOK", d_revbook, RTE::REVBOOK); + dbg_println("PAR_NBIT", d_par_nbit, RTE::PAR_NBIT); + dbg_println("PAR_NCELL", d_par_ncell, RTE::PAR_NCELL); + dbg_println("BITSTREAM", d_bitstream, RTE::BITSTREAM); + printf("\n"); + }; + + memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); + // memset(rte.entry, 0, sizeof(uint32_t) * (RTE::END + 1)); + + rte.nbyte[RTE::TMP] = sizeof(H) * in_uncompressed_len; + rte.nbyte[RTE::BOOK] = sizeof(H) * booklen; + rte.nbyte[RTE::REVBOOK] = get_revbook_nbyte(booklen); + rte.nbyte[RTE::PAR_NBIT] = sizeof(M) * pardeg; + rte.nbyte[RTE::PAR_NCELL] = sizeof(M) * pardeg; + rte.nbyte[RTE::PAR_ENTRY] = sizeof(M) * pardeg; + rte.nbyte[RTE::BITSTREAM] = max_compressed_bytes(); + + HC_ALLOCDEV(tmp, TMP); + + { + auto total_bytes = rte.nbyte[RTE::BOOK] + rte.nbyte[RTE::REVBOOK]; + cudaMalloc(&d_book, total_bytes); + cudaMemset(d_book, 0x0, total_bytes); + + d_revbook = reinterpret_cast(d_book + booklen); + } + + { + cudaMalloc(&d_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); + cudaMemset(d_par_metadata, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); + + d_par_nbit = d_par_metadata; + d_par_ncell = d_par_metadata + pardeg; + d_par_entry = d_par_metadata + pardeg * 2; + } + + HC_ALLOCDEV(bitstream, BITSTREAM); + + // standalone definition for output + d_compressed = reinterpret_cast(d_tmp); + + HC_ALLOCHOST(book, BOOK); + HC_ALLOCHOST(revbook, REVBOOK); + + { + cudaMallocHost(&h_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); + // cudaMemset(h_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); + + h_par_nbit = 
h_par_metadata; + h_par_ncell = h_par_metadata + pardeg; + h_par_entry = h_par_metadata + pardeg * 2; + } + + int numSMs; + cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0); + + int sublen = (in_uncompressed_len - 1) / pardeg + 1; + + book_desc = new hf_book{nullptr, d_book, booklen}; + chunk_desc_d = new hf_chunk{d_par_nbit, d_par_ncell, d_par_entry}; + chunk_desc_h = new hf_chunk{h_par_nbit, h_par_ncell, h_par_entry}; + bitstream_desc = new hf_bitstream{d_tmp, d_bitstream, chunk_desc_d, chunk_desc_h, sublen, pardeg, numSMs}; + + if (dbg_print) debug(); +} + +TEMPLATE_TYPE +void IMPL::build_codebook(cusz::FREQ* freq, int const booklen, cudaStream_t stream) +{ + book_desc->freq = freq; + asz::hf_buildbook_g(freq, booklen, d_book, d_revbook, get_revbook_nbyte(booklen), &time_book, stream); +} + +TEMPLATE_TYPE +void IMPL::encode( + T* in_uncompressed, + size_t const in_uncompressed_len, + BYTE*& out_compressed, + size_t& out_compressed_len, + cudaStream_t stream) +{ + time_lossless = 0; + + struct Header header; + + asz::hf_encode_coarse_rev1( + in_uncompressed, in_uncompressed_len, // + book_desc, bitstream_desc, // + out_compressed, out_compressed_len, time_lossless, stream); + + header.total_nbit = + std::accumulate((M*)chunk_desc_h->bits, (M*)chunk_desc_h->bits + bitstream_desc->pardeg, (size_t)0); + header.total_ncell = + std::accumulate((M*)chunk_desc_h->cells, (M*)chunk_desc_h->cells + bitstream_desc->pardeg, (size_t)0); + // update with the precise BITSTREAM nbyte + rte.nbyte[RTE::BITSTREAM] = sizeof(H) * header.total_ncell; + + // d_revbook and revbook_nbyte is hidden; need to improve here + subfile_collect( + header, in_uncompressed_len, book_desc->booklen, bitstream_desc->sublen, bitstream_desc->pardeg, stream); + + out_compressed = d_compressed; + out_compressed_len = header.subfile_size(); +} + +TEMPLATE_TYPE +void IMPL::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) +{ + Header header; + if (header_on_device) + CHECK_CUDA(cudaMemcpyAsync(&header, in_compressed, sizeof(header), cudaMemcpyDeviceToHost, stream)); + + auto d_revbook = ACCESSOR(REVBOOK, BYTE); + auto d_par_nbit = ACCESSOR(PAR_NBIT, M); + auto d_par_entry = ACCESSOR(PAR_ENTRY, M); + auto d_bitstream = ACCESSOR(BITSTREAM, H); + + auto const revbook_nbyte = get_revbook_nbyte(header.booklen); + + // launch_coarse_grained_Huffman_decoding( + asz::hf_decode_coarse( + d_bitstream, d_revbook, revbook_nbyte, d_par_nbit, d_par_entry, header.sublen, header.pardeg, out_decompressed, + time_lossless, stream); +} + +TEMPLATE_TYPE +void IMPL::clear_buffer() +{ + cudaMemset(d_tmp, 0x0, rte.nbyte[RTE::TMP]); + cudaMemset(d_book, 0x0, rte.nbyte[RTE::BOOK]); + cudaMemset(d_revbook, 0x0, rte.nbyte[RTE::REVBOOK]); + cudaMemset(d_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT]); + cudaMemset(d_par_ncell, 0x0, rte.nbyte[RTE::PAR_NCELL]); + cudaMemset(d_par_entry, 0x0, rte.nbyte[RTE::PAR_ENTRY]); + cudaMemset(d_bitstream, 0x0, rte.nbyte[RTE::BITSTREAM]); +} + +// private helper +TEMPLATE_TYPE +void IMPL::subfile_collect( + Header& header, + size_t const in_uncompressed_len, + int const booklen, + int const sublen, + int const pardeg, + cudaStream_t stream) +{ + auto BARRIER = [&]() { + if (stream) + CHECK_CUDA(cudaStreamSynchronize(stream)); + else + CHECK_CUDA(cudaDeviceSynchronize()); + }; + + header.self_bytes = sizeof(Header); + header.booklen = booklen; + header.sublen = sublen; + header.pardeg = pardeg; + header.uncompressed_len = in_uncompressed_len; + + MetadataT 
nbyte[Header::END]; + nbyte[Header::HEADER] = sizeof(Header); + + EXPORT_NBYTE(REVBOOK) + EXPORT_NBYTE(PAR_NBIT) + EXPORT_NBYTE(PAR_ENTRY) + EXPORT_NBYTE(BITSTREAM) + + header.entry[0] = 0; + // *.END + 1: need to know the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + // auto debug_header_entry = [&]() { + // for (auto i = 0; i < Header::END + 1; i++) printf("%d, header entry: %d\n", i, header.entry[i]); + // }; + // debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + /* debug */ BARRIER(); + + DEVICE2DEVICE_COPY(revbook, REVBOOK) + DEVICE2DEVICE_COPY(par_nbit, PAR_NBIT) + DEVICE2DEVICE_COPY(par_entry, PAR_ENTRY) + DEVICE2DEVICE_COPY(bitstream, BITSTREAM) +} + +// getter +TEMPLATE_TYPE +float IMPL::get_time_elapsed() const { return milliseconds; } + +TEMPLATE_TYPE +float IMPL::get_time_book() const { return time_book; } +TEMPLATE_TYPE +float IMPL::get_time_lossless() const { return time_lossless; } + +TEMPLATE_TYPE +H* IMPL::expose_book() const { return d_book; } + +TEMPLATE_TYPE +BYTE* IMPL::expose_revbook() const { return d_revbook; } + +// TODO this kind of space will be overlapping with quant-codes +TEMPLATE_TYPE +size_t IMPL::get_workspace_nbyte(size_t len) const { return sizeof(H) * len; } + +TEMPLATE_TYPE +size_t IMPL::get_max_output_nbyte(size_t len) const { return sizeof(H) * len / 2; } + +TEMPLATE_TYPE +size_t IMPL::get_revbook_nbyte(int dict_size) { return sizeof(BOOK) * (2 * CELL_BITWIDTH) + sizeof(SYM) * dict_size; } + +TEMPLATE_TYPE +constexpr bool IMPL::can_overlap_input_and_firstphase_encode() { return sizeof(T) == sizeof(H); } + +// auxiliary +TEMPLATE_TYPE +void IMPL::dbg_println(const std::string SYM_name, void* VAR, int SYM) +{ + CUdeviceptr pbase0{0}; + size_t psize0{0}; + + cuMemGetAddressRange(&pbase0, &psize0, (CUdeviceptr)VAR); + printf( + "%s:\n" + "\t(supposed) pointer : %p\n" + "\t(supposed) bytes : %'9lu\n" + "\t(queried) pbase0 : %p\n" + "\t(queried) psize0 : %'9lu\n", + SYM_name.c_str(), (void*)VAR, (size_t)rte.nbyte[SYM], (void*)&pbase0, psize0); + pbase0 = 0, psize0 = 0; +} + +} // namespace cusz + +#undef HC_ALLOCDEV +#undef HC_ALLOCHOST +#undef HC_FREEDEV +#undef HC_FREEHOST +#undef EXPORT_NBYTE +#undef ACCESSOR +#undef DEVICE2DEVICE_COPY + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif diff --git a/qtensor/compression/cusz/src/hf/detail/par_merge.inl b/qtensor/compression/cusz/src/hf/detail/par_merge.inl new file mode 100644 index 00000000..70068967 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/detail/par_merge.inl @@ -0,0 +1,445 @@ +/* + * Authors: + * Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com) + * High Performance Computing Lab, Georgia Tech + * + * Future Publication: + * GPU MergePath: A GPU Merging Algorithm + * ACM International Conference on Supercomputing 2012 + * June 25-29 2012, San Servolo, Venice, Italy + * + * (C) 2012 Georgia Institute of Technology + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the Georgia Institute of Technology nor the names of + * its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file par_merge.h + * @author Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com)) + * @brief Modified and adapted by Cody Rivera + * @version 0.3 + * @date 2020-10-24 + * (created) 2020-06 (rev) 2021-06-21 + * + */ + +#ifndef CUSZ_KERNEL_PAR_MERGE_CUH +#define CUSZ_KERNEL_PAR_MERGE_CUH + +#include +#include +#include +#include +#include +#include + +#include +namespace cg = cooperative_groups; + +#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) +#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +// Mathematically correct modulo +#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) + +/* MERGETYPE + * Performs merges of two sorted pseudorandom arrays of length + * Times the runs and reports on the average time + * Checks the output of each merge for correctness + */ +#define PADDING 1024 + +/******************************************************************************** + * signature + ********************************************************************************/ + +// Partition array +template +__device__ void cudaWorkloadDiagonals( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + uint32_t* diagonal_path_intersections, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +// Merge partitions +template +__device__ void cudaMergeSinglePath( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + uint32_t* diagonal_path_intersections, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int tempLength); + +template +__device__ void parMerge( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength, + uint32_t* diagonal_path_intersections, + int blocks, + int threads, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +template +__device__ void merge( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, 
+ int iEnd, + int iNodesCap, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength); + +/******************************************************************************** + * definition + ********************************************************************************/ + +// clang-format off +template +__device__ void parMerge( + F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, + F* iNodesFreq, int iStart, int iEnd, int iNodesCap, + F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength, + uint32_t* diagonal_path_intersections, int blocks, int threads, + /* Shared Memory */ + int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, + int32_t& found, int32_t* oneorzero) + { + // clang-format on + auto current_grid = cg::this_grid(); + current_grid.sync(); + tempLength = (cEnd - cStart) + MOD(iEnd - iStart, iNodesCap); + + if (tempLength == 0) return; + + // Perform the global diagonal intersection serach to divide work among SMs + cudaWorkloadDiagonals( + copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // + iNodesFreq, iStart, iEnd, iNodesCap, // + diagonal_path_intersections, // + x_top, y_top, x_bottom, y_bottom, found, oneorzero); + current_grid.sync(); + + // Merge between global diagonals independently on each block + cudaMergeSinglePath( + copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // + iNodesFreq, iStart, iEnd, iNodesCap, // + diagonal_path_intersections, // + tempFreq, tempIndex, tempIsLeaf, tempLength); + current_grid.sync(); +} + +/* CUDAWORKLOADDIAGONALS + * Performs a 32-wide binary search on one glboal diagonal per block to find the intersection with the path. + * This divides the workload into independent merges for the next step + */ +// clang-format off +template +__device__ void cudaWorkloadDiagonals( + F* copyFreq, int* copyIndex, int* copyIsLeaf, + int cStart, int cEnd, + F* iNodesFreq, + int iStart, int iEnd, int iNodesCap, + uint32_t* diagonal_path_intersections, + /* Shared Memory */ + int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, + int32_t& found, int32_t* oneorzero) +{ + // clang-format on + uint32_t A_length = cEnd - cStart; + uint32_t B_length = MOD(iEnd - iStart, iNodesCap); + // Calculate combined index around the MergePath "matrix" + int32_t combinedIndex = ((uint64_t)blockIdx.x * ((uint64_t)A_length + (uint64_t)B_length)) / (uint64_t)gridDim.x; + /* + __shared__ int32_t x_top, y_top, x_bottom, y_bottom, found; + __shared__ int32_t oneorzero[32]; + */ + int threadOffset = threadIdx.x - 16; + + if (threadIdx.x < 32) { + // Figure out the coordinates of our diagonal + if (A_length >= B_length) { + x_top = MIN(combinedIndex, A_length); + y_top = combinedIndex > A_length ? combinedIndex - (A_length) : 0; + x_bottom = y_top; + y_bottom = x_top; + } + else { + y_bottom = MIN(combinedIndex, B_length); + x_bottom = combinedIndex > B_length ? 
combinedIndex - (B_length) : 0; + y_top = x_bottom; + x_top = y_bottom; + } + } + + // if (threadIdx.x == 0) { + // printf("Diagonal block %d: (%d, %d) to (%d, %d)\n", blockIdx.x, x_top, y_top, x_bottom, y_bottom); + //} + + found = 0; + + // Search the diagonal + while (!found) { + // Update our coordinates within the 32-wide section of the diagonal + int32_t current_x = x_top - ((x_top - x_bottom) >> 1) - threadOffset; + int32_t current_y = y_top + ((y_bottom - y_top) >> 1) + threadOffset; + int32_t getfrom_x = current_x + cStart - 1; + // Below statement is a more efficient, divmodless version of the following + // int32_t getfrom_y = MOD(iStart + current_y, iNodesCap); + int32_t getfrom_y = iStart + current_y; + + if (threadIdx.x < 32) { + if (getfrom_y >= iNodesCap) getfrom_y -= iNodesCap; + + // Are we a '1' or '0' with respect to A[x] <= B[x] + if (current_x > (int32_t)A_length or current_y < 0) { oneorzero[threadIdx.x] = 0; } + else if (current_y >= (int32_t)B_length || current_x < 1) { + oneorzero[threadIdx.x] = 1; + } + else { + oneorzero[threadIdx.x] = (copyFreq[getfrom_x] <= iNodesFreq[getfrom_y]) ? 1 : 0; + } + } + + __syncthreads(); + + // If we find the meeting of the '1's and '0's, we found the + // intersection of the path and diagonal + if (threadIdx.x > 0 and // + threadIdx.x < 32 and // + (oneorzero[threadIdx.x] != oneorzero[threadIdx.x - 1]) // + ) { + found = 1; + + diagonal_path_intersections[blockIdx.x] = current_x; + diagonal_path_intersections[blockIdx.x + gridDim.x + 1] = current_y; + } + + __syncthreads(); + + // Adjust the search window on the diagonal + if (threadIdx.x == 16) { + if (oneorzero[31] != 0) { + x_bottom = current_x; + y_bottom = current_y; + } + else { + x_top = current_x; + y_top = current_y; + } + } + __syncthreads(); + } + + // Set the boundary diagonals (through 0,0 and A_length,B_length) + if (threadIdx.x == 0 && blockIdx.x == 0) { + diagonal_path_intersections[0] = 0; + diagonal_path_intersections[gridDim.x + 1] = 0; + diagonal_path_intersections[gridDim.x] = A_length; + diagonal_path_intersections[gridDim.x + gridDim.x + 1] = B_length; + } +} + +// Serial merge +// clang-format off +template +__device__ void merge( + F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, + F* iNodesFreq, int iStart, int iEnd, int iNodesCap, + F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength) +{ + // clang-format on + int len = 0; + int iterCopy = cStart, iterINodes = iStart; + + while (iterCopy < cEnd && MOD(iEnd - iterINodes, iNodesCap) > 0) { + if (copyFreq[iterCopy] <= iNodesFreq[iterINodes]) { + tempFreq[len] = copyFreq[iterCopy]; + tempIndex[len] = copyIndex[iterCopy]; + tempIsLeaf[len] = copyIsLeaf[iterCopy]; + ++iterCopy; + } + else { + tempFreq[len] = iNodesFreq[iterINodes]; + tempIndex[len] = iterINodes; + tempIsLeaf[len] = 0; + iterINodes = MOD(iterINodes + 1, iNodesCap); + } + ++len; + } + + while (iterCopy < cEnd) { + tempFreq[len] = copyFreq[iterCopy]; + tempIndex[len] = copyIndex[iterCopy]; + tempIsLeaf[len] = copyIsLeaf[iterCopy]; + ++iterCopy; + ++len; + } + while (MOD(iEnd - iterINodes, iNodesCap) > 0) { + tempFreq[len] = iNodesFreq[iterINodes]; + tempIndex[len] = iterINodes; + tempIsLeaf[len] = 0; + iterINodes = MOD(iterINodes + 1, iNodesCap); + ++len; + } + + tempLength = len; +} + +/* CUDAMERGESINGLEPATH + * Performs merge windows within a thread block from that block's global diagonal + * intersection to the next + */ +#define K 512 +#define PAD_SIZE 0 + +// clang-format off +template +__device__ void 
cudaMergeSinglePath( + F* copyFreq, int* copyIndex, int* copyIsLeaf, + int cStart, int cEnd, + F* iNodesFreq, + int iStart, int iEnd, int iNodesCap, + uint32_t* diagonal_path_intersections, + F* tempFreq, int* tempIndex, int* tempIsLeaf, + int tempLength) +{ + // clang-format on + // Temporary Code -- Serial Merge Per Block + if (threadIdx.x == 0) { + // Boundaries + int x_block_top = diagonal_path_intersections[blockIdx.x]; + int y_block_top = diagonal_path_intersections[blockIdx.x + gridDim.x + 1]; + int x_block_stop = diagonal_path_intersections[blockIdx.x + 1]; + int y_block_stop = diagonal_path_intersections[blockIdx.x + gridDim.x + 2]; + + // Actual indexes + int x_start = x_block_top + cStart; + int x_end = x_block_stop + cStart; + int y_start = MOD(iStart + y_block_top, iNodesCap); + int y_end = MOD(iStart + y_block_stop, iNodesCap); + + int offset = x_block_top + y_block_top; + + int dummy; // Unused result + // TODO optimize serial merging of each partition + merge( + copyFreq, copyIndex, copyIsLeaf, x_start, x_end, // + iNodesFreq, y_start, y_end, iNodesCap, // + tempFreq + offset, tempIndex + offset, tempIsLeaf + offset, dummy); + if (0) { + printf( + "block: %d x: %d %d, y: %d %d, contrib: %d\n", blockIdx.x, x_block_top, x_block_stop, y_block_top, + y_block_stop, dummy); + } + } +} + +// `unsigned int` instantiations +template __device__ void parMerge( + unsigned int* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + unsigned int* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + unsigned int* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength, + uint32_t* diagonal_path_intersections, + int blocks, + int threads, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +template __device__ void merge( + unsigned int* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + unsigned int* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + unsigned int* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength); + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/src/hf/hf.cc b/qtensor/compression/cusz/src/hf/hf.cc new file mode 100644 index 00000000..54b95b25 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/hf.cc @@ -0,0 +1,109 @@ +/** + * @file codec.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "common/type_traits.hh" + +#include "hf/hf.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +namespace cusz { + +#define TEMPLATE_TYPE template +#define HUFFMAN_COARSE LosslessCodec + +TEMPLATE_TYPE +HUFFMAN_COARSE::~LosslessCodec() { pimpl.reset(); } + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec() : pimpl{std::make_unique()} {} + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec(const HUFFMAN_COARSE& old) : pimpl{std::make_unique(*old.pimpl)} +{ + // TODO allocation/deep copy +} + +TEMPLATE_TYPE +HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(const HUFFMAN_COARSE& old) +{ + *pimpl = *old.pimpl; + // TODO allocation/deep copy + return *this; +} + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec(HUFFMAN_COARSE&&) = default; + +TEMPLATE_TYPE +HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(HUFFMAN_COARSE&&) = default; + +//------------------------------------------------------------------------------ + +TEMPLATE_TYPE +void HUFFMAN_COARSE::init(size_t const in_uncompressed_len, 
int const booklen, int const pardeg, bool dbg_print) +{ + pimpl->init(in_uncompressed_len, booklen, pardeg, dbg_print); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::build_codebook(uint32_t* freq, int const booklen, cudaStream_t stream) +{ + pimpl->build_codebook(freq, booklen, stream); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::encode( + T* in_uncompressed, + size_t const in_uncompressed_len, + BYTE*& out_compressed, + size_t& out_compressed_len, + cudaStream_t stream) +{ + pimpl->encode(in_uncompressed, in_uncompressed_len, out_compressed, out_compressed_len, stream); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) +{ + pimpl->decode(in_compressed, out_decompressed, stream, header_on_device); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::clear_buffer() { pimpl->clear_buffer(); } + +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_elapsed() const { return pimpl->get_time_elapsed(); } + +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_book() const { return pimpl->get_time_book(); } +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_lossless() const { return pimpl->get_time_lossless(); } + +#undef TEMPLATE_TYPE +#undef HUFFMAN_COARSE + +} // namespace cusz + +#define HUFFCOARSE_CC(E, ETF, H, M) \ + template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>; + +HUFFCOARSE_CC(1, false, 4, 4) // uint +HUFFCOARSE_CC(1, false, 8, 4) // +HUFFCOARSE_CC(2, false, 4, 4) // +HUFFCOARSE_CC(2, false, 8, 4) // +HUFFCOARSE_CC(4, false, 4, 4) // +HUFFCOARSE_CC(4, false, 8, 4) // + +HUFFCOARSE_CC(4, true, 4, 4) // float +HUFFCOARSE_CC(4, true, 8, 4) // + +#undef HUFFCOARSE_CC diff --git a/qtensor/compression/cusz/src/hf/hf_bookg.cu b/qtensor/compression/cusz/src/hf/hf_bookg.cu new file mode 100644 index 00000000..9bcb37ba --- /dev/null +++ b/qtensor/compression/cusz/src/hf/hf_bookg.cu @@ -0,0 +1,33 @@ +/** + * @file hf_bookg.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "detail/hf_bookg.inl" +#include "hf/hf_bookg.hh" + +#define PAR_BOOK(T, H) \ + template void asz::hf_buildbook_g(uint32_t*, int const, H*, uint8_t*, int const, float*, cudaStream_t); + +PAR_BOOK(uint8_t, uint32_t); +PAR_BOOK(uint16_t, uint32_t); +PAR_BOOK(uint32_t, uint32_t); +PAR_BOOK(float, uint32_t); + +PAR_BOOK(uint8_t, uint64_t); +PAR_BOOK(uint16_t, uint64_t); +PAR_BOOK(uint32_t, uint64_t); +PAR_BOOK(float, uint64_t); + +PAR_BOOK(uint8_t, unsigned long long); +PAR_BOOK(uint16_t, unsigned long long); +PAR_BOOK(uint32_t, unsigned long long); +PAR_BOOK(float, unsigned long long); + +#undef PAR_BOOK diff --git a/qtensor/compression/cusz/src/hf/hf_codecg.cu b/qtensor/compression/cusz/src/hf/hf_codecg.cu new file mode 100644 index 00000000..54da37f0 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/hf_codecg.cu @@ -0,0 +1,269 @@ +/** + * @file hf_codecg.cu + * @author Jiannan Tian + * @brief kernel wrappers; launching Huffman kernels + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "detail/hf_codecg.inl" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +template +void asz::hf_encode_coarse( + T* uncompressed, + H* d_internal_coded, + size_t const len, + uint32_t* d_freq, + H* d_book, + int const booklen, + H* d_bitstream, + M* d_par_metadata, + M* h_par_metadata, + int const sublen, + int const pardeg, + int numSMs, + uint8_t*& 
out_compressed, + size_t& out_compressed_len, + float& time_lossless, + cudaStream_t stream) +{ + auto d_par_nbit = d_par_metadata; + auto d_par_ncell = d_par_metadata + pardeg; + auto d_par_entry = d_par_metadata + pardeg * 2; + + auto h_par_nbit = h_par_metadata; + auto h_par_ncell = h_par_metadata + pardeg; + auto h_par_entry = h_par_metadata + pardeg * 2; + + CREATE_CUDAEVENT_PAIR; + + /* phase 1 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; + auto grid_dim = ConfigHelper::get_npart(len, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase1_fill // + <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // + (uncompressed, len, d_book, booklen, d_internal_coded); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 2 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase2_deflate // + <<>> // + (d_internal_coded, len, d_par_nbit, d_par_ncell, sublen, pardeg); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 3 */ + { + CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); + for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan + + CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + /* phase 4 */ + { + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase4_concatenate<<>> // + (d_internal_coded, d_par_entry, d_par_ncell, sublen, d_bitstream); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + DESTROY_CUDAEVENT_PAIR; +} + +template +void asz::hf_encode_coarse_rev1( + T* uncompressed, + size_t const len, + hf_book* book_desc, + hf_bitstream* bitstream_desc, + uint8_t*& out_compressed, // 22-10-12 buggy + size_t& out_compressed_len, // 22-10-12 buggy + float& time_lossless, + cudaStream_t stream) +{ + CREATE_CUDAEVENT_PAIR; + + H* d_buffer = (H*)bitstream_desc->buffer; + H* d_bitstream = (H*)bitstream_desc->bitstream; + H* d_book = (H*)book_desc->book; + int const booklen = book_desc->booklen; + int const sublen = bitstream_desc->sublen; + int const pardeg = bitstream_desc->pardeg; + int const numSMs = bitstream_desc->numSMs; + // uint32_t* d_freq = book_desc->freq; + + auto d_par_nbit = (M*)bitstream_desc->d_metadata->bits; + auto d_par_ncell = (M*)bitstream_desc->d_metadata->cells; + auto d_par_entry = (M*)bitstream_desc->d_metadata->entries; + + auto h_par_nbit = (M*)bitstream_desc->h_metadata->bits; + auto h_par_ncell = (M*)bitstream_desc->h_metadata->cells; + auto h_par_entry = (M*)bitstream_desc->h_metadata->entries; + + /* phase 1 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; + auto grid_dim = ConfigHelper::get_npart(len, block_dim); + + 
START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase1_fill // + <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // + (uncompressed, len, d_book, booklen, d_buffer); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 2 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase2_deflate // + <<>> // + (d_buffer, len, d_par_nbit, d_par_ncell, sublen, pardeg); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 3 */ + { + CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); + for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan + + CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + /* phase 4 */ + { + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase4_concatenate<<>> // + (d_buffer, d_par_entry, d_par_ncell, sublen, d_bitstream); + + STOP_CUDAEVENT_RECORDING(stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } +} + +template +void asz::hf_decode_coarse( + H* d_bitstream, + uint8_t* d_revbook, + int const revbook_nbyte, + M* d_par_nbit, + M* d_par_entry, + int const sublen, + int const pardeg, + T* out_decompressed, + float& time_lossless, + cudaStream_t stream) +{ + auto const block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; // = deflating + auto const grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream) + + hf_decode_kernel // + <<>> // + (d_bitstream, d_revbook, d_par_nbit, d_par_entry, revbook_nbyte, sublen, pardeg, out_decompressed); + + STOP_CUDAEVENT_RECORDING(stream) + cudaStreamSynchronize(stream); + + TIME_ELAPSED_CUDAEVENT(&time_lossless); + DESTROY_CUDAEVENT_PAIR; +} + +#define HF_CODEC_INIT(T, H, M) \ + template void asz::hf_encode_coarse( \ + T*, H*, size_t const, uint32_t*, H*, int const, H*, M*, M*, int const, int const, int, uint8_t*&, size_t&, \ + float&, cudaStream_t); \ + \ + template void asz::hf_encode_coarse_rev1( \ + T*, size_t const, hf_book*, hf_bitstream*, uint8_t*&, size_t&, float&, cudaStream_t); \ + \ + template void asz::hf_decode_coarse( \ + H*, uint8_t*, int const, M*, M*, int const, int const, T*, float&, cudaStream_t); + +HF_CODEC_INIT(uint8_t, uint32_t, uint32_t); +HF_CODEC_INIT(uint16_t, uint32_t, uint32_t); +HF_CODEC_INIT(uint32_t, uint32_t, uint32_t); +HF_CODEC_INIT(float, uint32_t, uint32_t); +HF_CODEC_INIT(uint8_t, uint64_t, uint32_t); +HF_CODEC_INIT(uint16_t, uint64_t, uint32_t); +HF_CODEC_INIT(uint32_t, uint64_t, uint32_t); +HF_CODEC_INIT(float, uint64_t, uint32_t); +HF_CODEC_INIT(uint8_t, unsigned long long, uint32_t); +HF_CODEC_INIT(uint16_t, unsigned long long, uint32_t); +HF_CODEC_INIT(uint32_t, unsigned long long, uint32_t); 
+HF_CODEC_INIT(float, unsigned long long, uint32_t); + +#undef HFBOOK_INIT +#undef HF_CODEC_INIT diff --git a/qtensor/compression/cusz/src/hf/hf_pimpl.cu b/qtensor/compression/cusz/src/hf/hf_pimpl.cu new file mode 100644 index 00000000..08a35282 --- /dev/null +++ b/qtensor/compression/cusz/src/hf/hf_pimpl.cu @@ -0,0 +1,31 @@ +/** + * @file huffman_coarse.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-12-17 + * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * @copyright (C) 2021 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "detail/hf_pimpl.inl" +#include "hf/hf.hh" + +#define HUFFCOARSE(E, ETF, H, M) \ + template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>::impl; + +HUFFCOARSE(1, false, 4, 4) // uint +HUFFCOARSE(1, false, 8, 4) // +HUFFCOARSE(2, false, 4, 4) // +HUFFCOARSE(2, false, 8, 4) // +HUFFCOARSE(4, false, 4, 4) // +HUFFCOARSE(4, false, 8, 4) // + +HUFFCOARSE(4, true, 4, 4) // float +HUFFCOARSE(4, true, 8, 4) // + +#undef HUFFCOARSE diff --git a/qtensor/compression/cusz/src/kernel/claunch_cuda.cu b/qtensor/compression/cusz/src/kernel/claunch_cuda.cu new file mode 100644 index 00000000..146a8cd1 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/claunch_cuda.cu @@ -0,0 +1,76 @@ +/** + * @file kernel_cuda.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-24 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/hist.inl" +#include "detail/spline3.inl" +// #include "hf/hf_codecg.hh" +// #include "hf/hf_struct.h" +#include "kernel/claunch_cuda.h" +#include "kernel/cpplaunch_cuda.hh" +#include "utils/cuda_err.cuh" + +#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + if (NO_R_SEPARATE) \ + launch_construct_Spline3( \ + data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + else \ + launch_construct_Spline3( \ + data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } \ + cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + launch_reconstruct_Spline3( \ + xdata, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } + +C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +C_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef C_SPLINE3 + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CPP_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template <> \ + cusz_error_status 
cusz::cpplaunch_construct_Spline3( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + return claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + NO_R_SEPARATE, data, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ + } \ + \ + template <> \ + cusz_error_status cusz::cpplaunch_reconstruct_Spline3( \ + T * xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + return claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + xdata, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ + } + +CPP_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +CPP_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +CPP_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +CPP_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef CPP_SPLINE3 diff --git a/qtensor/compression/cusz/src/kernel/detail/hist.inl b/qtensor/compression/cusz/src/kernel/detail/hist.inl new file mode 100644 index 00000000..1950970d --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/hist.inl @@ -0,0 +1,100 @@ +/** + * @file hist.inl + * @author Cody Rivera (cjrivera1@crimson.ua.edu), Megan Hickman Fulp (mlhickm@g.clemson.edu) + * @brief Fast histogramming from [Gómez-Luna et al. 2013] + * @version 0.1 + * @date 2020-09-20 + * Created on 2020-02-16 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_HIST_CUH +#define CUSZ_KERNEL_HIST_CUH + +#include +#include +#include + +#include "common.hh" +#include "utils/timer.h" + +#define MIN(a, b) ((a) < (b)) ? (a) : (b) +const static unsigned int WARP_SIZE = 32; + +#define tix threadIdx.x +#define tiy threadIdx.y +#define tiz threadIdx.z +#define bix blockIdx.x +#define biy blockIdx.y +#define biz blockIdx.z +#define bdx blockDim.x +#define bdy blockDim.y +#define bdz blockDim.z + +namespace kernel { + +template +__global__ void NaiveHistogram(Input in_data[], int out_freq[], int N, int symbols_per_thread); + +/* Copied from J. 
Gomez-Luna et al */ +template +__global__ void p2013Histogram(T*, FREQ*, size_t, int, int); + +} // namespace kernel + +template +__global__ void kernel::NaiveHistogram(T in_data[], int out_freq[], int N, int symbols_per_thread) +{ + unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int j; + if (i * symbols_per_thread < N) { // if there is a symbol to count, + for (j = i * symbols_per_thread; j < (i + 1) * symbols_per_thread; j++) { + if (j < N) { + unsigned int item = in_data[j]; // Symbol to count + atomicAdd(&out_freq[item], 1); // update bin count by 1 + } + } + } +} + +template +__global__ void kernel::p2013Histogram(T* in_data, FREQ* out_freq, size_t N, int nbin, int R) +{ + // static_assert( + // std::numeric_limits::is_integer and (not std::numeric_limits::is_signed), + // "T must be `unsigned integer` type of {1,2,4} bytes"); + + extern __shared__ int Hs[/*(nbin + 1) * R*/]; + + const unsigned int warp_id = (int)(tix / WARP_SIZE); + const unsigned int lane = tix % WARP_SIZE; + const unsigned int warps_block = bdx / WARP_SIZE; + const unsigned int off_rep = (nbin + 1) * (tix % R); + const unsigned int begin = (N / warps_block) * warp_id + WARP_SIZE * blockIdx.x + lane; + unsigned int end = (N / warps_block) * (warp_id + 1); + const unsigned int step = WARP_SIZE * gridDim.x; + + // final warp handles data outside of the warps_block partitions + if (warp_id >= warps_block - 1) end = N; + + for (unsigned int pos = tix; pos < (nbin + 1) * R; pos += bdx) Hs[pos] = 0; + __syncthreads(); + + for (unsigned int i = begin; i < end; i += step) { + int d = in_data[i]; + d = d <= 0 and d >= nbin ? nbin / 2 : d; + atomicAdd(&Hs[off_rep + d], 1); + } + __syncthreads(); + + for (unsigned int pos = tix; pos < nbin; pos += bdx) { + int sum = 0; + for (int base = 0; base < (nbin + 1) * R; base += nbin + 1) { sum += Hs[base + pos]; } + atomicAdd(out_freq + pos, sum); + } +} + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl new file mode 100644 index 00000000..28fd3bdc --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl @@ -0,0 +1,816 @@ +/** + * @file lorenzo.inl + * @author Jiannan Tian + * @brief Dual-ErrCtrl Lorenzo method. 
+ * @version 0.2 + * @date 2021-01-16 + * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 + * (rev4) 2021-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_LORENZO_CUH +#define CUSZ_KERNEL_LORENZO_CUH + +#include +// #include "utils/cuda_err.cuh" +// #include "utils/timer.h" + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim.z + +using DIM = unsigned int; +using STRIDE = unsigned int; + +namespace cusz { + +/** + * @brief compress-time 1D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void +c_lorenzo_1d1l(Data* data, ErrCtrl* errctrl, Data* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2_r); + +/** + * @brief compress-time 2D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, + ErrCtrl* errctrl, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r); + +/** + * @brief compress-time 3D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, + ErrCtrl* errctrl, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r); + +/** + * @brief decompress-time 1D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data 
stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template < + typename Data, + typename ErrCtrl, + typename FP = float, + int BLOCK = 256, + int SEQ = 8> +__global__ void x_lorenzo_1d1l( + Data* outlier, // + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 2D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_2d1l_16x16data_mapto16x2( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 3D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 3D Lorenzo pred-quant kernel (variant) + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +} // namespace cusz + +namespace { + +/** + * @brief (Original SZ/cuSZ design) 1D: separate delta by radius in to quant-code and outlier + */ +template +__forceinline__ __device__ void pred1d_radius_separate( + Data thread_scope[SEQ], + volatile Data* shmem_data, + volatile ErrCtrl* shmem_quant, + int radius, + Data from_last_stripe = 0) +{ + if CONSTEXPR (FIRST_POINT) { // i == 0 + Data delta = thread_scope[0] - from_last_stripe; + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + shmem_data[0 + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier + shmem_quant[0 + TIX * SEQ] = quantizable * static_cast(candidate); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) { + Data delta = thread_scope[i] - thread_scope[i - 1]; + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + shmem_data[i + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier + shmem_quant[i + TIX * SEQ] = quantizable * static_cast(candidate); + } + __syncthreads(); + } +} + +template +__forceinline__ __device__ void load1d( + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile 
Data* shmem_data, + Data thread_scope[SEQ], + Data& from_last_stripe, + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } + } + __syncthreads(); + + for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; + + if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void write1d( + volatile Data* shmem_data, + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile ErrCtrl* shmem_quant = nullptr, + ErrCtrl* quant = nullptr) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { + if CONSTEXPR (NO_R_SEPARATE) { // TODO no-radius-separate uses shmem_data + quant[id] = shmem_data[TIX + i * NTHREAD]; + } + else { + data[id] = shmem_data[TIX + i * NTHREAD]; + quant[id] = shmem_quant[TIX + i * NTHREAD]; + } + } + } +} + +template +__forceinline__ __device__ void load2d_prequant( + Data* data, + Data center[YSEQ + 1], + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base, + FP ebx2_r) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 + if (TIY == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) +{ + /* prediction + original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; + short form: Data delta = center[i] - west[i]; + */ +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + center[i] -= center[i - 1]; + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (TIX > 0) center[i] -= west; + } + __syncthreads(); +} + +template +__forceinline__ __device__ void postquant_write2d( + Data center[YSEQ + 1], + ErrCtrl* quant, + Data* outlier, + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + int radius, + unsigned int gix, + unsigned int giy_base) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + i - 1 < dimy) { + bool quantizable = fabs(center[i]) < radius; + Data candidate = center[i] + radius; + outlier[gid] = (1 - quantizable) * candidate; // output; reuse data for outlier + quant[gid] = quantizable * static_cast(candidate); + } + } +} + +} // namespace + +template < + typename Data, + typename ErrCtrl, + typename FP, + int BLOCK, + int SEQ> +__global__ void cusz::c_lorenzo_1d1l( // + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + uint8_t uninitialized[BLOCK * sizeof(Data) + BLOCK * sizeof(ErrCtrl)]; + Data data[BLOCK]; + } space; + } shmem; + + auto id_base = BIX * BLOCK; + + Data thread_scope[SEQ]; + Data from_last_stripe{0}; + + /******************************************************************************** + * load from DRAM using striped layout, perform prequant + ********************************************************************************/ + load1d(data, len3.x, id_base, shmem.space.data, thread_scope, from_last_stripe, 
ebx2_r); + + // the original SZ/cuSZ design + auto shmem_quant = reinterpret_cast(shmem.space.uninitialized + sizeof(Data) * BLOCK); + pred1d_radius_separate( + thread_scope, shmem.space.data, shmem_quant, radius, from_last_stripe); + pred1d_radius_separate(thread_scope, shmem.space.data, shmem_quant, radius); + write1d(shmem.space.data, outlier, len3.x, id_base, shmem_quant, quant); +} + +template +__global__ void cusz::c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + Data center[YSEQ + 1] = {0}; // nw n + // w center + + auto gix = BIX * BDX + TIX; // BDX == 16 + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + + load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); + pred2d(center); + postquant_write2d(center, quant, outlier, len3.x, len3.y, stride3.y, radius, gix, giy_base); +} + +template +__global__ void cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto BLOCK = 8; + __shared__ Data shmem[8][8][32]; + + auto z = TIZ; + + auto gix = BIX * (BLOCK * 4) + TIX; + auto giy_base = BIY * BLOCK; + auto giz = BIZ * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + /******************************************************************************** + * load from DRAM, perform prequant + ********************************************************************************/ + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) { + if (giy_base + y < len3.y) { + shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) + } + } + } + __syncthreads(); // necessary to ensure correctness + + auto x = TIX % 8; + + for (auto y = 0; y < BLOCK; y++) { + Data delta; + + /******************************************************************************** + * prediction + ********************************************************************************/ + delta = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // + - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // + + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 + + (y > 0 ? shmem[z][y - 1][TIX] : 0) // + + (z > 0 ? 
shmem[z - 1][y][TIX] : 0)); // + + auto id = base_id + (y * stride3.y); + + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { + outlier[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + quant[id] = quantizable * static_cast(candidate); + } + } + /* EOF */ +} + +template +__global__ void cusz::x_lorenzo_1d1l( // + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto block_dim = BLOCK / SEQ; // dividable + + // coalesce-load (warp-striped) and transpose in shmem (similar for store) + typedef cub::BlockLoad BlockLoadT_outlier; + typedef cub::BlockLoad BlockLoadT_quant; + typedef cub::BlockStore BlockStoreT_xdata; + typedef cub::BlockScan + BlockScanT_xdata; // TODO autoselect algorithm + + __shared__ union TempStorage { // overlap shared memory space + typename BlockLoadT_outlier::TempStorage load_outlier; + typename BlockLoadT_quant::TempStorage load_quant; + typename BlockStoreT_xdata::TempStorage store_xdata; + typename BlockScanT_xdata::TempStorage scan_xdata; + } temp_storage; + + // thread-scope tiled data + union ThreadData { + Data xdata[SEQ]; + Data outlier[SEQ]; + } thread_scope; + ErrCtrl thread_scope_quant[SEQ]; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block + ********************************************************************************/ + BlockLoadT_quant(temp_storage.load_quant).Load(quant + (BIX * BDX) * SEQ, thread_scope_quant); + __syncthreads(); // barrier for shmem reuse + BlockLoadT_outlier(temp_storage.load_outlier).Load(outlier + (BIX * BDX) * SEQ, thread_scope.outlier); + __syncthreads(); // barrier for shmem reuse + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = (BIX * BDX + TIX) * SEQ + i; + thread_scope.xdata[i] = + id < len3.x ? thread_scope.outlier[i] + static_cast(thread_scope_quant[i]) - radius : 0; + } + __syncthreads(); + + /******************************************************************************** + * perform partial-sum using cub::InclusiveSum + ********************************************************************************/ + BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); + __syncthreads(); // barrier for shmem reuse + + /******************************************************************************** + * scale by ebx2 and write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; + __syncthreads(); // barrier for shmem reuse + + BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); +} + +template +__global__ void cusz::x_lorenzo_2d1l_16x16data_mapto16x2( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + Data thread_scope[YSEQ]; + /* + . ------> gix (x) + | t00 t01 t02 t03 ... 
t0f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + + | t10 t11 t12 t13 ... t1f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + */ + + auto gix = BIX * BLOCK + TIX; + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < len3.x and giy_base + i < len3.y) + thread_scope[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse + else + thread_scope[i] = 0; // TODO set as init state? + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; + // two-pass: store for cross-threadscope update + if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; + __syncthreads(); + // two-pass: load and update + if (TIY == 1) { + auto tmp = intermediate[TIX]; +#pragma unroll + for (auto& i : thread_scope) i += tmp; + } + + /******************************************************************************** + * in-warp partial-sum along x-axis + ********************************************************************************/ +#pragma unroll + for (auto& i : thread_scope) { + for (auto d = 1; d < BLOCK; d *= 2) { + Data n = __shfl_up_sync(0xffffffff, i, d, 16); + if (TIX >= d) i += n; + } + i *= ebx2; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; + } +} + +template +__global__ void cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope[YSEQ]; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope[y] = outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse + else + thread_scope[y] = 0; + } + + /******************************************************************************** + * partial-sum along y-axis, 
sequantially + ********************************************************************************/ + for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; + + /******************************************************************************** + * ND partial-sums along x- and z-axis + * in-warp shuffle used: in order to perform, it's transposed after X-partial sum + ********************************************************************************/ + auto dist = 1; + Data addend; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + Data val = thread_scope[i]; + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + thread_scope[i] = val; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } + } + /* EOF */ +} + +/******************************************************************************** + * experimental prototype toward further optmization + ********************************************************************************/ +template +__global__ void cusz::x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope = 0; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto y = 0; + + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously +#pragma unroll + for (y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope += outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse + + Data val = thread_scope; + + // shuffle, ND partial-sums + for (auto dist = 1; dist < BLOCK; dist *= 2) { + Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + // thread_scope += val; + + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = val * ebx2; } + } +} + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY 
+#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl new file mode 100644 index 00000000..83a52b4b --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl @@ -0,0 +1,1237 @@ +/** + * @file lorenzo23.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "subroutine.inl" + +namespace subr = psz::cuda::__device; + +namespace psz { +namespace cuda { +namespace __kernel { + +//////////////////////////////////////////////////////////////////////////////// +// 1D + +namespace v0 { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace compaction { + +template > +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +namespace delta_only { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +template +__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +//////////////////////////////////////////////////////////////////////////////// +// 2D + +namespace v0 { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +namespace compaction { + +template > +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +template +__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +//////////////////////////////////////////////////////////////////////////////// 
+// 3D + +namespace v0 { + +// TODO -> `legacy` +namespace legacy { +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +} + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +namespace compaction { + +template > +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +template +__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +} // namespace __kernel +} // namespace cuda +} // namespace psz + +//////////////////////////////////////////////////////////////////////////////// +// 1D definition + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); + subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); + subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); +} + +template +__global__ void +psz::cuda::__kernel::v0::delta_only::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant, prev); + subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant); + subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier_desc) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v0c = psz::cuda::__device::v0::compaction; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base 
= blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc, prev); + subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc); + subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_1d1l( // + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1c = psz::cuda::__device::v1_pn::compaction; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); + subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); + subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); +} + +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_1d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace wave32 = psz::cuda::__device::wave32; + + constexpr auto NTHREAD = BLOCK / SEQ; // equiv. to blockDim.x + + __shared__ struct { + union { + T outlier[BLOCK]; + T xdata[BLOCK]; + }; + // even if it's wave64, "/32" works + T exchange_in[NTHREAD / 32]; + T exchange_out[NTHREAD / 32]; + } s; + + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_fuse_1d(quant, outlier, len3.x, id_base, radius, s.xdata, thp_data); + subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); + subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_1d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; // equiv. 
to blockDim.x + + __shared__ struct { + T xdata[BLOCK]; + // even if it's wave64, "/32" works + T exchange_in[NTHREAD / 32]; + T exchange_out[NTHREAD / 32]; + } s; + + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::delta_only::load_1d(quant, len3.x, id_base, s.xdata, thp_data); + subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); + subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); +} + +//////////////////////////////////////////////////////////////////////////////// +// 2D definition + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); +} + +template +__global__ void +psz::cuda::__kernel::v0::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::delta_only::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); +} + +template +__global__ void +psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1d = psz::cuda::__device::v1_pn::delta_only; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v1d::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::compaction::quantize_write_2d( + center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); +} + +// 16x16 data block maps to 16x2 (one 
warp) thread block +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_2d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v0::load_fuse_2d( + quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, radius, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_2d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1_pn = psz::cuda::__device::v1_pn; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v1_pn::load_fuse_2d(quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_2d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v0::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v1_pn::delta_only::x_lorenzo_2d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1_pn = psz::cuda::__device::v1_pn; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); 
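+    // The v1_pn variants carry signed prediction errors through PN::encode / PN::decode, folding the
+    // sign into the unsigned quant code, so this delta-only path needs neither an outlier array nor a
+    // separate sign plane. Each 16x16 tile maps to a 16x2 thread block: every thread reconstructs
+    // YSEQ = 8 values, block_scan_2d then performs the 2D inclusive prefix sum over the tile and
+    // applies the ebx2 scaling before decomp_write_2d stores the result.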
+ + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v1_pn::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +template +__global__ void psz::cuda::__kernel::v0::legacy::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + T* outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[8][8][32]; + + auto z = threadIdx.z; + + auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giz = blockIdx.z * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + auto giy = [&](auto y) { return giy_base + y; }; + auto gid = [&](auto y) { return base_id + y * stride3.y; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) + if (giy(y) < len3.y) s[z][y][threadIdx.x] = round(data[gid(y)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + }; + + auto x = threadIdx.x % 8; + + auto predict_3d = [&](auto y) { + T delta = s[z][y][threadIdx.x] - // + ((z > 0 and y > 0 and x > 0 ? s[z - 1][y - 1][threadIdx.x - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? s[z][y - 1][threadIdx.x - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? s[z - 1][y][threadIdx.x - 1] : 0) // + - (z > 0 and y > 0 ? s[z - 1][y - 1][threadIdx.x] : 0) // + + (x > 0 ? s[z][y][threadIdx.x - 1] : 0) // dist=1 + + (y > 0 ? s[z][y - 1][threadIdx.x] : 0) // + + (z > 0 ? 
s[z - 1][y][threadIdx.x] : 0)); // + return delta; + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + for (auto y = 0; y < BLOCK; y++) { + auto delta = predict_3d(y); + quantize_write(delta, gix, giy(y), giz, gid(y)); + } +} + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + }; + + //////////////////////////////////////////////////////////////////////////// + + /* z-direction, sequential in private buffer + delta = + (s[z][y][x] - s[z-1][y][x]) + - (s[z][y][x-1] - s[z-1][y][x-1]) + + (s[z][y-1][x-1] - s[z-1][y-1][x-1]) + - (s[z][y-1][x] - s[z-1][y-1][x]) + + x-direction, shuffle + delta = + (s[z][y][x] - s[z][y][x-1]) + - (s[z][y-1][x] - s[z][y-1][x-1]) + + y-direction, shmem + delta = s[z][y][x] - s[z][y-1][x] + */ + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v0::delta_only::c_lorenzo_3d1l( // + T* data, + dim3 len3, + dim3 stride3, + FP ebx2_r, + EQ* quant) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = static_cast(delta); + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + 
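+        // (chaining the z-, x- and y-direction differences in this loop expands, by inclusion-exclusion,
+        // to the same 7-neighbour Lorenzo stencil that the legacy 3D kernel above evaluates directly)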
delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_3d1l( // + T* data, + dim3 len3, + dim3 stride3, + FP ebx2_r, + EQ* quant) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = PN::encode(static_cast(delta)); + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = candidate; + } + } + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for 
(auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + // TODO move to subroutine.inl + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + UI UI_delta = PN::encode(static_cast(delta)); + + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * UI_delta; + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = UI_delta; + } + } + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_3d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_fuse_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + 
thread_private[y] = outlier[gid(y)] + static_cast(quant[gid(y)]) - radius; // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_fuse_3d(); + block_scan_3d(); + decomp_write_3d(); +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_3d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_fuse_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_private[y] = outlier[gid(y)] + PN::decode(quant[gid(y)]); // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + 
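+            // transpose back through the shared tile so each thread again holds its own (x, z) element;
+            // the two 8-lane shuffle scans above have accumulated partial sums along x (within one
+            // 8-element segment) and, after the first transpose, along z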
val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_fuse_3d(); + block_scan_3d(); + decomp_write_3d(); +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_3d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_private[y] = static_cast(quant[gid(y)]); // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_3d(); + block_scan_3d(); + decomp_write_3d(); +} diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl new file mode 100644 index 00000000..5a317a60 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl @@ -0,0 +1,214 @@ +/** + * @file lorenzo_proto.inl + * @author Jiannan Tian + * @brief (prototype) Dual-EQ Lorenzo method. 
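+ *        One thread maps to one element and blocks stage data in shared memory; these kernels are kept
+ *        as an easy-to-read reference rather than a tuned implementation.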
+ * @version 0.2 + * @date 2021-01-16 + * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 + * (rev4) 2021-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH +#define CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH + +#include +#include + +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +namespace psz { + +namespace cuda { +namespace __kernel { + +namespace prototype { // easy algorithmic description + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK]; + + auto id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < len3.x) { + buf[threadIdx.x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[threadIdx.x] - (threadIdx.x == 0 ? 0 : buf[threadIdx.x - 1]); + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (id < len3.x) { // postquant + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK][BLK + 1]; + + auto y = threadIdx.y, x = threadIdx.x; + auto giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; + + auto id = gix + giy * stride3.y; // low to high dim, inner to outer + if (gix < len3.x and giy < len3.y) { + buf[y][x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[y][x] - ((x > 0 ? buf[y][x - 1] : 0) + // dist=1 + (y > 0 ? buf[y - 1][x] : 0) - // dist=1 + (x > 0 and y > 0 ? buf[y - 1][x - 1] : 0)); // dist=2 + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (gix < len3.x and giy < len3.y) { + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK][BLK][BLK + 1]; + + auto z = threadIdx.z, y = threadIdx.y, x = threadIdx.x; + auto giz = blockIdx.z * blockDim.z + z, giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; + + auto id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer + if (gix < len3.x and giy < len3.y and giz < len3.z) { + buf[z][y][x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[z][y][x] - ((z > 0 and y > 0 and x > 0 ? buf[z - 1][y - 1][x - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? buf[z][y - 1][x - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? buf[z - 1][y][x - 1] : 0) // + - (z > 0 and y > 0 ? buf[z - 1][y - 1][x] : 0) // + + (x > 0 ? buf[z][y][x - 1] : 0) // dist=1 + + (y > 0 ? buf[z][y - 1][x] : 0) // + + (z > 0 ? 
buf[z - 1][y][x] : 0)); // + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (gix < len3.x and giy < len3.y and giz < len3.z) { + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK]; + + auto id = blockIdx.x * blockDim.x + threadIdx.x; + + if (id < len3.x) + buf[threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse + else + buf[threadIdx.x] = 0; + __syncthreads(); + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.x >= d) n = buf[threadIdx.x - d]; // like __shfl_up_sync(0x1f, var, d); warp_sync + __syncthreads(); + if (threadIdx.x >= d) buf[threadIdx.x] += n; + __syncthreads(); + } + + if (id < len3.x) { xdata[id] = buf[threadIdx.x] * ebx2; } +} + +template +__global__ void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK][BLK + 1]; + + auto giy = blockIdx.y * blockDim.y + threadIdx.y, gix = blockIdx.x * blockDim.x + threadIdx.x; + size_t id = gix + giy * stride3.y; + + if (gix < len3.x and giy < len3.y) + buf[threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse + else + buf[threadIdx.y][threadIdx.x] = 0; + __syncthreads(); + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.x >= d) n = buf[threadIdx.y][threadIdx.x - d]; + __syncthreads(); + if (threadIdx.x >= d) buf[threadIdx.y][threadIdx.x] += n; + __syncthreads(); + } + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.y >= d) n = buf[threadIdx.y - d][threadIdx.x]; + __syncthreads(); + if (threadIdx.y >= d) buf[threadIdx.y][threadIdx.x] += n; + __syncthreads(); + } + + if (gix < len3.x and giy < len3.y) { xdata[id] = buf[threadIdx.y][threadIdx.x] * ebx2; } +} + +template +__global__ void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK][BLK][BLK + 1]; + + auto giz = blockIdx.z * BLK + threadIdx.z, giy = blockIdx.y * BLK + threadIdx.y, + gix = blockIdx.x * BLK + threadIdx.x; + size_t id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer + + if (gix < len3.x and giy < len3.y and giz < len3.z) + buf[threadIdx.z][threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // id + else + buf[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + __syncthreads(); + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.x >= dist) addend = buf[threadIdx.z][threadIdx.y][threadIdx.x - dist]; + __syncthreads(); + if (threadIdx.x >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.y >= dist) addend = buf[threadIdx.z][threadIdx.y - dist][threadIdx.x]; + __syncthreads(); + if (threadIdx.y >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.z >= dist) addend = buf[threadIdx.z - dist][threadIdx.y][threadIdx.x]; + __syncthreads(); + if (threadIdx.z >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + if (gix < len3.x and giy < len3.y and giz < len3.z) { + xdata[id] = 
buf[threadIdx.z][threadIdx.y][threadIdx.x] * ebx2; + } +} + +} // namespace prototype +} // namespace __kernel +} // namespace cuda +} // namespace psz + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl new file mode 100644 index 00000000..e82013d5 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl @@ -0,0 +1,326 @@ +/** + * @file lorenzo_serial.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-13 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 +#define E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 + +#include +#include "cusz/it.hh" +#include "cusz/nd.h" + +using std::cout; +using std::endl; + +#define SETUP_1D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gidx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto check_boundary = [&]() { return gx() < len3.x; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; +#define SETUP_1D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto x) -> T& { return buf1(thread_idx.x + x + PADDING); }; +#define SETUP_1D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx) -> EQ& { return buf2(thread_idx.x + dx); }; +#define PFOR_GRID_1D() for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_1D() for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +#define SETUP_2D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ + auto gidx = [&]() -> uint32_t { return gy() * stride3.y + gx(); }; \ + auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; \ + grid_dim.y = (len3.y - 1) / BLK + 1; +#define SETUP_2D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto dx, auto dy) -> T& { \ + return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING); \ + }; +#define SETUP_2D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx, auto dy) -> EQ& { return buf2(thread_idx.x + dx, thread_idx.y + dy); }; +#define PFOR_GRID_2D() \ + for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ + for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_2D() \ + for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ + for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +#define SETUP_3D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ + auto gz = [&]() -> uint32_t { return block_idx.z * BLK + thread_idx.z; }; \ + auto gidx = [&]() -> uint32_t { return gz() * stride3.z + gy() * stride3.y + gx(); }; \ + auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y and gz() < len3.z; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; \ + grid_dim.y = (len3.y - 1) / BLK + 1; \ + grid_dim.z = (len3.z - 1) / BLK + 1; +#define SETUP_3D_DATABUF \ + constexpr auto PADDING = 1; 
\ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto dx, auto dy, auto dz) -> T& { \ + return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING, thread_idx.z + dz + PADDING); \ + }; +#define SETUP_3D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx, auto dy, auto dz) -> EQ& { \ + return buf2(thread_idx.x + dx, thread_idx.y + dy, thread_idx.z + dz); \ + }; +#define PFOR_GRID_3D() \ + for (block_idx.z = 0; block_idx.z < grid_dim.z; block_idx.z++) \ + for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ + for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_3D() \ + for (thread_idx.z = 0; thread_idx.z < BLK; thread_idx.z++) \ + for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ + for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +namespace psz { +namespace serial { +namespace __kernel { + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 256, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_1d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_1D_BASIC; + SETUP_1D_DATABUF; + SETUP_1D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0) - databuf_it(-1); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0) = 0; + } + else { + eqbuf_it(0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0); + }; + + //////////////////////////////////////// + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_process(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } + + delete _buf1; + delete _buf2; + +} + +template +void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_1D_BASIC; + SETUP_1D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum = [&]() { + if (thread_idx.x > 0) databuf_it(0) += databuf_it(-1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_partial_sum(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } + + delete _buf1; +} + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 16, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_2d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_2D_BASIC; + SETUP_2D_DATABUF; + SETUP_2D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0, 0) - (databuf_it(-1, 0) + databuf_it(0, -1) - databuf_it(-1, -1)); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0, 0) = 0; + } + else { + eqbuf_it(0, 0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0); + }; + + //////////////////////////////////////// + 
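+    // The PFOR_GRID_* / PFOR_BLOCK_* macros replay the CUDA grid/block iteration space as plain nested
+    // loops; running each threadview_* phase as its own full sweep over the grid plays the role of the
+    // __syncthreads() barriers between phases in the GPU kernels.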
PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_process(); } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } + + delete _buf1; + delete _buf2; +} + +template +void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_2D_BASIC; + SETUP_2D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum_x = [&]() { + if (thread_idx.x > 0) databuf_it(0, 0) += databuf_it(-1, 0); + }; + auto threadview_partial_sum_y = [&]() { + if (thread_idx.y > 0) databuf_it(0, 0) += databuf_it(0, -1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0, 0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } + PFOR_GRID_2D() + { + PFOR_BLOCK_2D() threadview_partial_sum_x(); + PFOR_BLOCK_2D() threadview_partial_sum_y(); + } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } + + delete _buf1; +} + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 8, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_3d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_3D_BASIC; + SETUP_3D_DATABUF; + SETUP_3D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0, 0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0, 0, 0) - + (databuf_it(-1, -1, -1) - databuf_it(0, -1, -1) - databuf_it(-1, 0, -1) - databuf_it(-1, -1, 0) + + databuf_it(0, 0, -1) + databuf_it(0, -1, 0) + databuf_it(-1, 0, 0)); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0, 0, 0) = 0; + } + else { + eqbuf_it(0, 0, 0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0, 0); + }; + + //////////////////////////////////////// + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_process(); } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } + + delete _buf1; + delete _buf2; +} + +template +void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_3D_BASIC; + SETUP_3D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0, 0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum_x = [&]() { + if (thread_idx.x > 0) databuf_it(0, 0, 0) += databuf_it(-1, 0, 0); + }; + auto threadview_partial_sum_y = [&]() { + if (thread_idx.y > 0) databuf_it(0, 0, 0) += databuf_it(0, -1, 0); + }; + auto threadview_partial_sum_z = [&]() { + if (thread_idx.z > 0) databuf_it(0, 0, 0) += databuf_it(0, 0, -1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0, 0, 0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } + PFOR_GRID_3D() + { + PFOR_BLOCK_3D() threadview_partial_sum_x(); + PFOR_BLOCK_3D() threadview_partial_sum_y(); + PFOR_BLOCK_3D() threadview_partial_sum_z(); + } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } + + delete _buf1; +} + +} // namespace __kernel +} // namespace serial +} // namespace psz + +#undef SETUP_1D +#undef 
PFOR_GRID_1D +#undef PFOR_BLOCK_1D +#undef SETUP_2D_BASIC +#undef PFOR_GRID_2D +#undef PFOR_BLOCK_2D +#undef SETUP_3D +#undef PFOR_GRID_3D +#undef PFOR_BLOCK_3D + +#endif /* E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 */ diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl new file mode 100644 index 00000000..2f58d1ad --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl @@ -0,0 +1,530 @@ +/** + * @file lorenzo_var.inl + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E2BEA52A_4D2E_4966_9135_6CE8B8E05762 +#define E2BEA52A_4D2E_4966_9135_6CE8B8E05762 + +#include + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim.z + +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +namespace cusz { +namespace experimental { + +template +__forceinline__ __device__ void +pred1d(Data thread_scope[SEQ], volatile bool* shmem_signum, volatile ErrCtrl* shmem_delta, Data from_last_stripe = 0) +{ + if CONSTEXPR (FIRST_POINT) { // i == 0 + Data delta = thread_scope[0] - from_last_stripe; + shmem_signum[0 + TIX * SEQ] = delta < 0; // signnum + shmem_delta[0 + TIX * SEQ] = static_cast(fabs(delta)); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) { + Data delta = thread_scope[i] - thread_scope[i - 1]; + shmem_signum[i + TIX * SEQ] = delta < 0; // signum + shmem_delta[i + TIX * SEQ] = static_cast(fabs(delta)); + } + __syncthreads(); + } +} + +template +__forceinline__ __device__ void load1d( + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile Data* shmem_data, + Data thread_scope[SEQ], + Data& from_last_stripe, + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } + } + __syncthreads(); + + for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; + + if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void write1d( + volatile bool* shmem_signum, + bool* signum, + unsigned int dimx, + unsigned int id_base, + volatile ErrCtrl* shmem_delta = nullptr, + ErrCtrl* delta = nullptr) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { + signum[id] = shmem_signum[TIX + i * NTHREAD]; + delta[id] = shmem_delta[TIX + i * NTHREAD]; + } + } +} + +template +__forceinline__ __device__ void load2d_prequant( + Data* data, + Data center[YSEQ + 1], + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base, + FP ebx2_r) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); + } + auto tmp = 
__shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 + if (TIY == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) +{ + /* prediction + original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; + short form: Data delta = center[i] - west[i]; + */ +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + center[i] -= center[i - 1]; + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (TIX > 0) center[i] -= west; + } + __syncthreads(); +} + +template +__forceinline__ __device__ void postquant_write2d( + Data center[YSEQ + 1], + ErrCtrl* delta, + bool* signum, + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base) +{ + /******************************************************************************** + * Depending on whether postquant is delayed in compression, deside separating + * data-type signum and uint-type quantcode when writing to DRAM (or not). + ********************************************************************************/ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + i - 1 < dimy) { + signum[gid] = center[i] < 0; // output; reuse data for signum + delta[gid] = static_cast(fabs(center[i])); + } + } +} + +template < + typename Data, + typename ErrCtrl, + typename FP, + int BLOCK, + int SEQ> +__global__ void c_lorenzo_1d1l( // + Data* data, + ErrCtrl* delta, + bool* signum, + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + Data data[BLOCK]; + ErrCtrl delta[BLOCK]; + bool signum[BLOCK]; + } shmem; + + auto id_base = BIX * BLOCK; + + Data thread_scope[SEQ]; + Data from_last_stripe{0}; + + /******************************************************************************** + * load from DRAM using striped layout, perform prequant + ********************************************************************************/ + load1d(data, len3.x, id_base, shmem.data, thread_scope, from_last_stripe, ebx2_r); + + /******************************************************************************** + * delta and signum + ********************************************************************************/ + pred1d(thread_scope, shmem.signum, shmem.delta, from_last_stripe); + pred1d(thread_scope, shmem.signum, shmem.delta); + write1d(shmem.signum, signum, len3.x, id_base, shmem.delta, delta); +} + +template +__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, // input + ErrCtrl* delta, // output + bool* signum, // output + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + Data center[YSEQ + 1] = {0}; // nw n + // w center + + auto gix = BIX * BDX + TIX; // BDX == 16 + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + // clang-format off + load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); + pred2d(center); + postquant_write2d(center, delta, signum, len3.x, len3.y, stride3.y, gix, giy_base); + // clang-format on +} + +template +__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, // input + ErrCtrl* delta, // output + bool* signum, // output + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto BLOCK = 8; + __shared__ Data shmem[8][8][32]; + + auto z = TIZ; + + auto gix = BIX * (BLOCK * 4) + TIX; + auto giy_base = BIY * 
BLOCK; + auto giz = BIZ * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + /******************************************************************************** + * load from DRAM, perform prequant + ********************************************************************************/ + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) { + if (giy_base + y < len3.y) { + shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) + } + } + } + __syncthreads(); // necessary to ensure correctness + + auto x = TIX % 8; + + for (auto y = 0; y < BLOCK; y++) { + Data delta_val; + + // prediction + delta_val = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // + - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // + + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 + + (y > 0 ? shmem[z][y - 1][TIX] : 0) // + + (z > 0 ? shmem[z - 1][y][TIX] : 0)); // + + auto id = base_id + (y * stride3.y); + + // delta and signum + if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { + signum[id] = delta_val < 0; + delta[id] = static_cast(fabs(delta_val)); + } + } + /* EOF */ +} + +template +__global__ void x_lorenzo_1d1l( // + bool* signum, + ErrCtrl* delta, + Data* xdata, + dim3 len3, + dim3 stride3, + FP ebx2) +{ + constexpr auto block_dim = BLOCK / SEQ; // dividable + + // coalesce-load (warp-striped) and transpose in shmem (similar for store) + typedef cub::BlockLoad BlockLoadT_signum; + typedef cub::BlockLoad BlockLoadT_delta; + typedef cub::BlockStore BlockStoreT_xdata; + typedef cub::BlockScan + BlockScanT_xdata; // TODO autoselect algorithm + + __shared__ union TempStorage { // overlap shared memory space + typename BlockLoadT_signum::TempStorage load_signum; + typename BlockLoadT_delta::TempStorage load_delta; + typename BlockStoreT_xdata::TempStorage store_xdata; + typename BlockScanT_xdata::TempStorage scan_xdata; + } temp_storage; + + // thread-scope tiled data + struct ThreadData { + Data xdata[SEQ]; + bool signum[SEQ]; + } thread_scope; + ErrCtrl thread_scope_delta[SEQ]; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block + ********************************************************************************/ + BlockLoadT_delta(temp_storage.load_delta).Load(delta + (BIX * BDX) * SEQ, thread_scope_delta); + __syncthreads(); // barrier for shmem reuse + BlockLoadT_signum(temp_storage.load_signum).Load(signum + (BIX * BDX) * SEQ, thread_scope.signum); + __syncthreads(); // barrier for shmem reuse + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = (BIX * BDX + TIX) * SEQ + i; + thread_scope.xdata[i] = id < len3.x // + ? (thread_scope.signum[i] ? 
-1 : 1) * static_cast(thread_scope_delta[i]) + : 0; + } + __syncthreads(); + + /******************************************************************************** + * perform partial-sum using cub::InclusiveSum + ********************************************************************************/ + BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); + __syncthreads(); // barrier for shmem reuse + + /******************************************************************************** + * scale by ebx2 and write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; + __syncthreads(); // barrier for shmem reuse + + BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); +} + +template +__global__ void +x_lorenzo_2d1l_16x16data_mapto16x2(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + Data thread_scope[YSEQ]; + /* + . ------> gix (x) + | t00 t01 t02 t03 ... t0f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + + | t10 t11 t12 t13 ... t1f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + */ + + auto gix = BIX * BLOCK + TIX; + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < len3.x and giy_base + i < len3.y) + thread_scope[i] = (signum[gid] ? -1 : 1) * static_cast(delta[gid]); // fuse + else + thread_scope[i] = 0; // TODO set as init state? 
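+        // the prediction error is stored as a sign bit (signum) plus a magnitude (delta); the sign is
+        // re-applied on load, and out-of-range threads contribute 0 so the partial sums below are unchanged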
+ } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; + // two-pass: store for cross-threadscope update + if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; + __syncthreads(); + // two-pass: load and update + if (TIY == 1) { + auto tmp = intermediate[TIX]; +#pragma unroll + for (auto& i : thread_scope) i += tmp; + } + + /******************************************************************************** + * in-warp partial-sum along x-axis + ********************************************************************************/ +#pragma unroll + for (auto& i : thread_scope) { + for (auto d = 1; d < BLOCK; d *= 2) { + Data n = __shfl_up_sync(0xffffffff, i, d, 16); + if (TIX >= d) i += n; + } + i *= ebx2; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; + } +} + +template +__global__ void +x_lorenzo_3d1l_32x8x8data_mapto32x1x8(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope[YSEQ]; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope[y] = (signum[gid] ? 
-1 : 1) * static_cast(delta[gid]); + else + thread_scope[y] = 0; + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; + + /******************************************************************************** + * ND partial-sums along x- and z-axis + * in-warp shuffle used: in order to perform, it's transposed after X-partial sum + ********************************************************************************/ + auto dist = 1; + Data addend; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + Data val = thread_scope[i]; + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + thread_scope[i] = val; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } + } + /* EOF */ +} + +} // namespace experimental +} // namespace cusz + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY +#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +#endif /* E2BEA52A_4D2E_4966_9135_6CE8B8E05762 */ diff --git a/qtensor/compression/cusz/src/kernel/detail/spline3.inl b/qtensor/compression/cusz/src/kernel/detail/spline3.inl new file mode 100644 index 00000000..5e3526bd --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/spline3.inl @@ -0,0 +1,746 @@ +/** + * @file spline3.inl + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-05-15 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_KERNEL_SPLINE3_CUH +#define CUSZ_KERNEL_SPLINE3_CUH + +#include +#include +#include +#include "utils/cuda_err.cuh" + +#define SPLINE3_COMPR true +#define SPLINE3_DECOMPR false + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim. 
+ +using DIM = unsigned int; +using STRIDE = unsigned int; +using DIM3 = dim3; +using STRIDE3 = dim3; + +constexpr int BLOCK8 = 8; +constexpr int BLOCK32 = 32; + +#define SHM_ERROR shm_errctrl + +namespace cusz { + +/******************************************************************************** + * host API + ********************************************************************************/ + +template < + typename TITER, + typename EITER, + typename FP = float, + int LINEAR_BLOCK_SIZE = 256, + bool PROBE_PRED_ERROR = false> +__global__ void c_spline3d_infprecis_32x8x8data( + TITER data, + DIM3 data_size, + STRIDE3 data_leap, + EITER errctrl, + DIM3 errctrl_size, + STRIDE3 errctrl_leap, + TITER anchor, + STRIDE3 anchor_leap, + FP eb_r, + FP ebx2, + int radius, + TITER pred_error = nullptr, + TITER compress_error = nullptr); + +template < + typename EITER, + typename TITER, + typename FP = float, + int LINEAR_BLOCK_SIZE = 256> +__global__ void x_spline3d_infprecis_32x8x8data( + EITER errctrl, // input 1 + DIM3 errctrl_size, // + STRIDE3 errctrl_leap, // + TITER anchor, // input 2 + DIM3 anchor_size, // + STRIDE3 anchor_leap, // + TITER data, // output + DIM3 data_size, // + STRIDE3 data_leap, // + FP eb_r, + FP ebx2, + int radius); + +namespace device_api { +/******************************************************************************** + * device API + ********************************************************************************/ +template < + typename T1, + typename T2, + typename FP, + int LINEAR_BLOCK_SIZE, + bool WORKFLOW = SPLINE3_COMPR, + bool PROBE_PRED_ERROR = false> +__device__ void spline3d_layout2_interpolate( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + FP eb_r, + FP ebx2, + int radius); +} // namespace device_api + +} // namespace cusz + +/******************************************************************************** + * helper function + ********************************************************************************/ + +namespace { + +template +__forceinline__ __device__ bool xyz33x9x9_predicate(unsigned int x, unsigned int y, unsigned int z) +{ + if CONSTEXPR (INCLUSIVE) { // + return x <= 32 and y <= 8 and z <= 8; + } + else { + return x < 32 and y < 8 and z < 8; + } +} + +// control block_id3 in function call +template +__device__ void +spline3d_print_block_from_GPU(T volatile a[9][9][33], int radius = 512, bool compress = true, bool print_errctrl = true) +{ + for (auto z = 0; z < ZEND; z++) { + printf("\nprint from GPU, z=%d\n", z); + printf(" "); + for (auto i = 0; i < 33; i++) printf("%3d", i); + printf("\n"); + + for (auto y = 0; y < YEND; y++) { + printf("y=%d ", y); + for (auto x = 0; x < XEND; x++) { // + if CONSTEXPR (PRINT_FP) { printf("%.2e\t", (float)a[z][y][x]); } + else { + T c = print_errctrl ? 
a[z][y][x] - radius : a[z][y][x]; + if (compress) { + if (c == 0) { printf("%3c", '.'); } + else { + if (abs(c) >= 10) { printf("%3c", '*'); } + else { + if (print_errctrl) { printf("%3d", c); } + else { + printf("%4.2f", c); + } + } + } + } + else { + if (print_errctrl) { printf("%3d", c); } + else { + printf("%4.2f", c); + } + } + } + } + printf("\n"); + } + } + printf("\nGPU print end\n\n"); +} + +template +__device__ void +c_reset_scratch_33x9x9data(volatile T1 shm_data[9][9][33], volatile T2 shm_errctrl[9][9][33], int radius) +{ + // alternatively, reinterprete cast volatile T?[][][] to 1D + for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + shm_data[z][y][x] = 0; + /***************************************************************************** + okay to use + ******************************************************************************/ + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) shm_errctrl[z][y][x] = radius; + /***************************************************************************** + alternatively + ******************************************************************************/ + // shm_errctrl[z][y][x] = radius; + } + __syncthreads(); +} + +template +__device__ void c_gather_anchor(T1* data, DIM3 data_size, STRIDE3 data_leap, T1* anchor, STRIDE3 anchor_leap) +{ + auto x = (TIX % 32) + BIX * 32; + auto y = (TIX / 32) % 8 + BIY * 8; + auto z = (TIX / 32) / 8 + BIZ * 8; + + bool pred1 = x % 8 == 0 and y % 8 == 0 and z % 8 == 0; + bool pred2 = x < data_size.x and y < data_size.y and z < data_size.z; + + if (pred1 and pred2) { + auto data_id = x + y * data_leap.y + z * data_leap.z; + auto anchor_id = (x / 8) + (y / 8) * anchor_leap.y + (z / 8) * anchor_leap.z; + anchor[anchor_id] = data[data_id]; + } + __syncthreads(); +} + +/* + * use shmem, erroneous +template +__device__ void c_gather_anchor(volatile T1 shm_data[9][9][33], T1* anchor, STRIDE3 anchor_leap) +{ + constexpr auto NUM_ITERS = 33 * 9 * 9 / LINEAR_BLOCK_SIZE + 1; // 11 iterations + for (auto i = 0; i < NUM_ITERS; i++) { + auto _tix = i * LINEAR_BLOCK_SIZE + TIX; + + if (_tix < 33 * 9 * 9) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { + auto aid = ((x / 8) + BIX * 4) + // + ((y / 8) + BIY) * anchor_leap.y + // + ((z / 8) + BIZ) * anchor_leap.z; // + anchor[aid] = shm_data[z][y][x]; + } + } + } + __syncthreads(); +} +*/ + +template +__device__ void x_reset_scratch_33x9x9data( + volatile T1 shm_xdata[9][9][33], + volatile T2 shm_errctrl[9][9][33], + T1* anchor, // + DIM3 anchor_size, // + STRIDE3 anchor_leap) +{ + for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + shm_errctrl[z][y][x] = 0; // TODO explicitly handle zero-padding + /***************************************************************************** + okay to use + ******************************************************************************/ + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { + shm_xdata[z][y][x] = 0; + + auto ax = ((x / 8) + BIX * 4); + auto ay = ((y / 8) + BIY); + auto az = ((z / 8) + BIZ); + + if (ax < anchor_size.x and ay < anchor_size.y and az < anchor_size.z) + shm_xdata[z][y][x] = anchor[ax + ay * anchor_leap.y + az * anchor_leap.z]; + } + /***************************************************************************** + alternatively + 
******************************************************************************/ + // shm_errctrl[z][y][x] = radius; + } + + __syncthreads(); +} + +template +__device__ void +global2shmem_33x9x9data(Input* data, DIM3 data_size, STRIDE3 data_leap, volatile Input shm_data[9][9][33]) +{ + constexpr auto TOTAL = 33 * 9 * 9; + + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + auto gx = (x + BIX * BLOCK32); + auto gy = (y + BIY * BLOCK8); + auto gz = (z + BIZ * BLOCK8); + auto gid = gx + gy * data_leap.y + gz * data_leap.z; + + if (gx < data_size.x and gy < data_size.y and gz < data_size.z) shm_data[z][y][x] = data[gid]; + } + __syncthreads(); +} + +template +__device__ void +shmem2global_32x8x8data(volatile Output shm_data[9][9][33], Output* data, DIM3 data_size, STRIDE3 data_leap) +{ + constexpr auto TOTAL = 32 * 8 * 8; + + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 32); + auto y = (_tix / 32) % 8; + auto z = (_tix / 32) / 8; + auto gx = (x + BIX * BLOCK32); + auto gy = (y + BIY * BLOCK8); + auto gz = (z + BIZ * BLOCK8); + auto gid = gx + gy * data_leap.y + gz * data_leap.z; + + if (gx < data_size.x and gy < data_size.y and gz < data_size.z) data[gid] = shm_data[z][y][x]; + } + __syncthreads(); +} + +template < + typename T1, + typename T2, + typename FP, + typename LAMBDAX, + typename LAMBDAY, + typename LAMBDAZ, + bool BLUE, + bool YELLOW, + bool HOLLOW, + int LINEAR_BLOCK_SIZE, + int BLOCK_DIMX, + int BLOCK_DIMY, + bool COARSEN, + int BLOCK_DIMZ, + bool BORDER_INCLUSIVE, + bool WORKFLOW> +__forceinline__ __device__ void interpolate_stage( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + LAMBDAX xmap, + LAMBDAY ymap, + LAMBDAZ zmap, + int unit, + FP eb_r, + FP ebx2, + int radius) +{ + static_assert(BLOCK_DIMX * BLOCK_DIMY * (COARSEN ? 1 : BLOCK_DIMZ) <= LINEAR_BLOCK_SIZE, "block oversized"); + static_assert((BLUE or YELLOW or HOLLOW) == true, "must be one hot"); + static_assert((BLUE and YELLOW) == false, "must be only one hot (1)"); + static_assert((BLUE and YELLOW) == false, "must be only one hot (2)"); + static_assert((YELLOW and HOLLOW) == false, "must be only one hot (3)"); + + auto run = [&](auto x, auto y, auto z) { + if (xyz33x9x9_predicate(x, y, z)) { + T1 pred = 0; + + if CONSTEXPR (BLUE) { // + pred = (shm_data[z - unit][y][x] + shm_data[z + unit][y][x]) / 2; + } + if CONSTEXPR (YELLOW) { // + pred = (shm_data[z][y][x - unit] + shm_data[z][y][x + unit]) / 2; + } + if CONSTEXPR (HOLLOW) { // + pred = (shm_data[z][y - unit][x] + shm_data[z][y + unit][x]) / 2; + } + + if CONSTEXPR (WORKFLOW == SPLINE3_COMPR) { + auto err = shm_data[z][y][x] - pred; + decltype(err) code; + // TODO unsafe, did not deal with the out-of-cap case + { + code = fabs(err) * eb_r + 1; + code = err < 0 ? 
-code : code; + code = int(code / 2) + radius; + } + shm_errctrl[z][y][x] = code; // TODO double check if unsigned type works + shm_data[z][y][x] = pred + (code - radius) * ebx2; + } + else { // TODO == DECOMPRESSS and static_assert + auto code = shm_errctrl[z][y][x]; + shm_data[z][y][x] = pred + (code - radius) * ebx2; + } + } + }; + // -------------------------------------------------------------------------------- // + + if CONSTEXPR (COARSEN) { + constexpr auto TOTAL = BLOCK_DIMX * BLOCK_DIMY * BLOCK_DIMZ; + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto itix = (_tix % BLOCK_DIMX); + auto itiy = (_tix / BLOCK_DIMX) % BLOCK_DIMY; + auto itiz = (_tix / BLOCK_DIMX) / BLOCK_DIMY; + auto x = xmap(itix, unit); + auto y = ymap(itiy, unit); + auto z = zmap(itiz, unit); + run(x, y, z); + } + } + else { + auto itix = (TIX % BLOCK_DIMX); + auto itiy = (TIX / BLOCK_DIMX) % BLOCK_DIMY; + auto itiz = (TIX / BLOCK_DIMX) / BLOCK_DIMY; + auto x = xmap(itix, unit); + auto y = ymap(itiy, unit); + auto z = zmap(itiz, unit); + run(x, y, z); + } + __syncthreads(); +} + +} // namespace + +/********************************************************************************/ + +template +__device__ void cusz::device_api::spline3d_layout2_interpolate( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + FP eb_r, + FP ebx2, + int radius) +{ + auto xblue = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2); }; + auto yblue = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; + auto zblue = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz * 2 + 1); }; + + auto xyellow = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2 + 1); }; + auto yyellow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; + auto zyellow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; + + auto xhollow = [] __device__(int _tix, int unit) -> int { return unit * (_tix); }; + auto yhollow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2 + 1); }; + auto zhollow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; + + constexpr auto COARSEN = true; + constexpr auto NO_COARSEN = false; + constexpr auto BORDER_INCLUSIVE = true; + constexpr auto BORDER_EXCLUSIVE = false; + + int unit = 4; + + // iteration 1 + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 5, 2, NO_COARSEN, 1, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 4, 2, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 9, 1, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + unit = 2; + + // iteration 2, TODO switch y-z order + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 9, 3, NO_COARSEN, 2, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), 
decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 8, 3, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 17, 2, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + unit = 1; + + // iteration 3 + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 17, 5, COARSEN, 4, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 16, 5, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + /****************************************************************************** + test only: last step inclusive + ******************************************************************************/ + // interpolate_stage< + // T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + // false, false, true, LINEAR_BLOCK_SIZE, 33, 4, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( + // shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + /****************************************************************************** + production + ******************************************************************************/ + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 32, 4, COARSEN, 8, BORDER_EXCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + /****************************************************************************** + test only: print a block + ******************************************************************************/ + // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_errctrl); } + // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_data); } +} + +/******************************************************************************** + * host API/kernel + ********************************************************************************/ + +template +__global__ void cusz::c_spline3d_infprecis_32x8x8data( + TITER data, + DIM3 data_size, + STRIDE3 data_leap, + EITER errctrl, + DIM3 errctrl_size, + STRIDE3 errctrl_leap, + TITER anchor, + STRIDE3 anchor_leap, + FP eb_r, + FP ebx2, + int radius, + TITER pred_error, + TITER compress_error) +{ + // compile time variables + using T = typename std::remove_pointer::type; + using E = typename std::remove_pointer::type; + + if CONSTEXPR (PROBE_PRED_ERROR) { + // TODO + } + else { + __shared__ struct { + T data[9][9][33]; + E errctrl[9][9][33]; + } shmem; + + c_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, radius); + global2shmem_33x9x9data(data, data_size, data_leap, shmem.data); + + // version 1, use shmem, erroneous + // c_gather_anchor(shmem.data, anchor, anchor_leap); + // version 2, use global mem, correct + c_gather_anchor(data, data_size, data_leap, anchor, anchor_leap); + + cusz::device_api::spline3d_layout2_interpolate( + shmem.data, shmem.errctrl, eb_r, ebx2, radius); + 
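// [editor's note] After spline3d_layout2_interpolate() returns on the
// compression path, shmem.errctrl holds the quantization codes and shmem.data
// has been overwritten with reconstructed values at the interpolated points, so
// every level predicts from the same values the decompressor will later see.
// A scalar sketch of the quantize/reconstruct pair used inside
// interpolate_stage(), with hypothetical names, where eb_r = 1/eb and
// ebx2 = 2*eb as set up in the launcher:
//
//   inline int spline3_quantcode(float err, float eb_r, int radius)
//   {
//       float code = fabsf(err) * eb_r + 1;     // magnitude relative to eb
//       code = err < 0 ? -code : code;          // reattach the sign
//       return int(code / 2) + radius;          // 2*eb-wide bins, centered at radius
//   }
//   inline float spline3_reconstruct(float pred, int code, float ebx2, int radius)
//   {
//       return pred + (code - radius) * ebx2;   // identical on both paths
//   }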
shmem2global_32x8x8data(shmem.errctrl, errctrl, errctrl_size, errctrl_leap); + } +} + +template < + typename EITER, + typename TITER, + typename FP, + int LINEAR_BLOCK_SIZE> +__global__ void cusz::x_spline3d_infprecis_32x8x8data( + EITER errctrl, // input 1 + DIM3 errctrl_size, // + STRIDE3 errctrl_leap, // + TITER anchor, // input 2 + DIM3 anchor_size, // + STRIDE3 anchor_leap, // + TITER data, // output + DIM3 data_size, // + STRIDE3 data_leap, // + FP eb_r, + FP ebx2, + int radius) +{ + // compile time variables + using E = typename std::remove_pointer::type; + using T = typename std::remove_pointer::type; + + __shared__ struct { + E errctrl[9][9][33]; + T data[9][9][33]; + } shmem; + + x_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, anchor, anchor_size, anchor_leap); + global2shmem_33x9x9data(errctrl, errctrl_size, errctrl_leap, shmem.errctrl); + cusz::device_api::spline3d_layout2_interpolate( + shmem.data, shmem.errctrl, eb_r, ebx2, radius); + shmem2global_32x8x8data(shmem.data, data, data_size, data_leap); +} + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY +#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +template +void launch_construct_Spline3( + T* data, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* errctrl, + dim3 const ec_len3, + double const eb, + int const radius, + float& time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto SEQ_3D = dim3(1, 8, 1); + constexpr auto BLOCK_3D = dim3(256, 1, 1); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + { + constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; + constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; + constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; + + // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); + if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); + } + + //////////////////////////////////////// + + auto ebx2 = eb * 2; + auto eb_r = 1 / eb; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); + auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + auto d = ndim(); + + if (d == 1) { // + throw std::runtime_error("Spline1 not implemented"); + } + else if (d == 2) { + throw std::runtime_error("Spline2 not implemented"); + } + else if (d == 3) { + cusz::c_spline3d_infprecis_32x8x8data // + <<>> // + (data, len3, leap3, // + errctrl, ec_len3, ec_leap3, // + anchor, an_leap3, // + eb_r, ebx2, radius); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(&time_elapsed); + + DESTROY_CUDAEVENT_PAIR; +} + +template +void launch_reconstruct_Spline3( + T* xdata, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* errctrl, + dim3 const ec_len3, + double const eb, + int const radius, + float& time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + /* + auto ndim = [&]() { + if 
(len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + */ + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto SEQ_3D = dim3(1, 8, 1); + constexpr auto BLOCK_3D = dim3(256, 1, 1); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + { + constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; + constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; + constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; + + // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); + if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); + } + + //////////////////////////////////////// + + auto ebx2 = eb * 2; + auto eb_r = 1 / eb; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); + auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + cusz::x_spline3d_infprecis_32x8x8data // + <<>> // + (errctrl, ec_len3, ec_leap3, // + anchor, an_len3, an_leap3, // + xdata, len3, leap3, // + eb_r, ebx2, radius); + + STOP_CUDAEVENT_RECORDING(stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(&time_elapsed); + DESTROY_CUDAEVENT_PAIR; +} + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/subroutine.inl b/qtensor/compression/cusz/src/kernel/detail/subroutine.inl new file mode 100644 index 00000000..15d10ade --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/subroutine.inl @@ -0,0 +1,1074 @@ +/** + * @file subroutine.inl + * @author Jiannan Tian + * @brief subroutines of kernels + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "cusz/pn.hh" +#include "pipeline/compaction_g.inl" +#include "subsub.inl" + +namespace psz { +namespace cuda { +namespace __device { + +//////// 1D + +namespace v0 { + +// compression load +template +__forceinline__ __device__ void load_prequant_1d( + T* data, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ], + T& prev, + FP ebx2_r); + +// decompression load +template +__forceinline__ __device__ void load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + int radius, + volatile T* shmem, + T private_buffer[SEQ]); + +namespace delta_only { + +template +__forceinline__ __device__ void +load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +} + +// compression and decompression store +template +__forceinline__ __device__ void write_1d( // + volatile T1* shmem_a1, + volatile T2* shmem_a2, + uint32_t dimx, + uint32_t id_base, + T1* a1, + T2* a2); + +// compression pred-quant, method 1 +template +__forceinline__ __device__ void predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev = 0); + +// compression pred-quant, method 2 +template +__forceinline__ __device__ void predict_quantize_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + volatile T* shmem_outlier, + int radius, + T prev = 0); + +namespace compaction { + +template < + typename T, + typename EQ, + int SEQ, + bool FIRST_POINT, + typename Compaction = CompactionDRAM> +__forceinline__ __device__ void predict_quantize_1d( // + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, + int radius, + uint32_t g_id_base, + Compaction 
g_outlier, + T prev = 0); + +} + +// decompression pred-quant +template +__forceinline__ __device__ void block_scan_1d( + T private_buffer[SEQ], + T ebx2, + volatile T* exchange_in, + volatile T* exchange_out, + volatile T* shmem_buffer); + +} // namespace v0 + +namespace v1_pn { + +template +__forceinline__ __device__ void +load_fuse_1d(EQ* quant, T* outlier, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +template +__forceinline__ __device__ void +predict_quantize__no_outlier_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, T prev); + +template +__forceinline__ __device__ void +predict_quantize_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, volatile T* shmem_outlier, int radius, T prev); + +namespace compaction { + +template +__forceinline__ __device__ void predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, + int radius, + uint32_t g_idx_base, + Compaction outlier, + T prev); + +} + +namespace delta_only { + +template +__forceinline__ __device__ void +load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +} + +} // namespace v1_pn + +//////// 2D + +namespace v0 { + +template +__forceinline__ __device__ void load_prequant_2d( + T* data, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + FP ebx2_r, + T center[YSEQ + 1]); + +template +__forceinline__ __device__ void predict_2d(T center[YSEQ + 1]); + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + EQ* quant, + T* outlier); + +namespace delta_only { + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + EQ* quant); + +} + +namespace compaction { + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier); + +}; + +// decompression load +template +__forceinline__ __device__ void load_fuse_2d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + T private_buffer[YSEQ]); + +namespace delta_only { +// decompression load +template +__forceinline__ __device__ void load_2d( + EQ* quant, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + T private_buffer[YSEQ]); + +} // namespace delta_only + +template +__forceinline__ __device__ void block_scan_2d( // + T thread_private[YSEQ], + volatile T* intermediate, + FP ebx2); + +template +__forceinline__ __device__ void decomp_write_2d( + T thread_private[YSEQ], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + T* xdata); + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { +template +__forceinline__ __device__ void quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +); + +} + +template +__forceinline__ __device__ void load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T 
thread_private[YSEQ] + // clang-format on +); + +namespace delta_only { + +template +__forceinline__ __device__ void load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +); + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + EQ* quant); + +} // namespace delta_only + +} // namespace v1_pn + +//////// 3D + +namespace v0 { + +// TODO move subroutines for 3D here + +} + +} // namespace __device +} // namespace cuda +} // namespace psz + +//////////////////////////////////////////////////////////////////////////////// + +//////// 1D + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_1d( + T* data, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ], + T& prev, // TODO use pointer? + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < dimx) shmem[threadIdx.x + i * NTHREAD] = round(data[id] * ebx2_r); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + if (threadIdx.x > 0) prev = shmem[threadIdx.x * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + int radius, + volatile T* shmem, + T private_buffer[SEQ]) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = outlier[id] + static_cast(quant[id]) - radius; + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = outlier[id] + PN::decode(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_1d( + EQ* quant, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = static_cast(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_1d( + EQ* quant, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + 
local_id; + if (id < dimx) shmem[local_id] = PN::decode(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template // TODO remove NO_OUTLIER, use nullable +__forceinline__ __device__ void psz::cuda::__device::v0::write_1d( + volatile T1* shmem_a1, + volatile T2* shmem_a2, + uint32_t dimx, + uint32_t id_base, + T1* a1, + T2* a2) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < dimx) { + if (NO_OUTLIER) { // + a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; + } + else { + a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; + a2[id] = shmem_a2[threadIdx.x + i * NTHREAD]; + } + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + shmem_quant[idx + threadIdx.x * SEQ] = static_cast(cur - prev); + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize_1d( + T private_buffer[SEQ], + volatile EQ* shmem_quant, + volatile T* shmem_outlier, + int radius, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + + // otherwise, need to reset shared memory (to 0) + shmem_quant[idx + threadIdx.x * SEQ] = quantizable * static_cast(candidate); + shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * candidate; + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::compaction::predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, // put x-related + int radius, + uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` + Compaction outlier, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + + auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` + + // though quantizable, need to set non-quantizable position as 0 + s_quant[inblock_idx] = quantizable * static_cast(candidate); + + // very small chance running into this block + if (not quantizable) { + auto g_idx = inblock_idx + g_idx_base; + if (g_idx < dimx) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.val[cur_idx] = candidate; + outlier.idx[cur_idx] = g_idx; + } + } + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(thp_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); + __syncthreads(); // TODO move __syncthreads() outside this subroutine? 
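// [editor's note] This compaction variant stores only the rare non-quantizable
// deltas, appending (index, value) pairs through an atomicAdd'ed counter instead
// of keeping a dense outlier array. A minimal struct shape consistent with how
// `outlier` is used here (the actual type is CompactionDRAM from
// pipeline/compaction_g.inl; the names below are illustrative only):
//
//   template <typename T>
//   struct CompactionSketch {
//       T*        val;    // outlier values
//       uint32_t* idx;    // their global indices
//       uint32_t* count;  // device-side counter bumped with atomicAdd
//   };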
+ } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, // put x-related + int radius, + uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` + Compaction outlier, + T prev) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + UI UI_delta = PN::encode(static_cast(delta)); + + auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` + + // though quantizable, need to set non-quantizable position as 0 + s_quant[inblock_idx] = quantizable * UI_delta; + + // very small chance running into this block + if (not quantizable) { + auto g_idx = inblock_idx + g_idx_base; + if (g_idx < dimx) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.val[cur_idx] = delta; + outlier.idx[cur_idx] = g_idx; + } + } + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(thp_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); + __syncthreads(); // TODO move __syncthreads() outside this subroutine? + } +} + +// decompression pred-quant +template +__forceinline__ __device__ void psz::cuda::__device::v0::block_scan_1d( + T private_buffer[SEQ], + T ebx2, + volatile T* exchange_in, + volatile T* exchange_out, + volatile T* shmem_buffer) +{ + namespace wave32 = psz::cuda::__device::wave32; + wave32::intrawarp_inclusivescan_1d(private_buffer); + wave32::intrablock_exclusivescan_1d(private_buffer, exchange_in, exchange_out); + + // put back to shmem +#pragma unroll + for (auto i = 0; i < SEQ; i++) shmem_buffer[threadIdx.x * SEQ + i] = private_buffer[i] * ebx2; + __syncthreads(); +} + +// v1_pn: quantization code uses PN::encode +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + UI UI_delta = PN::encode(static_cast(cur - prev)); + shmem_quant[idx + threadIdx.x * SEQ] = UI_delta; + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +// template +// __forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize_1d( +// T private_buffer[SEQ], +// volatile EQ* shmem_quant, +// volatile T* shmem_outlier, +// int radius, +// T prev) +// { +// constexpr auto BYTEWIDTH = sizeof(EQ); +// using UI = EQ; +// using I = typename psz::typing::Int::T; + +// auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { +// T delta = cur - prev; +// bool quantizable = fabs(delta) < radius; +// UI UI_delta = PN::encode(static_cast(delta)); + +// // otherwise, need to reset shared memory (to 0) +// shmem_quant[idx + threadIdx.x * SEQ] = quantizable * UI_delta; +// shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * delta; +// }; + +// if (FIRST_POINT) { // i == 0 +// quantize_1d(private_buffer[0], prev, 0); +// } +// else { +// #pragma unroll +// for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); +// 
__syncthreads(); +// } +// } + +//////////////////////////////////////////////////////////////////////////////// + +//////// 2D + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_2d( + // clang-format off + T* data, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + FP ebx2_r, + T center[YSEQ + 1] + // clang-format on +) +{ + auto g_id = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + + // use a warp as two half-warps + // block_dim = (16, 2, 1) makes a full warp internally + +#pragma unroll + for (auto iy = 0; iy < YSEQ; iy++) { + if (gix < dimx and giy_base + iy < dimy) center[iy + 1] = round(data[g_id(iy)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16, 32); // same-warp, next-16 + if (threadIdx.y == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_2d(T center[YSEQ + 1]) +{ + /* + Lorenzo 2D (1-layer) illustration + NW N NE + notation W C E "->" to predict + -------- SW S SE + + normal data layout | considering register file + col(k-1) col(k) | thread(k-1) thread(k) + | + r(i-1) -west[i-1] +center[i-1] | -center(k-1)[i-1] +center(k)[i-1] + r(i ) +west[i] ->center[i] | +center(k-1)[i] ->center(k)[i] + + calculation + ----------- + delta = center[i] - (center[i-1] + west[i] - west[i-1]) + = (center[i] - center[i-1]) - (west[i] - west[i-1]) + + With center[i] -= center[i-1] and west[i] -= west[i-1], + delta = center[i] - west[i] + + For thread(k), + delta(k) = center(k)[i] - center(k-1)[i] + = center(k)[i] - SHFL_UP(center(k)[i], 1, HALF_WARP) + */ + +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + // with center[i-1] intact in this iteration + center[i] -= center[i - 1]; + // within a halfwarp (32/2) + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (threadIdx.x > 0) center[i] -= west; // delta + } + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + T* outlier + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + T candidate = delta[i] + radius; + + // outlier array is not in sparse form in this version + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + EQ* quant + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = static_cast(delta[i]); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + EQ* quant + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename 
psz::typing::Int::T; + + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = PN::encode(static_cast(delta[i])); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::compaction::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + T candidate = delta[i] + radius; + + // The non-quantizable is recorded as "0" (radius). + quant[gid] = quantizable * static_cast(candidate); + + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = candidate; + } + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + UI UI_delta = PN::encode(static_cast(delta[i])); + + // The non-quantizable is recorded as "0" (radius). + quant[gid] = quantizable * UI_delta; + + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = delta[i]; + } + } + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse + else + thread_private[i] = 0; // TODO set as init state? 
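// [editor's note] Each thread owns a YSEQ-long column here; the fuse above
// undoes the centered quantization (quant stores delta + radius) and folds the
// dense outlier value back in during the same pass, i.e. per element
//
//   //   delta = outlier[gid] + (T)quant[gid] - radius;
//
// block_scan_2d() then prefix-sums these deltas along y and x and scales by
// ebx2 to recover the data.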
+ } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = outlier[gid] + PN::decode(quant[gid]); // fuse + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = static_cast(quant[gid]); + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = PN::decode(quant[gid]); + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// partial-sum along y-axis, sequantially +// then, in-warp partial-sum along x-axis +template +__forceinline__ __device__ void +psz::cuda::__device::v0::block_scan_2d(T thread_private[YSEQ], volatile T* intermediate, FP ebx2) +{ + // ------> gix (x) + // + // | t(0,0) t(0,1) t(0,2) t(0,3) ... t(0,f) + // | + // | thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] + // giy thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] + // (y) | | | | + // thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] + // + // | t(1,0) t(1,1) t(1,2) t(1,3) ... 
t(1,f) + // | + // | thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] + // giy thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] + // (y) | | | | + // thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] + + constexpr auto BLOCK = 16; + + for (auto i = 1; i < YSEQ; i++) thread_private[i] += thread_private[i - 1]; + // two-pass: store for cross-thread-private update + // TODO shuffle up by 16 in the same warp + if (threadIdx.y == 0) intermediate[threadIdx.x] = thread_private[YSEQ - 1]; + __syncthreads(); + // broadcast the partial-sum result from a previous segment + if (threadIdx.y == 1) { + auto tmp = intermediate[threadIdx.x]; +#pragma unroll + for (auto i = 0; i < YSEQ; i++) thread_private[i] += tmp; // regression as pointer + } + // implicit sync as there is half-warp divergence + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + for (auto d = 1; d < BLOCK; d *= 2) { + T n = __shfl_up_sync(0xffffffff, thread_private[i], d, 16); // half-warp shuffle + if (threadIdx.x >= d) thread_private[i] += n; + } + thread_private[i] *= ebx2; // scale accordingly + } +} + +// write to DRAM +template +__forceinline__ __device__ void psz::cuda::__device::v0::decomp_write_2d( + // clang-format off + T thread_private[YSEQ], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T* xdata + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < dimx and (giy_base + i) < dimy) xdata[gid] = thread_private[i]; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +//////// 3D diff --git a/qtensor/compression/cusz/src/kernel/detail/subsub.inl b/qtensor/compression/cusz/src/kernel/detail/subsub.inl new file mode 100644 index 00000000..e8da624f --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/detail/subsub.inl @@ -0,0 +1,92 @@ +/** + * @file subsub.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-26 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +namespace psz { +namespace cuda { +namespace __device { + +namespace wave32 { +template +__forceinline__ __device__ void intrawarp_inclusivescan_1d( // + T private_buffer[SEQ]); + +template +__forceinline__ __device__ void intrablock_exclusivescan_1d( // + T private_buffer[SEQ], + volatile T* exchange_in, + volatile T* exchange_out); +} // namespace wave32 + +} // namespace __device +} // namespace cuda +} // namespace psz + +template +__forceinline__ __device__ void psz::cuda::__device::wave32::intrawarp_inclusivescan_1d(T private_buffer[SEQ]) +{ + for (auto i = 1; i < SEQ; i++) private_buffer[i] += private_buffer[i - 1]; + T addend = private_buffer[SEQ - 1]; + + // in-warp shuffle + for (auto d = 1; d < 32; d *= 2) { + T n = __shfl_up_sync(0xffffffff, addend, d, 32); + if (threadIdx.x % 32 >= d) addend += n; + } + // exclusive scan + T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); + + // propagate + if (threadIdx.x % 32 > 0) + for (auto i = 0; i < SEQ; i++) private_buffer[i] += prev_addend; +} + +template +__forceinline__ __device__ void psz::cuda::__device::wave32::intrablock_exclusivescan_1d( + T private_buffer[SEQ], + volatile T* exchange_in, + volatile T* exchange_out) +{ + constexpr auto NWARP = NTHREAD / 32; + static_assert(NWARP <= 32, "too big"); + + auto warp_id = threadIdx.x / 32; + auto lane_id = threadIdx.x % 32; + + if (lane_id == 31) exchange_in[warp_id] = private_buffer[SEQ - 1]; 
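// [editor's note] Two-level block scan: when called after
// intrawarp_inclusivescan_1d (as block_scan_1d does), lane 31's last element is
// the warp total, so exchange_in ends up holding one total per warp; those
// totals are exclusive-scanned across warps below and added back per warp.
// For example, warp totals {4, 7, 2, 5} give exchange_out = {0, 4, 11, 13}
// on the serial NWARP <= 8 path.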
+ __syncthreads(); + + if (NWARP <= 8) { + if (threadIdx.x == 0) { + exchange_out[0] = 0; + for (auto i = 1; i < NWARP; i++) exchange_out[i] = exchange_out[i - 1] + exchange_in[i - 1]; + } + } + else if (NWARP <= 32) { + if (threadIdx.x <= 32) { + auto addend = exchange_in[threadIdx.x]; + + for (auto d = 1; d < 32; d *= 2) { + T n = __shfl_up_sync(0xffffffff, addend, d, 32); + if (threadIdx.x >= d) addend += n; + } + // exclusive scan + T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); + exchange_out[warp_id] = (warp_id > 0) * prev_addend; + } + } + // else-case handled by static_assert + __syncthreads(); + + // propagate + auto addend = exchange_out[warp_id]; + for (auto i = 0; i < SEQ; i++) private_buffer[i] += addend; + __syncthreads(); +}; diff --git a/qtensor/compression/cusz/src/kernel/lorenzo.cu b/qtensor/compression/cusz/src/kernel/lorenzo.cu new file mode 100644 index 00000000..ff46e548 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/lorenzo.cu @@ -0,0 +1,209 @@ +/** + * @file lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-01 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.hh" + +// #include "detail/lorenzo.inl" +#include "detail/lorenzo23.inl" + +template +cusz_error_status compress_predict_lorenzo_i( + T* const data, + dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + T* const outlier, + uint32_t* outlier_idx, + uint32_t* num_outliers, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + // constexpr auto BLOCK_3D = dim3(32, 1, 8); // for v0 + constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl + auto GRID_3D = divide3(len3, SUBLEN_3D); + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + //::cusz::c_lorenzo_1d1l + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + + psz::cuda::__kernel::v0::c_lorenzo_1d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 2) { + //::cusz::c_lorenzo_2d1l_16x16data_mapto16x2 + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + psz::cuda::__kernel::v0::c_lorenzo_2d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 3) { + //::cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + psz::cuda::__kernel::v0::c_lorenzo_3d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + + STOP_CUDAEVENT_RECORDING(stream); + 
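// [editor's note] The launch shapes above follow "cells per block / per-thread
// sequentiality = threads per block": 1-D tiles 256 cells onto 256/4 = 64
// threads (SEQ_1D == 4), 2-D tiles 16x16 onto a 16x2 block (8 cells per thread
// along y), and 3-D tiles 32x8x8 onto 32x8 threads. divide3() is a per-axis
// ceiling division; e.g. for a hypothetical len3 = (1000, 1, 1),
//
//   //   GRID_1D.x == (1000 - 1) / 256 + 1 == 4 blocks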
CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status decompress_predict_lorenzo_i( + EQ* eq, + dim3 const len3, + T* outlier, + uint32_t* outlier_idx, + uint32_t const num_outliers, + double const eb, + int const radius, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 8; // x-sequentiality == 8 + constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + auto d = ndim(); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + //::cusz::x_lorenzo_1d1l + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_1d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 2) { + //::cusz::x_lorenzo_2d1l_16x16data_mapto16x2 + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_2d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 3) { + //::cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_3d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(T, EQ) \ + template cusz_error_status compress_predict_lorenzo_i( \ + T* const data, dim3 const len3, double const eb, int const radius, EQ* const eq, T* const outlier, \ + uint32_t* outlier_idx, uint32_t* num_outliers, float* time_elapsed, cudaStream_t stream); \ + \ + template cusz_error_status decompress_predict_lorenzo_i( \ + EQ * eq, dim3 const len3, T* outlier, uint32_t* outlier_idx, uint32_t const num_outliers, double const eb, \ + int const radius, T* xdata, float* time_elapsed, cudaStream_t stream); + +// before 2023 +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint8_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint16_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint8_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint16_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, float); + +// 2023 +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, int32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, int32_t); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu 
b/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu new file mode 100644 index 00000000..061aebb4 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu @@ -0,0 +1,176 @@ +/** + * @file claunch_cuda_proto.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.h" +#include "kernel/lorenzo_all.hh" + +#include "detail/lorenzo_proto.inl" + +template +cusz_error_status compress_predict_lorenzo_iproto( + T* const data, + dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + T* outlier, + uint32_t* outlier_idx, + uint32_t* num_outliers, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto BLOCK_1D = dim3(256, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 16, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(8, 8, 8); + constexpr auto BLOCK_3D = dim3(8, 8, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + using namespace psz::cuda::__kernel::prototype; + + if (ndim() == 1) { + c_lorenzo_1d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (ndim() == 2) { + c_lorenzo_2d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (ndim() == 3) { + c_lorenzo_3d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else { + throw std::runtime_error("Lorenzo only works for 123-D."); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status decompress_predict_lorenzo_iproto( + EQ* eq, + dim3 const len3, + T* outlier, + uint32_t* outlier_idx, + uint32_t const num_outliers, + double const eb, + int const radius, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto BLOCK_1D = dim3(256, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 16, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(8, 8, 8); + constexpr auto BLOCK_3D = dim3(8, 8, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + using namespace psz::cuda::__kernel::prototype; + + if (ndim() == 1) { + x_lorenzo_1d1l<<>>(eq, outlier, 
len3, leap3, radius, ebx2, xdata); + } + else if (ndim() == 2) { + x_lorenzo_2d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (ndim() == 3) { + x_lorenzo_3d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ + template cusz_error_status compress_predict_lorenzo_iproto( \ + T* const, dim3 const, double const, int const, EQ* const, T* const, uint32_t*, uint32_t*, float*, \ + cudaStream_t); \ + \ + template cusz_error_status decompress_predict_lorenzo_iproto( \ + EQ*, dim3 const, T*, uint32_t*, uint32_t const, double const, int const, T*, float*, cudaStream_t); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc b/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc new file mode 100644 index 00000000..0ef5b9f5 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc @@ -0,0 +1,118 @@ +/** + * @file lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-16 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "detail/lorenzo_serial.inl" +#include "cusz/type.h" + +template > +cusz_error_status serial_compress_predict_lorenzo_i( + T* const data, + psz_dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + OUTLIER* outlier, + float* time_elapsed) +{ + auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { + return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; + + if (d == 1) { + psz::serial::__kernel::c_lorenzo_1d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 2) { + psz::serial::__kernel::c_lorenzo_2d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 3) { + psz::serial::__kernel::c_lorenzo_3d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + + return CUSZ_SUCCESS; +} + +template +cusz_error_status serial_decompress_predict_lorenzo_i( + EQ* eq, + psz_dim3 const len3, + T* outlier, + double const eb, + int const radius, + T* xdata, + float* time_elapsed) +{ + auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { + return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + // error 
bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; + + auto d = ndim(); + + if (d == 1) { + psz::serial::__kernel::x_lorenzo_1d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 2) { + psz::serial::__kernel::x_lorenzo_2d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 3) { + psz::serial::__kernel::x_lorenzo_3d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ + template cusz_error_status serial_compress_predict_lorenzo_i( \ + T* const, psz_dim3 const, double const, int const, EQ* const, psz_outlier_serial*, float*); \ + \ + template cusz_error_status serial_decompress_predict_lorenzo_i( \ + EQ*, psz_dim3 const, T*, double const, int const, T*, float*); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_var.cu b/qtensor/compression/cusz/src/kernel/lorenzo_var.cu new file mode 100644 index 00000000..12773d35 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/lorenzo_var.cu @@ -0,0 +1,206 @@ +/** + * @file lorenzo_var.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-27 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.h" +#include "kernel/lorenzo_all.hh" + +#include "detail/lorenzo_var.inl" + +template +cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( + T* data, + dim3 const len3, + double const eb, + DeltaT* delta, + bool* signum, + float* time_elapsed, + cudaStream_t stream) +{ + auto pardeg3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = pardeg3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = pardeg3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = pardeg3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (ndim() == 1) { + cusz::experimental::c_lorenzo_1d1l // + <<>> // + (data, delta, signum, len3, leap3, 
ebx2_r); + } + else if (ndim() == 2) { + cusz::experimental::c_lorenzo_2d1l_16x16data_mapto16x2 // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else if (ndim() == 3) { + cusz::experimental::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else { + throw std::runtime_error("Lorenzo only works for 123-D."); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( + DeltaT* delta, + bool* signum, + dim3 const len3, + double const eb, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto pardeg3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + // constexpr auto SEQ_1D = 8; // x-sequentiality == 8 + constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); + auto GRID_1D = pardeg3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = pardeg3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = pardeg3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (ndim() == 1) { + cusz::experimental::x_lorenzo_1d1l // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + else if (ndim() == 2) { + cusz::experimental::x_lorenzo_2d1l_16x16data_mapto16x2 // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + else { + cusz::experimental::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( \ + T*, dim3 const, double const, E*, bool*, float*, cudaStream_t); \ + \ + template cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( \ + E*, bool*, dim3 const, double const, T*, float*, cudaStream_t); \ + \ + cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ + cudaStream_t stream) \ + { \ + asz::experimental::compress_predict_lorenzo_ivar( \ + data, len3, eb, delta, signum, time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } \ + \ + cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream) \ + { \ + asz::experimental::decompress_predict_lorenzo_ivar( \ + delta, signum, len3, eb, xdata, time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + 
} + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/preprocess.cuh b/qtensor/compression/cusz/src/kernel/preprocess.cuh new file mode 100644 index 00000000..f082c193 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/preprocess.cuh @@ -0,0 +1,65 @@ +/** + * @file preprocess.cuh + * @author Jiannan Tian + * @brief Filters for preprocessing of cuSZ. + * @version 0.3 + * @date 2020-09-20 + * (created) 2020-05-03 (rev) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_PREPROCESS_CUH +#define CUSZ_KERNEL_PREPROCESS_CUH + +#include + +#include "common.hh" + +using std::cout; +using std::endl; + +namespace cusz { + +#include + +template +__global__ void log_transform() +{ + static_assert(std::is_floating_point::value, "[log_transform] must be floating-point type."); +} + +template +__global__ void binning2d(Data* input, Data* output, size_t d0, size_t d1, size_t new_d0, size_t new_d1) +{ + auto y = threadIdx.y; + auto x = threadIdx.x; + auto yid = blockIdx.y * blockDim.y + y; + auto xid = blockIdx.x * blockDim.x + x; + + __shared__ Data s[tBLK][tBLK]; + + if (yid >= new_d1 or xid >= new_d0) return; + + int xblk = (xid + 1) * DOWNSCALE_FACTOR >= d0 ? d0 - xid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; + int yblk = (yid + 1) * DOWNSCALE_FACTOR >= d1 ? 
d1 - yid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; + s[y][x] = 0; + + for (int j = 0; j < yblk; j++) + for (int i = 0; i < xblk; i++) + s[y][x] += input[(yid * DOWNSCALE_FACTOR + j) * d0 + (xid * DOWNSCALE_FACTOR + i)]; + + output[yid * new_d0 + xid] = s[y][x] / static_cast(yblk * xblk); +} +} // namespace cusz + +template __global__ void cusz::binning2d(float*, float*, size_t, size_t, size_t, size_t); +template __global__ void cusz::binning2d(double*, double*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I1*, I1*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I2*, I2*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I4*, I4*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I8*, I8*, size_t, size_t, size_t, size_t); + +#endif diff --git a/qtensor/compression/cusz/src/kernel/rle.cuh b/qtensor/compression/cusz/src/kernel/rle.cuh new file mode 100644 index 00000000..f8fe36ed --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/rle.cuh @@ -0,0 +1,74 @@ +// modified from thrust example +// attach the license below when push to master branch +// https://github.com/NVIDIA/thrust/blob/main/LICENSE + +/** + * @file rle.cuh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-04-01 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef KERNEL_RLE_CUH +#define KERNEL_RLE_CUH + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using const_gen = thrust::constant_iterator; +using counter = thrust::counting_iterator; + +namespace kernel { + +template +void RunLengthEncoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, size_t& num_runs) +{ + thrust::device_ptr input = thrust::device_pointer_cast(d_fullfmt_data); + thrust::device_ptr output = thrust::device_pointer_cast(d_compact_data); + thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); + // compute the output size (run lengths) + num_runs = thrust::reduce_by_key( + input, input + N, // input::key (symbol) + const_gen(1), // input::value (count) + output, // output::key (symbol) + lengths) // output::value (count) + .first - + output; +} + +template +void RunLengthDecoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, const size_t num_runs) +{ + thrust::device_ptr output = thrust::device_pointer_cast(d_fullfmt_data); + thrust::device_ptr input = thrust::device_pointer_cast(d_compact_data); + thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); + + // scan the lengths + thrust::inclusive_scan(lengths, lengths + num_runs, lengths); + + // compute input index for each output element + thrust::device_vector indices(N); + thrust::lower_bound( + lengths, lengths + N, // + counter(1), counter(N + 1), // + indices.begin()); + + thrust::encode(indices.begin(), indices.end(), input, output); +} + +} // namespace kernel + +#endif diff --git a/qtensor/compression/cusz/src/kernel/spv_gpu.cu b/qtensor/compression/cusz/src/kernel/spv_gpu.cu new file mode 100644 index 00000000..29bcee1c --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/spv_gpu.cu @@ -0,0 +1,60 @@ +/** + * @file spv_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/spv_gpu.inl" +#include "kernel/spv_gpu.h" +#include "kernel/spv_gpu.hh" + +#define 
SPV(Tliteral, Mliteral, T, M) \ + void spv_gather_T##Tliteral##_M##Mliteral( \ + T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ + { \ + psz::detail::spv_gather(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ + } \ + \ + void spv_scatter_T##Tliteral##_M##Mliteral( \ + T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ + { \ + psz::detail::spv_scatter(d_val, d_idx, nnz, decoded, milliseconds, stream); \ + } + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV + +#define SPV(Tliteral, Mliteral, T, M) \ + template <> \ + void psz::spv_gather( \ + T * in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ + { \ + spv_gather_T##Tliteral##_M##Mliteral(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ + } \ + \ + template <> \ + void psz::spv_scatter( \ + T * d_val, uint32_t * d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ + { \ + spv_scatter_T##Tliteral##_M##Mliteral(d_val, d_idx, nnz, decoded, milliseconds, stream); \ + } + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV diff --git a/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu b/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu new file mode 100644 index 00000000..b7263613 --- /dev/null +++ b/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu @@ -0,0 +1,118 @@ +/** + * @file v2_lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.hh" +#include "kernel/v2_lorenzo.hh" + +template +cusz_error_status v2_compress_predict_lorenzo_i( + T* const data, + dim3 const len3, + double const eb, + int const radius, + E* const errctrl, + dim3 const placeholder_2, + T* const anchor, + dim3 const placeholder_1, + CompactionDRAM outlier, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl + auto GRID_3D = divide3(len3, SUBLEN_3D); + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + else 
if (d == 2) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + else if (d == 3) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template cusz_error_status v2_compress_predict_lorenzo_i( \ + T* const, dim3 const, double const, int const, E* const, dim3 const, T* const, dim3 const, \ + struct CompactionDRAM, float*, cudaStream_t); \ + \ + // cusz_error_status v2_compress_predict_lorenzo_i_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + // T* const data, dim3 const len3, T* const anchor, dim3 const placeholder_1, E* const errctrl, \ + // dim3 const placeholder_2, T* outlier, double const eb, int const radius, float* time_elapsed, \ + // cudaStream_t stream) \ + // { \ + // return v2_compress_predict_lorenzo_i( \ + // data, len3, eb, radius, errctrl, placeholder_2, anchor, placeholder_1, outlier, nullptr, nullptr, \ + // time_elapsed, stream); \ + // } + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor.cc b/qtensor/compression/cusz/src/pipeline/v2_compressor.cc new file mode 100644 index 00000000..a9449447 --- /dev/null +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor.cc @@ -0,0 +1,112 @@ +/** + * @file v2_compressor.cc + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-29 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "pipeline/v2_compressor.hh" +#include "common/configs.hh" +#include "framework.hh" + +namespace psz { + +template +v2_Compressor::~v2_Compressor() +{ + pimpl.reset(); +} + +template +v2_Compressor::v2_Compressor() : pimpl{std::make_unique()} +{ +} + +template +v2_Compressor::v2_Compressor(const v2_Compressor& old) : pimpl{std::make_unique(*old.pimpl)} +{ +} + +template +v2_Compressor& v2_Compressor::operator=(const v2_Compressor& old) +{ + *pimpl = *old.pimpl; + return *this; +} + +template +v2_Compressor::v2_Compressor(v2_Compressor&&) = default; + +template +v2_Compressor& v2_Compressor::operator=(v2_Compressor&&) = default; + +//------------------------------------------------------------------------------ + +template +void v2_Compressor::init(Context* config) +{ + pimpl->init(config); +} + +template +void v2_Compressor::init(v2_header* config) +{ + pimpl->init(config); +} + +template +void v2_Compressor::compress( + Context* config, + v2_Compressor::T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); +} + 
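+// Editorial note: v2_Compressor is a pimpl facade; every public method forwards
+// to the CUDA implementation in v2_compressor_impl.inl, keeping this translation
+// unit free of device code. A minimal usage sketch, assuming the instantiation
+// at the bottom of this file is cusz::Framework<float> and that `ctx`, `header`,
+// the device pointers, and `stream` are prepared elsewhere (these names are
+// illustrative, not part of this file):
+//
+//   psz::v2_Compressor<cusz::Framework<float>> cor;
+//   cor.init(ctx);                                      // allocate internal buffers
+//   uint8_t* archive{nullptr}; size_t archive_len{0};
+//   cor.compress(ctx, d_data, archive, archive_len, stream, /*dbg_print=*/false);
+//   cor.export_header(header);
+//   cor.decompress(&header, archive, d_xdata, stream, /*dbg_print=*/false);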
+template +void v2_Compressor::decompress( + v2_header* config, + BYTE* compressed, + v2_Compressor::T* decompressed, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->decompress(config, compressed, decompressed, stream, dbg_print); +} + +// template +// void v2_Compressor::clear_buffer() +// { +// pimpl->clear_buffer(); +// } + +// getter + +template +void v2_Compressor::export_header(v2_header& header) +{ + pimpl->export_header(header); +} + +template +void v2_Compressor::export_header(v2_header* header) +{ + pimpl->export_header(header); +} + +// template +// void v2_Compressor::export_timerecord(TimeRecord* ext_timerecord) +// { +// pimpl->export_timerecord(ext_timerecord); +// } + +} // namespace psz + +template class psz::v2_Compressor>; \ No newline at end of file diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu new file mode 100644 index 00000000..0fcc6ebc --- /dev/null +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu @@ -0,0 +1,15 @@ +/** + * @file v2_compressor_impl.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "framework.hh" +#include "v2_compressor_impl.inl" + +template class psz::v2_Compressor>::impl; \ No newline at end of file diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl new file mode 100644 index 00000000..0dd96f91 --- /dev/null +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl @@ -0,0 +1,239 @@ +/** + * @file v2_compressor_impl.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D +#define F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D + +#include + +#include "component.hh" +#include "header.h" +#include "pipeline/v2_compressor.hh" +// #include "kernel/cpplaunch_cuda.hh" +#include "kernel/v2_lorenzo.hh" +#include "stat/stat_g.hh" +#include "utils/cuda_err.cuh" + +#include "../detail/spv_gpu.inl" +#include "../kernel/detail/lorenzo23.inl" + +#define TEMPLATE_TYPE template +#define IMPL v2_Compressor::impl + +#define ARCHIVE(VAR, FIELD) \ + if (segments[v2_header::FIELD] != 0 and VAR != nullptr) { \ + auto dst = var_archive() + header.entry[v2_header::FIELD]; \ + auto src = reinterpret_cast(VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, segments[v2_header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +#define ACCESS_VAR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[v2_header::SYM]) + +namespace psz { + +TEMPLATE_TYPE +IMPL::impl() +{ + codec = new Codec; + // TODO re-enable fallback codec + // fb_codec = new FallbackCodec; +} + +TEMPLATE_TYPE +void IMPL::destroy() +{ + if (codec) delete codec; + // if (fb_codec) delete codec; + + // also deallocate buffer +} + +TEMPLATE_TYPE +void IMPL::init(Context* config) { __init(config); } + +TEMPLATE_TYPE +void IMPL::init(v2_header* config) { __init(config); } + +TEMPLATE_TYPE +template +void IMPL::__init(ContextOrHeader* c) +{ + static_assert( + std::is_same::value or // + std::is_same::value, + "[v2_Compressor::impl::init] not a valid comrpessor config type."); + + auto len = c->x * c->y * c->z; + // TODO allocate anchor + + // allocate eq + cudaMalloc(&d_errctrl, len * sizeof(EQ)); // to overlap with one of vle/hf buffers + + // allocate outlier + 
outlier.allocate(len / sp_factor, true); + + // allocate vle/hf + codec->init(len, c->radius * 2, c->vle_pardeg); + // TODO disable fallback codec for now +} + +TEMPLATE_TYPE +void IMPL::compress( + Context* c, + T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + auto const eb = c->eb; + auto const radius = c->radius; + auto const pardeg = c->vle_pardeg; + + if (dbg_print) { + printf("[dbg] eb: %lf\n", eb); + printf("[dbg] radius: %d\n", radius); + printf("[dbg] pardeg: %d\n", pardeg); + // printf("[dbg] codecs_in_use: %d\n", codecs_in_use); + printf("[dbg] sp_factor: %d\n", sp_factor); + } + + data_len3 = dim3(c->x, c->y, c->z); + data_len = c->x * c->y * c->z; + + header.sp.factor = sp_factor; + + BYTE* d_codec_out{nullptr}; + size_t codec_outlen{0}; + + // size_t sublen; + auto booklen = radius * 2; + + /******************************************************************************/ + + // TODO version clarification + // with compaction + v2_compress_predict_lorenzo_i( + uncompressed, data_len3, eb, radius, d_errctrl, dim3(1, 1, 1), d_anchor, dim3(1, 1, 1), outlier, + &comp_time.construct, stream); + + outlier.make_count_host_accessible(stream); + + asz::stat::histogram(d_errctrl, data_len, d_freq, booklen, &comp_time.hist, stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // TODO overlapping memory + codec->encode(d_errctrl, data_len, d_codec_out, codec_outlen, stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // update header + { + header.x = c->x, header.y = c->y, header.z = c->z, header.w = 1; + header.sp.count = outlier.access_count_on_host(); + // TODO the new + { + // header.config.radius = radius, header.config.eb = eb; + // header.hf.pardeg = pardeg; + } + + // the compat + { + header.radius = radius, header.eb = eb; + header.vle_pardeg = pardeg; + } + + // header.byte_vle = 4; // regardless of fallback codec + }; + + size_t segments[v2_header::END] = {0}; + + // gather archive + { + // calculate offsets + segments[v2_header::HEADER] = sizeof(v2_header); + segments[v2_header::ANCHOR] = 0; // placeholder + segments[v2_header::SP_IDX] = outlier.access_count_on_host() * sizeof(IDX); + segments[v2_header::SP_VAL] = outlier.access_count_on_host() * sizeof(T); + segments[v2_header::HF] = codec_outlen; + + header.entry[0] = 0; + for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] = segments[i - 1]; } + for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // memcpy + ARCHIVE(d_anchor, ANCHOR); + ARCHIVE(outlier.idx, SP_IDX); + ARCHIVE(outlier.val, SP_VAL); + ARCHIVE(d_codec_out, HF); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + // output + compressed_len = header.entry[v2_header::END]; + compressed = var_archive(); + + // collect_compress_timerecord(); +} + +TEMPLATE_TYPE +void IMPL::decompress(v2_header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) +{ + // TODO host having copy of header when compressing + if (not header) { + header = new v2_header; + CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(v2_header), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + data_len3 = dim3(header->x, header->y, header->z); + + // use_fallback_codec = header->byte_vle == 8; + // auto const vle_pardeg = header->hf.pardeg; + + // The inputs of components are from `compressed`. 
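+    // Editorial note: compress() lays the archive out as
+    //   [ HEADER | ANCHOR | SP_IDX | SP_VAL | HF ]
+    // with header->entry[] holding the running (exclusive prefix-sum) byte
+    // offsets of these segments, so ACCESS_VAR(SYM, TYPE) resolves to
+    //   reinterpret_cast<TYPE*>(in_compressed + header->entry[v2_header::SYM]).
+    // Below, the sparse outliers (SP_IDX/SP_VAL) are scattered into the output
+    // buffer first, the Huffman stream (HF) is decoded back into d_errctrl, and
+    // the Lorenzo predictor then reconstructs xdata in place.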
+ // auto d_anchor = ACCESS_VAR(ANCHOR, T); + auto d_vle = ACCESS_VAR(HF, BYTE); + auto d_spidx = ACCESS_VAR(SP_IDX, IDX); + auto d_spval = ACCESS_VAR(SP_VAL, T); + + // wire and aliasing + auto d_outlier = out_decompressed; + auto d_xdata = out_decompressed; + + psz::detail::spv_scatter(d_spval, d_spidx, header->sp.count, d_outlier, &decomp_time.scatter, stream); + + codec->decode(d_vle, d_errctrl); + + decompress_predict_lorenzo_i( + d_errctrl, data_len3, // + d_outlier, // + nullptr, 0, // TODO remove + header->eb, header->radius, + d_xdata, // output + &decomp_time.reconstruct, stream); + + // collect_decompress_timerecord(); + + // clear state for the next decompression after reporting + // use_fallback_codec = false; +} + +} // namespace psz + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif /* F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D */ diff --git a/qtensor/compression/cusz/src/stat/cmpg1_1.cu b/qtensor/compression/cusz/src/stat/cmpg1_1.cu new file mode 100644 index 00000000..a32a02eb --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_1.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 1) + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui8, uint8_t) + +#undef THRUSTGPU_DESCRIPTION diff --git a/qtensor/compression/cusz/src/stat/cmpg1_2.cu b/qtensor/compression/cusz/src/stat/cmpg1_2.cu new file mode 100644 index 00000000..b85c6477 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_2.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1_2.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui16, uint16_t) + +#undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_3.cu b/qtensor/compression/cusz/src/stat/cmpg1_3.cu new file mode 100644 index 00000000..a68f760c --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_3.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1_3.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template 
<> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui32, uint32_t) + +#undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_4.cu b/qtensor/compression/cusz/src/stat/cmpg1_4.cu new file mode 100644 index 00000000..47dcc774 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_4.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1_4.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(fp32, float) + +#undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_5.cu b/qtensor/compression/cusz/src/stat/cmpg1_5.cu new file mode 100644 index 00000000..5828860d --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg1_5.cu @@ -0,0 +1,30 @@ +/** + * @file cmpg1_5.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(fp64, double) + +#undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg2.cu b/qtensor/compression/cusz/src/stat/cmpg2.cu new file mode 100644 index 00000000..a8bdcd29 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg2.cu @@ -0,0 +1,34 @@ +/** + * @file cmp2g.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 2) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_COMPARE_LOSSLESS(Tliteral, T) \ + bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ + { \ + return psz::detail::thrustgpu_identical(d1, d2, len); \ + } \ + \ + template <> \ + bool psz::thrustgpu_identical(T * d1, T * d2, size_t const len) \ + { \ + return thrustgpu_identical_T##Tliteral(d1, d2, len); \ + } + +THRUSTGPU_COMPARE_LOSSLESS(fp32, float) +THRUSTGPU_COMPARE_LOSSLESS(fp64, double) +THRUSTGPU_COMPARE_LOSSLESS(ui8, uint8_t) +THRUSTGPU_COMPARE_LOSSLESS(ui16, uint16_t) +THRUSTGPU_COMPARE_LOSSLESS(ui32, uint32_t) + +#undef THRUSTGPU_COMPARE_LOSSLESS diff --git a/qtensor/compression/cusz/src/stat/cmpg3.cu b/qtensor/compression/cusz/src/stat/cmpg3.cu new file mode 100644 index 00000000..61f71f13 --- /dev/null +++ 
b/qtensor/compression/cusz/src/stat/cmpg3.cu @@ -0,0 +1,32 @@ +/** + * @file cmp3g.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 3) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_COMPARE_LOSSY(Tliteral, T) \ + bool thrustgpu_error_bounded_T##Tliteral( \ + T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ + { \ + return psz::detail::thrustgpu_error_bounded(a, b, len, eb, first_faulty_idx); \ + } \ + \ + template <> \ + bool psz::thrustgpu_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ + { \ + return thrustgpu_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ + } + +THRUSTGPU_COMPARE_LOSSY(fp32, float); +THRUSTGPU_COMPARE_LOSSY(fp64, double); + +#undef THRUSTGPU_COMPARE_LOSSY diff --git a/qtensor/compression/cusz/src/stat/cmpg4_1.cu b/qtensor/compression/cusz/src/stat/cmpg4_1.cu new file mode 100644 index 00000000..34d74884 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg4_1.cu @@ -0,0 +1,24 @@ +/** + * @file cmpg4_1.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 4) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp32, float); + +#undef THRUSTGPU_ASSESS diff --git a/qtensor/compression/cusz/src/stat/cmpg4_2.cu b/qtensor/compression/cusz/src/stat/cmpg4_2.cu new file mode 100644 index 00000000..73dcde1f --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg4_2.cu @@ -0,0 +1,25 @@ +/** + * @file cmpg4_2.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + template <> \ + void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp32, float); + +#undef THRUSTGPU_ASSESS diff --git a/qtensor/compression/cusz/src/stat/cmpg4_3.cu b/qtensor/compression/cusz/src/stat/cmpg4_3.cu new file mode 100644 index 00000000..bbca7c6c --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg4_3.cu @@ -0,0 +1,24 @@ +/** + * @file cmpg4_3.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp64, double); + +#undef THRUSTGPU_ASSESS \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg4_4.cu b/qtensor/compression/cusz/src/stat/cmpg4_4.cu new file mode 
100644 index 00000000..d60b8b97 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/cmpg4_4.cu @@ -0,0 +1,25 @@ +/** + * @file cmpg4_4.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + template <> \ + void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp64, double); + +#undef THRUSTGPU_ASSESS \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/compare_cpu.cc b/qtensor/compression/cusz/src/stat/compare_cpu.cc new file mode 100644 index 00000000..8a22dbe3 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/compare_cpu.cc @@ -0,0 +1,43 @@ +/** + * @file _compare.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_cpu.inl" +#include "stat/compare.h" + +#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ + bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ + { \ + return psz::detail::cppstd_identical(d1, d2, len); \ + } + +#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ + bool cppstd_error_bounded_T##Tliteral( \ + T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ + { \ + return psz::detail::cppstd_error_bounded(a, b, len, eb, first_faulty_idx); \ + } \ + \ + void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::cppstd_assess_quality(s, xdata, odata, len); \ + } + +CPPSTD_COMPARE_LOSSLESS(fp32, float) +CPPSTD_COMPARE_LOSSLESS(fp64, double) +CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) +CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) +CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) + +CPPSTD_COMPARE_LOSSY(fp32, float) +CPPSTD_COMPARE_LOSSY(fp64, double) + +#undef CPPSTD_COMPARE_LOSSLESS +#undef CPPSTD_COMPARE_LOSSY diff --git a/qtensor/compression/cusz/src/stat/stat.cc b/qtensor/compression/cusz/src/stat/stat.cc new file mode 100644 index 00000000..e69de29b diff --git a/qtensor/compression/cusz/src/stat/stat_g.cu b/qtensor/compression/cusz/src/stat/stat_g.cu new file mode 100644 index 00000000..c3c18c12 --- /dev/null +++ b/qtensor/compression/cusz/src/stat/stat_g.cu @@ -0,0 +1,96 @@ +/** + * @file stat_g.cu + * @author Cody Rivera, Jiannan Tian + * @brief Fast histogramming from [Gómez-Luna et al. 
2013], wrapper + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../kernel/detail/hist.inl" + +#include "cusz/type.h" +#include "stat/stat.h" +#include "stat/stat_g.hh" + +template +cusz_error_status asz::stat::histogram( + T* in_data, + size_t const in_len, + uint32_t* out_freq, + int const num_buckets, + float* milliseconds, + cudaStream_t stream) +{ + int device_id, max_bytes, num_SMs; + int items_per_thread, r_per_block, grid_dim, block_dim, shmem_use; + + cudaGetDevice(&device_id); + cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, device_id); + + auto query_maxbytes = [&]() { + int max_bytes_opt_in; + cudaDeviceGetAttribute(&max_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id); + + // account for opt-in extra shared memory on certain architectures + cudaDeviceGetAttribute(&max_bytes_opt_in, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + max_bytes = std::max(max_bytes, max_bytes_opt_in); + + // config kernel attribute + cudaFuncSetAttribute( + kernel::p2013Histogram, cudaFuncAttributeMaxDynamicSharedMemorySize, max_bytes); + }; + + auto optimize_launch = [&]() { + items_per_thread = 1; + r_per_block = (max_bytes / sizeof(int)) / (num_buckets + 1); + grid_dim = num_SMs; + // fits to size + block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; + while (block_dim > 1024) { + if (r_per_block <= 1) { block_dim = 1024; } + else { + r_per_block /= 2; + grid_dim *= 2; + block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; + } + } + shmem_use = ((num_buckets + 1) * r_per_block) * sizeof(int); + }; + + query_maxbytes(); + optimize_launch(); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + kernel::p2013Histogram<<>> // + (in_data, out_freq, in_len, num_buckets, r_per_block); + + STOP_CUDAEVENT_RECORDING(stream); + + cudaStreamSynchronize(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define INIT_HIST_AND_C(Tname, T) \ + template cusz_error_status asz::stat::histogram(T*, size_t const, uint32_t*, int const, float*, cudaStream_t); \ + \ + cusz_error_status histogram_T##Tname( \ + T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ + cudaStream_t stream) \ + { \ + return asz::stat::histogram(in_data, in_len, out_freq, num_buckets, milliseconds, stream); \ + } + +INIT_HIST_AND_C(ui8, uint8_t) +INIT_HIST_AND_C(ui16, uint16_t) +INIT_HIST_AND_C(ui32, uint32_t) +INIT_HIST_AND_C(ui64, uint64_t) + +#undef INIT_HIST_AND_C \ No newline at end of file diff --git a/qtensor/compression/cusz/src/utils/dbg_print.cuh b/qtensor/compression/cusz/src/utils/dbg_print.cuh new file mode 100644 index 00000000..2c2b5580 --- /dev/null +++ b/qtensor/compression/cusz/src/utils/dbg_print.cuh @@ -0,0 +1,132 @@ +#ifndef UTILS_DBG_PRINT_CUH +#define UTILS_DBG_PRINT_CUH + +/** + * @file dbg_print.cuh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2020-09-20 + * Created on 2020-03-17 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +template +__global__ void print_deflated(Q* coded, size_t gid) +{ + if (blockIdx.x * blockDim.x + threadIdx.x != gid) return; + printf("print after deflating\n"); + // for_each(coded, coded + PART_SIZE, [](Q& i) { print_by_type(i, '_', '\n'); }); + for (size_t i = 0; i < PART_SIZE; i++) { 
print_by_type(*(coded + i), '_', '\n'); } + printf("\n"); +} + +template +__global__ void print_histogram(T* freq, size_t size, size_t radius = 20) +{ + const int DICT_SIZE = size; /* Dynamic sizing */ + if (blockIdx.x * blockDim.x + threadIdx.x == 0) { + for (size_t i = DICT_SIZE / 2 - radius; i < DICT_SIZE / 2 + radius; i++) { + if (i % 10 == 0) printf("\n"); + printf("%4lu: %-12lu", i, static_cast(freq[i])); + } + printf("\n"); + } +} + +template +__device__ __host__ void print_by_type(T num, char sep = '_', char ending = '\n') +{ + for (size_t j = 0; j < sizeof(T) * CHAR_BIT; j++) { + printf("%u", (num >> ((sizeof(T) * CHAR_BIT - 1) - j)) & 0x01u); + if (j != 0 and j != sizeof(T) * CHAR_BIT - 1 and j % 8 == 7) printf("%c", sep); + } + printf("%c", ending); +} + +// MSB to LSB +template +__device__ __host__ void print_code_only(T num, size_t bitwidth, char sep = '_', char ending = '\n') +{ + for (size_t j = 0; j < bitwidth; j++) { + printf("%u", (num >> ((bitwidth - 1) - j)) & 0x01u); + if (j != 0 and j != bitwidth - 1 and j % 8 == 7) printf("%c", sep); + } + printf("%c", ending); +} + +template +__device__ __host__ void snippet_print_bitset_full(T num) +{ + print_by_type(num, '_', '\t'); + size_t bitwidth = *((uint8_t*)&num + sizeof(T) - 1); + // size_t code_bitwidth = ((static_cast(0xffu) << (sizeof(T) * 8 - 8)) & num) >> (sizeof(T) * 8 - 8); + printf("len: %3lu\tcode: ", bitwidth); + print_code_only(num, bitwidth, '\0', '\n'); +} + +template +__global__ void print_codebook(T* codebook, size_t len) +{ + if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; + printf("--------------------------------------------------------------------------------\n"); + printf("printing codebook\n"); + printf("--------------------------------------------------------------------------------\n"); + __shared__ T buffer; + for (size_t i = 0; i < len; i++) { + buffer = codebook[i]; + if (buffer == ~((T)0x0)) continue; + printf("%5lu\t", i); + snippet_print_bitset_full(buffer); + } + printf("--------------------------------------------------------------------------------\n"); + printf("done printing codebook\n"); + printf("--------------------------------------------------------------------------------\n"); +} + +template +__global__ void get_entropy(T* freq) +{ +} + +// TODO real GPU version +template +__global__ void get_theoretical_dense_Huffman_coded_length(T* codebook, Q* freq, size_t codebook_len) +{ +} + +// template +//__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len=200) { +// if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; +// printf("print Huffman coded before it is deflated\n"); +// for (size_t i = 0; i < 200; i++) { +// if (coded[i] == ~((T)0x0)) continue; +// printf("%5lu\t", i); +// snippet_print_bitset_full(coded[i]); +// } +// printf("\n"); +//} + +template +__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len) +{ + if (blockIdx.x != 0) return; + size_t gid = blockDim.x * blockIdx.x + threadIdx.x; + if (coded[gid] == ~((T)0x0)) return; + printf("%5lu\t", gid); + snippet_print_bitset_full(coded[gid]); + + // if (coded[i] == ~((T)0x0)) continue; + // printf("print Huffman coded before it is deflated\n"); + // for (size_t i = 0; i < 200; i++) { + // if (coded[i] == ~((T)0x0)) continue; + // printf("%5lu\t", i); + // snippet_print_bitset_full(coded[i]); + // } + // printf("\n"); +} + +#endif \ No newline at end of file diff --git a/qtensor/compression/cusz/src/utils/print_gpu.cu b/qtensor/compression/cusz/src/utils/print_gpu.cu new file 
mode 100644 index 00000000..2d2b195f --- /dev/null +++ b/qtensor/compression/cusz/src/utils/print_gpu.cu @@ -0,0 +1,121 @@ +/** + * @file print_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-23 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +// #include "../detail/print_gpu.inl" +#include +#include +#include +#include "utils/print_gpu.h" +#include "utils/print_gpu.hh" + +#define PRINT_INT_LESS_THAN_64(Tliteral, T) \ + void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ + { \ + thrust::for_each( \ + thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%d\t", (int32_t)i); }); \ + printf("\n"); \ + } + +PRINT_INT_LESS_THAN_64(i8, int8_t) +PRINT_INT_LESS_THAN_64(i16, int16_t) +PRINT_INT_LESS_THAN_64(i32, int32_t) + +void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const int64_t i) { printf("%ld\t", i); }); + printf("\n"); +} + +#define PRINT_UINT_LESS_THAN_64(Tliteral, T) \ + void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ + { \ + thrust::for_each( \ + thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%u\t", (uint32_t)i); }); \ + printf("\n"); \ + } + +PRINT_UINT_LESS_THAN_64(ui8, uint8_t) +PRINT_UINT_LESS_THAN_64(ui16, uint16_t) +PRINT_UINT_LESS_THAN_64(ui32, uint32_t) + +void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const uint64_t i) { printf("%lu\t", i); }); + printf("\n"); +} + +void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const float i) { printf("%.7f\t", i); }); + printf("\n"); +} + +void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const double i) { printf("%.7lf\t", i); }); + printf("\n"); +} + +template +void psz::peek_device_data(T* d_arr, size_t num, size_t offset) +{ + if (std::is_same::value) { // + peek_device_data_Ti8((int8_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti16((int16_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti32((int32_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti64((int64_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui8((uint8_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui16((uint16_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui32((uint32_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui64((uint64_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tfp32((float*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tfp64((double*)d_arr, num, offset); + } + else { + std::runtime_error("peek_device_data cannot accept this type."); + } +} + +#define CPP_PEEK(Tliteral, T) template void psz::peek_device_data(T * d_arr, size_t num, size_t offset); + +CPP_PEEK(i8, int8_t); +CPP_PEEK(i16, int16_t); +CPP_PEEK(i32, int32_t); +CPP_PEEK(i64, int64_t); +CPP_PEEK(ui8, uint8_t); +CPP_PEEK(ui16, uint16_t); +CPP_PEEK(ui32, uint32_t); +CPP_PEEK(ui64, uint64_t); +CPP_PEEK(fp32, float); +CPP_PEEK(fp64, double); + +#undef CPP_PEEK + +#undef 
PRINT_INT_LESS_THAN_64 +#undef PRINT_UINT_LESS_THAN_64 diff --git a/qtensor/compression/cusz/src/utils/timer_cpu.cc b/qtensor/compression/cusz/src/utils/timer_cpu.cc new file mode 100644 index 00000000..2422f6f2 --- /dev/null +++ b/qtensor/compression/cusz/src/utils/timer_cpu.cc @@ -0,0 +1,30 @@ +/** + * @file timer_cpu.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "utils/timer.h" + +#include +#include + +using hires = std::chrono::high_resolution_clock; +using duration_t = std::chrono::duration; +using hires_clock_t = std::chrono::time_point; + +struct asz_timer { + hires_clock_t start, stop; +}; + +// cpu timer specific +asz_timer* asz_cputimer_create() { return new asz_timer; } +void asz_cputimer_destroy(asz_timer* t) { delete t; } +void asz_cputimer_start(asz_timer* t) { t->start = hires::now(); } +void asz_cputimer_end(asz_timer* t) { t->stop = hires::now(); } +double asz_cputime_elapsed(asz_timer* t) { return static_cast((t->stop) - (t->start)).count(); } diff --git a/qtensor/compression/cusz/src/utils/timer_gpu.cu b/qtensor/compression/cusz/src/utils/timer_gpu.cu new file mode 100644 index 00000000..247c80f8 --- /dev/null +++ b/qtensor/compression/cusz/src/utils/timer_gpu.cu @@ -0,0 +1,82 @@ +/** + * @file timer_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "utils/timer.h" + +typedef struct asz_cudatimer { + cudaEvent_t a, b; + float milliseconds; + cudaStream_t stream; + + asz_cudatimer() { create(); } + asz_cudatimer(cudaStream_t stream) + { + create(); + this->stream = stream; + } + + void create() + { + cudaEventCreate(&a); + cudaEventCreate(&b); + } + + void destroy() + { + cudaEventDestroy(a); + cudaEventDestroy(b); + } + + // stream not involved + void start() { cudaEventRecord(a); } + + void stop() + { + cudaEventRecord(b); + cudaEventSynchronize(b); + } + + // stream involved + void stream_start() + { + cudaEventRecord(a, stream); // set event as not occurred + } + + void stream_stop() + { + cudaEventRecord(b, stream); + cudaEventSynchronize(b); // block host until `stream` meets `stop` + } + + // get time + float time_elapsed() + { + cudaEventElapsedTime(&milliseconds, a, b); + std::cout << "milliseconds: " << milliseconds << std::endl; + return milliseconds; + } +} asz_cudatimer; + +// cuda timer specific +asz_cudatimer* asz_cudatimer_create() { return new asz_cudatimer{}; } +void asz_cudatimer_destroy(asz_cudatimer* t) { t->destroy(); } +void asz_cudatimer_start(asz_cudatimer* t) { t->start(); } +void asz_cudatimer_end(asz_cudatimer* t) { t->stop(); } +double asz_cudatime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } + +// cuda streamtimer specific +asz_cudatimer* asz_cudastreamtimer_create(void* stream) { return new asz_cudatimer((cudaStream_t)stream); } +void asz_cudastreamtimer_destroy(asz_cudatimer* t) { t->destroy(); } +void asz_cudastreamtimer_start(asz_cudatimer* t) { t->stream_start(); } +void asz_cudastreamtimer_end(asz_cudatimer* t) { t->stream_stop(); } +double asz_cudastreamtime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } diff --git a/qtensor/compression/cusz/src/utils/vis_stat.hh b/qtensor/compression/cusz/src/utils/vis_stat.hh new file mode 100644 index 00000000..ff27695f --- /dev/null +++ b/qtensor/compression/cusz/src/utils/vis_stat.hh @@ -0,0 +1,137 @@ 
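The CPU and CUDA timer wrappers above (`timer_cpu.cc`, `timer_gpu.cu`) expose a small create/start/end/elapsed C-style API. A minimal usage sketch, assuming the functions are declared in `utils/timer.h` (which both files include) and that the `*_elapsed` helpers report seconds:

```C++
#include <cstdio>
#include <cuda_runtime.h>
#include "utils/timer.h"   // assumed to declare the asz_* timer API defined above

int main()
{
    // CPU timer around a trivial host loop.
    asz_timer* ct = asz_cputimer_create();
    asz_cputimer_start(ct);
    double acc = 0.0;
    for (int i = 0; i < (1 << 20); i++) acc += i * 0.5;
    asz_cputimer_end(ct);
    printf("host loop: %f s (acc=%f)\n", asz_cputime_elapsed(ct), acc);
    asz_cputimer_destroy(ct);

    // Stream timer around an asynchronous memset on a user-created stream.
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    void* buf;
    cudaMalloc(&buf, 1 << 26);
    asz_cudatimer* gt = asz_cudastreamtimer_create(stream);
    asz_cudastreamtimer_start(gt);
    cudaMemsetAsync(buf, 0, 1 << 26, stream);
    asz_cudastreamtimer_end(gt);          // records the stop event and waits for it
    printf("memset: %f s\n", asz_cudastreamtime_elapsed(gt));
    asz_cudastreamtimer_destroy(gt);
    cudaFree(buf);
    cudaStreamDestroy(stream);
    return 0;
}
```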
+#ifndef UTILS_VIS_STAT_HH +#define UTILS_VIS_STAT_HH + +/** + * @file vis_stat.hh + * @author Jiannan Tian + * @brief Analysis and visualization of datum. + * @version 0.1 + * @date 2020-09-20 + * Created on 2020-02-09 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cerr; +using std::cout; +using std::endl; +using std::tuple; + +template +double GetEntropy(T* code, size_t l, size_t cap = 1024) +{ + if (cap == 0) { + cerr << "wrong cap" << endl; + exit(-1); + } + auto arr = new size_t[cap](); + for (size_t i = 0; i < l; i++) arr[code[i]]++; + std::vector raw(arr, arr + cap); + std::vector frequencies; + std::copy_if(raw.begin(), raw.end(), std::back_inserter(frequencies), [](double& e) { return e != 0; }); + double entropy = 0; + for (auto freq : frequencies) { entropy += -(freq * 1.0 / l) * log2(freq * 1.0 / l); } + + // cout << "entropy:\t" << entropy << endl; + delete[] arr; + return entropy; +} + +// TODO automatically omit bins that are less than 1% +template +void VisualizeHistogram( + const std::string& tag, + T* _d_POD, + size_t l, + size_t _bins = 16, + bool log_freq = false, + double override_min = 0, + double override_max = 0, + bool eliminate_zeros = false, + bool use_scientific_notation = true) +{ + std::vector _d(_d_POD, _d_POD + l); + std::vector _d_nonzero; + // std::vector arr; + // arr.reserve(_bins); + // for (size_t i = 0; i< _bins; i++) arr.push_back(0); + auto arr = new size_t[_bins](); + + if (eliminate_zeros) { + std::copy_if(_d.begin(), _d.end(), std::back_inserter(_d_nonzero), [](int i) { return i != 0; }); + } + double Min = *std::min_element(_d.begin(), _d.end()); + double Max = *std::max_element(_d.begin(), _d.end()); + // double sum = std::accumulate(_d.begin(), _d.end(), 0); + double rng = Max - Min; + // double avg = sum / l; + + cout << "\e[7m[[" << tag << "]]\e[0m"; + if (override_max > override_min) { + cout << "zoom into " << override_min << "--" << override_max << endl; + std::tie(Max, Min, rng) = std::make_tuple(override_max, override_min, override_max - override_min); + } + double step = rng / _bins; + for (size_t i = 0; i < l; i++) arr[static_cast((_d[i] - Min) / step)]++; + std::vector _viz(arr, arr + _bins); + // std::vector _viz(arr); + + // visualization + printf("\tbins:\t%zu\tbin_width:\t%lf\n", _bins, step); + // printf("count:\t%zu\tmin:\t%lf\tmax:\t%lf\trng:\t%lf\n", l, Min, Max, rng); + cout << "count:\t" << l << "\t"; + cout << "min:\t" << Min << "\t"; + cout << "max:\t" << Max << "\t"; + cout << "rng:\t" << rng << endl; + + if (log_freq) { + cout << "using log_freq" << endl; + std::for_each(_viz.begin(), _viz.end(), [](size_t& n) { n = log2(n); }); + } + + size_t longest = *std::max_element(_viz.begin(), _viz.end()); + size_t bar_str_len = 64; // scale according to the longest + std::for_each(_viz.begin(), _viz.end(), [&](size_t& n) { + n = static_cast(n / static_cast(longest) * bar_str_len); + }); + + for (size_t i = 0; i < _bins; i++) { + // normalize to width + cout << "|" + << "\33[43m"; + + for (size_t j = 0; j < bar_str_len + 1; j++) { + if (j < _viz[i]) + cout << "-"; + else if (j == _viz[i]) + cout << "\33[0m" + << "+"; + else + cout << " "; + } + cout.precision(2); + cout << " "; + if (use_scientific_notation) cout << std::scientific; + cout << Min + i * step << " -- " << Min + (i + 1) * step; + cout << " "; + cout << 
std::setw((int)log10(l) + 2); + cout << arr[i]; + cout << " "; + cout << std::defaultfloat << std::setw(5) << arr[i] / static_cast(l) * 100 << "%" << endl; + } + cout << endl; + // delete[] arr; +} + +#endif diff --git a/qtensor/compression/cuszp/cuSZp/CMakeLists.txt b/qtensor/compression/cuszp/cuSZp/CMakeLists.txt new file mode 100644 index 00000000..d6b24117 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/CMakeLists.txt @@ -0,0 +1,79 @@ +# Specify the minimum version of CMake required to build the project +cmake_minimum_required(VERSION 3.21) + +project(cuSZp + VERSION 0.0.2 + DESCRIPTION "Error-bounded GPU lossy compression library" + ) +set(namespace "cuSZp") +enable_language(CXX) +enable_language(CUDA) + +find_package(CUDAToolkit REQUIRED) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +#set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -debug -Wall -diag-disable=10441") +#set(CMAKE_CXX_FLAGS_RELEASE "-diag-disable=10441 -g -ftz -fma -O2 -fp-model precise -prec-div -Wall") + +#set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -ftz=true -G -allow-unsupported-compiler") +#set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -allow-unsupported-compiler") + +set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) +set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) +set(CMAKE_CUDA_STANDARD "17") +set(CMAKE_CXX_STANDARD_REQUIRED ON) +#set(CMAKE_CUDA_FLAGS_INIT "-std=c++17 -allow-unsupported-compiler") +set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 75) +set(CUDA_PROPAGATE_HOST_FLAGS ON) +set(CUDA_LIBRARY CUDA::cudart) + +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY VALUE Release) +endif() + +add_library(${PROJECT_NAME} STATIC) + +target_sources(${PROJECT_NAME} + PRIVATE + src/cuSZp_f32.cu + src/cuSZp_f64.cu + src/cuSZp_utility.cu + src/cuSZp_timer.cu + src/cuSZp_entry_f32.cu + src/cuSZp_entry_f64.cu + ) + +target_include_directories(${PROJECT_NAME} + PRIVATE + # where the library itself will look for its internal headers + ${CMAKE_CURRENT_SOURCE_DIR}/src + PUBLIC + # where top-level project will look for the library's public headers + $ + # where external projects will look for the library's public headers + $ + ) + +#target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + +target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart) + +set(public_headers + include/cuSZp_f32.h + include/cuSZp_f64.h + include/cuSZp_utility.h + include/cuSZp_timer.h + include/cuSZp_entry_f32.h + include/cuSZp_entry_f64.h + ) + +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +include(Installing) + +option(CUSZP_BUILD_EXAMPLES "Option to enable building example programs" ON) +if (CUSZP_BUILD_EXAMPLES) + add_subdirectory(examples) +endif () \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/Config.cmake.in b/qtensor/compression/cuszp/cuSZp/Config.cmake.in new file mode 100644 index 00000000..8c9ad12a --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/Config.cmake.in @@ -0,0 +1,5 @@ +@PACKAGE_INIT@ + +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") + +check_required_components(@PROJECT_NAME@) diff --git a/qtensor/compression/cuszp/cuSZp/LICENSE b/qtensor/compression/cuszp/cuSZp/LICENSE new file mode 100644 index 00000000..786f3f5e --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/LICENSE @@ -0,0 +1,30 @@ +Copyright © 2023, UChicago Argonne and University of Iowa + +All Rights Reserved + +Software 
Name: cuSZp: An Ultra-fast GPU Error-bounded Lossy Compressor with Optimized End-to-End Performance + +By: Argonne National Laboratory, University of Iowa + +OPEN SOURCE LICENSE + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +****************************************************************************************************** + DISCLAIMER + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************************************** + +Contact: SZ Team (szlossycompressor@gmail.com) \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/README.md b/qtensor/compression/cuszp/cuSZp/README.md new file mode 100644 index 00000000..4f9f090d --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/README.md @@ -0,0 +1,106 @@ +# cuSZp + + +cuSZp is a user-friendly error-bounded lossy compression tool specifically designed for the compression of single- and double-precision floating-point data using NVIDIA GPUs. +This tool fuses all compression or decompression computations into one single kernel, achieving ultra fast end-to-end throughput. +Specifically, the cuSZp framework is structured around four pivotal stages: Quantization and Prediction, Fixed-length Encoding, Global Synchronization, and Block Bit-shuffling. +Noting that ongoing optimization efforts are being devoted to cuSZp, aimed at further improving its end-to-end performance. + +- Developer: Yafan Huang +- Contributors: Sheng Di, Xiaodong Yu, Guanpeng Li, and Franck Cappello + +## Environment Requirements +- Linux OS with NVIDIA GPUs +- Git >= 2.15 +- CMake >= 3.21 +- Cuda Toolkit >= 11.0 +- GCC >= 7.3.0 + +## Compile and Run cuSZp Prepared Executable Binary +You can compile and install cuSZp with following commands: +```shell +$ git clone https://github.com/szcompressor/cuSZp.git +$ cd cuSZp +$ mkdir build && cd build +$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../install/ .. +$ make -j +$ make install +``` +After compilation, you will see a list of executable binaries ```cuSZp/install/bin/```: +- ```cuSZp_cpu_f32_api```: single-precision, host pointers (i.e. on CPU). 
+- ```cuSZp_gpu_f32_api```: single-precision, device pointers (i.e. on GPU).
+- ```cuSZp_cpu_f64_api```: double-precision, host pointers (i.e. on CPU).
+- ```cuSZp_gpu_f64_api```: double-precision, device pointers (i.e. on GPU).
+
+To use those binaries, try the following commands.
+Here we use the RTM pressure_2000 dataset (1.4 GB, 1008x1008x352) for the single-precision example and the NWChem acd-tst.bin.d64 dataset (6.0 GB) for the double-precision example.
+```shell
+# Example for single-precision API
+# ./cuSZp_gpu_f32_api TARGET_HPC_DATASET ERROR_MODE ERROR_BOUND
+# ABS or REL
+$ ./cuSZp_gpu_f32_api ./pressure_2000 REL 1e-4
+cuSZp finished!
+cuSZp compression end-to-end speed: 151.564649 GB/s
+cuSZp decompression end-to-end speed: 232.503219 GB/s
+cuSZp compression ratio: 13.003452
+
+Pass error check!
+$
+# Example for double-precision API
+# ./cuSZp_gpu_f64_api TARGET_HPC_DATASET ERROR_MODE ERROR_BOUND
+# ABS or REL
+$ ./cuSZp_gpu_f64_api ./acd-tst.bin.d64 ABS 1E-8
+cuSZp finished!
+cuSZp compression end-to-end speed: 110.117965 GB/s
+cuSZp decompression end-to-end speed: 222.743097 GB/s
+cuSZp compression ratio: 3.990585
+
+Pass error check!
+```
+More HPC datasets can be downloaded from [SDRBench](https://sdrbench.github.io/).
+
+## Using cuSZp as an Internal API
+This repository provides several examples of using cuSZp compression and decompression in different scenarios (device or host pointers, f32 or f64).
+The examples can be found in ```cuSZp/examples/```.
+Assuming your original data, compressed data, and reconstructed data are all device pointers (allocated on GPU) and the data type is single-precision, the compression and decompression APIs can be called as below:
+```C++
+// For measuring the end-to-end throughput.
+TimingGPU timer_GPU;
+
+// cuSZp compression.
+timer_GPU.StartCounter(); // set timer
+SZp_compress_deviceptr_f32(d_oriData, d_cmpBytes, nbEle, &cmpSize, errorBound, stream);
+float cmpTime = timer_GPU.GetCounter();
+
+// cuSZp decompression.
+timer_GPU.StartCounter(); // set timer
+SZp_decompress_deviceptr_f32(d_decData, d_cmpBytes, nbEle, cmpSize, errorBound, stream);
+float decTime = timer_GPU.GetCounter();
+```
+More details can be checked in:
+- **f32-hostptr**: ```cuSZp/examples/cuSZp_cpu_f32_api.cpp```.
+- **f32-deviceptr**: ```cuSZp/examples/cuSZp_gpu_f32_api.cpp```.
+- **f64-hostptr**: ```cuSZp/examples/cuSZp_cpu_f64_api.cpp```.
+- **f64-deviceptr**: ```cuSZp/examples/cuSZp_gpu_f64_api.cpp```.
+
+## Citation
+```bibtex
+@inproceedings{cuSZp2023huang,
+ title = {cuSZp: An Ultra-Fast GPU Error-Bounded Lossy Compression Framework with Optimized End-to-End Performance},
+ author = {Huang, Yafan and Di, Sheng and Yu, Xiaodong and Li, Guanpeng and Cappello, Franck},
+ year = {2023},
+ isbn = {979-8-4007-0109-2/23/11},
+ publisher = {Association for Computing Machinery},
+ address = {Denver, CO, USA},
+ doi = {10.1145/3581784.3607048},
+ booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
+ keywords = {Lossy compression; parallel computing; HPC; GPU},
+ series = {SC'23}
+}
+```
+
+## Copyright
+(C) 2023 by Argonne National Laboratory and University of Iowa. For more details, see [COPYRIGHT](https://github.com/szcompressor/cuSZp/blob/master/LICENSE).
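+
+For reference, below is a self-contained sketch of the f32 device-pointer path described in the Internal API section above. It is illustrative only: the synthetic data, buffer sizes, include paths, and the assumption that `TimingGPU::GetCounter()` reports milliseconds are not taken from the shipped examples.
+```C++
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdlib>
+#include "cuSZp_entry_f32.h"   // adjust the include path to your install layout
+#include "cuSZp_timer.h"
+
+int main()
+{
+    size_t nbEle = 1024 * 1024;          // illustrative element count
+    float errorBound = 1e-4f;            // absolute error bound
+
+    // Synthetic host data and a host buffer for the reconstruction.
+    float* oriData = (float*)malloc(nbEle * sizeof(float));
+    float* decData = (float*)malloc(nbEle * sizeof(float));
+    for (size_t i = 0; i < nbEle; i++) oriData[i] = (float)i / nbEle;
+
+    // Device buffers: original, reconstructed, and compressed bytes (worst case: input size).
+    float *d_oriData, *d_decData;
+    unsigned char* d_cmpBytes;
+    cudaMalloc((void**)&d_oriData, nbEle * sizeof(float));
+    cudaMalloc((void**)&d_decData, nbEle * sizeof(float));
+    cudaMalloc((void**)&d_cmpBytes, nbEle * sizeof(float));
+    cudaMemcpy(d_oriData, oriData, nbEle * sizeof(float), cudaMemcpyHostToDevice);
+
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+    TimingGPU timer_GPU;
+    size_t cmpSize = 0;
+
+    // Compression: device pointers in, compressed bytes out, cmpSize filled in.
+    timer_GPU.StartCounter();
+    SZp_compress_deviceptr_f32(d_oriData, d_cmpBytes, nbEle, &cmpSize, errorBound, stream);
+    float cmpTime = timer_GPU.GetCounter();   // assumed to be milliseconds
+
+    // Decompression back into d_decData.
+    timer_GPU.StartCounter();
+    SZp_decompress_deviceptr_f32(d_decData, d_cmpBytes, nbEle, cmpSize, errorBound, stream);
+    float decTime = timer_GPU.GetCounter();
+
+    cudaMemcpy(decData, d_decData, nbEle * sizeof(float), cudaMemcpyDeviceToHost);
+    printf("compress: %.3f, decompress: %.3f, ratio: %.3f\n",
+           cmpTime, decTime, (nbEle * sizeof(float)) / (double)cmpSize);
+
+    free(oriData); free(decData);
+    cudaFree(d_oriData); cudaFree(d_decData); cudaFree(d_cmpBytes);
+    cudaStreamDestroy(stream);
+    return 0;
+}
+```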
+ +## Acknowledgement +This research was supported by the Exascale Computing Project (ECP), Project Number: 17-SC-20-SC, a collaborative effort of two DOE organizations – the Office of Science and the National Nuclear Security Administration, responsible for the planning and preparation of a capable exascale ecosystem, including software, applications, hardware, advanced system engineering and early testbed platforms, to support the nation’s exascale computing imperative. The material was supported by the U.S. Department of Energy, Office of Science, Advanced Scientific Computing Research (ASCR), under contract DE-AC02-06CH11357, and supported by the National Science Foundation under Grant OAC-2003709 and OAC-2104023. We acknowledge the computing resources provided on Bebop (operated by Laboratory Computing Resource Center at Argonne) and on Theta and JLSE (operated by Argonne Leadership Computing Facility). We acknowledge the support of ARAMCO. diff --git a/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake b/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake new file mode 100644 index 00000000..cd5a27d0 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/cmake/Installing.cmake @@ -0,0 +1,67 @@ +include(GNUInstallDirs) + +if(DEFINED CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + message( + STATUS + "CMAKE_INSTALL_PREFIX is not set\n" + "Default value: ${CMAKE_INSTALL_PREFIX}\n" + "Will set it to ${CMAKE_SOURCE_DIR}/install" + ) + set(CMAKE_INSTALL_PREFIX + "${CMAKE_SOURCE_DIR}/install" + CACHE PATH "Where the library will be installed to" FORCE + ) +else() + message( + STATUS + "CMAKE_INSTALL_PREFIX was already set\n" + "Current value: ${CMAKE_INSTALL_PREFIX}" + ) +endif() + +set_target_properties(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER "${public_headers}") + +set_target_properties(${PROJECT_NAME} PROPERTIES DEBUG_POSTFIX "d") + +install(TARGETS ${PROJECT_NAME} + EXPORT "${PROJECT_NAME}Targets" + # these get default values from GNUInstallDirs, no need to set them + #RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} # bin + #LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} # lib + #ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} # lib + # except for public headers, as we want them to be inside a library folder + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME} # include/SomeProject + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} # include + ) + +# generate and install export file +install(EXPORT "${PROJECT_NAME}Targets" + FILE "${PROJECT_NAME}Targets.cmake" + NAMESPACE ${namespace}:: + DESTINATION cmake + ) + +include(CMakePackageConfigHelpers) + +# generate the version file for the config file +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + VERSION "${version}" + COMPATIBILITY AnyNewerVersion +) +# create config file +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + INSTALL_DESTINATION cmake + ) +# install config files +install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + DESTINATION cmake + ) +# generate the export targets for the build tree +export(EXPORT "${PROJECT_NAME}Targets" + FILE "${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}Targets.cmake" + NAMESPACE ${namespace}:: + ) diff --git a/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt b/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt new file mode 100644 index 
00000000..8de5b50d --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/CMakeLists.txt @@ -0,0 +1,45 @@ +# Find CUDA package +find_package(CUDA REQUIRED) + +set(install_dir ${PROJECT_BINARY_DIR}/examples/bin) +set(execName_gpu_f32 "cuSZp_gpu_f32_api") +set(execName_cpu_f32 "cuSZp_cpu_f32_api") +set(execName_gpu_f64 "cuSZp_gpu_f64_api") +set(execName_cpu_f64 "cuSZp_cpu_f64_api") +set(SRC_DIR ${PROJECT_SOURCE_DIR}/src) +set(INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) + +# Add include and library directories +include_directories(${INCLUDE_DIR}) + +# Compile headers as a library +cuda_add_library(cuSZp_libs STATIC ${SRC_DIR}/cuSZp_f32.cu + ${SRC_DIR}/cuSZp_f64.cu + ${SRC_DIR}/cuSZp_utility.cu + ${SRC_DIR}/cuSZp_timer.cu + ${SRC_DIR}/cuSZp_entry_f32.cu + ${SRC_DIR}/cuSZp_entry_f64.cu) + +# Compile executable binary +cuda_add_executable(${execName_gpu_f32} cuSZp_gpu_f32_api.cpp) +cuda_add_executable(${execName_cpu_f32} cuSZp_cpu_f32_api.cpp) +cuda_add_executable(${execName_gpu_f64} cuSZp_gpu_f64_api.cpp) +cuda_add_executable(${execName_cpu_f64} cuSZp_cpu_f64_api.cpp) + +# Link with headers +target_link_libraries(${execName_gpu_f32} cuSZp_libs) +target_link_libraries(${execName_cpu_f32} cuSZp_libs) +target_link_libraries(${execName_gpu_f64} cuSZp_libs) +target_link_libraries(${execName_cpu_f64} cuSZp_libs) + +# Set output paths for the compiled binary +set_target_properties(${execName_gpu_f32} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) +set_target_properties(${execName_cpu_f32} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) +set_target_properties(${execName_gpu_f64} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) +set_target_properties(${execName_cpu_f64} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${install_dir}) + +# Set installation paths for the compiled binary. +install(TARGETS ${execName_gpu_f32} DESTINATION bin) +install(TARGETS ${execName_cpu_f32} DESTINATION bin) +install(TARGETS ${execName_gpu_f64} DESTINATION bin) +install(TARGETS ${execName_cpu_f64} DESTINATION bin) \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp new file mode 100644 index 00000000..e4d63c27 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f32_api.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_cpu_f32_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_cpu_f32_api testfloat_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_cpu_f32_api testfloat_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + float errorBound = atof(argv[3]); + + // Input data preparation. + float* oriData = NULL; + float* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readFloatData_Yafan(oriFilePath, &nbEle, &status); + decData = (float*)malloc(nbEle*sizeof(float)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(float)); + + // Generating error bounds. 
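+    // (REL mode scans the input for its value range and rescales the user-supplied
+    //  bound by (max - min), so the compressor below always receives an absolute bound.)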
+ if(strcmp(errorMode, "REL")==0) + { + float max_val = oriData[0]; + float min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. + free(oriData); + free(decData); + free(cmpBytes); + return 0; +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp new file mode 100644 index 00000000..5dcf6788 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_cpu_f64_api.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_cpu_f64_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_cpu_f64_api testdouble_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_cpu_f64_api testdouble_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + double errorBound = atof(argv[3]); + + // Input data preparation. + double* oriData = NULL; + double* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readDoubleData_Yafan(oriFilePath, &nbEle, &status); + decData = (double*)malloc(nbEle*sizeof(double)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(double)); + + // Generating error bounds. + if(strcmp(errorMode, "REL")==0) + { + double max_val = oriData[0]; + double min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. + free(oriData); + free(decData); + free(cmpBytes); + return 0; +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp new file mode 100644 index 00000000..96722d2b --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f32_api.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. 
+ char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_gpu_f32_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_gpu_f32_api testfloat_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_gpu_f32_api testfloat_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + float errorBound = atof(argv[3]); + + // For measuring the end-to-end throughput. + TimingGPU timer_GPU; + + // Input data preparation on CPU. + float* oriData = NULL; + float* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readFloatData_Yafan(oriFilePath, &nbEle, &status); + decData = (float*)malloc(nbEle*sizeof(float)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(float)); + + // Generating error bounds. + if(strcmp(errorMode, "REL")==0) + { + float max_val = oriData[0]; + float min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. + free(oriData); + free(decData); + free(cmpBytes); + cudaFree(d_oriData); + cudaFree(d_decData); + cudaFree(d_cmpBytes); + cudaStreamDestroy(stream); + return 0; +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp new file mode 100644 index 00000000..7af2f303 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/examples/cuSZp_gpu_f64_api.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + // Read input information. + char oriFilePath[640]; + char errorMode[20]; + int status=0; + if(argc != 4) + { + printf("Usage: cuSZp_gpu_f64_api [srcFilePath] [errorMode] [errBound] # errorMode can only be ABS or REL\n"); + printf("Example: cuSZp_gpu_f64_api testdouble_8_8_128.dat ABS 1E-2 # compress dataset with absolute 1E-2 error bound\n"); + printf(" cuSZp_gpu_f64_api testdouble_8_8_128.dat REL 1e-3 # compress dataset with relative 1E-3 error bound\n"); + exit(0); + } + sprintf(oriFilePath, "%s", argv[1]); + sprintf(errorMode, "%s", argv[2]); + double errorBound = atof(argv[3]); + + // For measuring the end-to-end throughput. + TimingGPU timer_GPU; + + // Input data preparation on CPU. + double* oriData = NULL; + double* decData = NULL; + unsigned char* cmpBytes = NULL; + size_t nbEle = 0; + size_t cmpSize = 0; + oriData = readDoubleData_Yafan(oriFilePath, &nbEle, &status); + decData = (double*)malloc(nbEle*sizeof(double)); + cmpBytes = (unsigned char*)malloc(nbEle*sizeof(double)); + + // Generating error bounds. 
+ if(strcmp(errorMode, "REL")==0) + { + double max_val = oriData[0]; + double min_val = oriData[0]; + for(size_t i=0; imax_val) + max_val = oriData[i]; + else if(oriData[i] errorBound*1.1) + { + not_bound++; + // printf("not bound: %zu oriData: %f, decData: %f, errors: %f, bound: %f\n", i, oriData[i], decData[i], abs(oriData[i]-decData[i]), errBound); + } + } + if(!not_bound) printf("\033[0;32mPass error check!\033[0m\n"); + else printf("\033[0;31mFail error check!\033[0m\n"); + + // Free allocated data. + free(oriData); + free(decData); + free(cmpBytes); + cudaFree(d_oriData); + cudaFree(d_decData); + cudaFree(d_cmpBytes); + cudaStreamDestroy(stream); + + return 0; +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h new file mode 100644 index 00000000..5b77d73c --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f32.h @@ -0,0 +1,11 @@ +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_F32_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_F32_H + +#include + +void SZp_compress_hostptr_f32(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); +void SZp_decompress_hostptr_f32(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); +void SZp_compress_deviceptr_f32(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +void SZp_decompress_deviceptr_f32(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); + +#endif // CUSZP_INCLUDE_CUSZP_ENTRY_F32_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h new file mode 100644 index 00000000..6a591acd --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_entry_f64.h @@ -0,0 +1,11 @@ +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_F64_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_F64_H + +#include + +void SZp_compress_hostptr_f64(double* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound); +void SZp_decompress_hostptr_f64(double* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, double errorBound); +void SZp_compress_deviceptr_f64(double* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound, cudaStream_t stream = 0); +void SZp_decompress_deviceptr_f64(double* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, double errorBound, cudaStream_t stream = 0); + +#endif // CUSZP_INCLUDE_CUSZP_ENTRY_F64_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h new file mode 100644 index 00000000..c69d349a --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f32.h @@ -0,0 +1,12 @@ +#ifndef CUSZP_INCLUDE_CUSZP_F32_H +#define CUSZP_INCLUDE_CUSZP_F32_H + +static const int cmp_tblock_size_f32 = 32; +static const int dec_tblock_size_f32 = 32; +static const int cmp_chunk_f32 = 256; +static const int dec_chunk_f32 = 256; + +__global__ void SZp_compress_kernel_f32(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); +__global__ void SZp_decompress_kernel_f32(float* const __restrict__ decData, const unsigned char* const 
__restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); + +#endif // CUSZP_INCLUDE_CUSZP_F32_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h new file mode 100644 index 00000000..d1cc1b43 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_f64.h @@ -0,0 +1,12 @@ +#ifndef CUSZP_INCLUDE_CUSZP_F64_H +#define CUSZP_INCLUDE_CUSZP_F64_H + +static const int cmp_tblock_size_f64 = 32; +static const int dec_tblock_size_f64 = 32; +static const int cmp_chunk_f64 = 8192; +static const int dec_chunk_f64 = 8192; + +__global__ void SZp_compress_kernel_f64(const double* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle); +__global__ void SZp_decompress_kernel_f64(double* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle); + +#endif // CUSZP_INCLUDE_CUSZP_F64_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h new file mode 100644 index 00000000..2777a919 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_timer.h @@ -0,0 +1,31 @@ +#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H +#define CUSZP_INCLUDE_CUSZP_TIMER_H + +#include +#include + +struct PrivateTimingGPU { + cudaEvent_t start; + cudaEvent_t stop; +}; + +class TimingGPU +{ + private: + PrivateTimingGPU *privateTimingGPU; + + public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + + void StartCounterFlags(); + + float GetCounter(); + +}; + +#endif // CUSZP_INCLUDE_CUSZP_TIMER_H \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h b/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h new file mode 100644 index 00000000..ae9b3b60 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/include/cuSZp_utility.h @@ -0,0 +1,18 @@ +#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H +#define CUSZP_INCLUDE_CUSZP_UTILITY_H + +void symTransForm_4Bytes(unsigned char data[4]); +void symTransform_8bytes(unsigned char data[8]); +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +double *readDoubleData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +double *readDoubleData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); +void writeDoubleData_inBytes_Yafan(double *data, size_t nbEle, char* tgtFilePath, int *status); +double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); +double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); +double *computePSNR(size_t nbEle, float *ori_data, float *data); + +#endif // CUSZP_INCLUDE_CUSZP_UTILITY_H \ No newline 
at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu new file mode 100644 index 00000000..59749099 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f32.cu @@ -0,0 +1,149 @@ +#include "cuSZp_entry_f32.h" +#include "cuSZp_f32.h" +#include + +void SZp_compress_hostptr_f32(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) +{ + // Data blocking. + int bsize = cmp_tblock_size_f32; + int gsize = (nbEle + bsize * cmp_chunk_f32 - 1) / (bsize * cmp_chunk_f32); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f32; + + // Initializing global memory for GPU compression. + float* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(float)); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f32<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr_f32(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) +{ + // Data blocking. + int bsize = dec_tblock_size_f32; + int gsize = (nbEle + bsize * dec_chunk_f32 - 1) / (bsize * dec_chunk_f32); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk_f32; + + // Initializing global memory for GPU compression. + float* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f32<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. 
+ cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr_f32(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) +{ + int bsize = cmp_tblock_size_f32; + int gsize = (nbEle + bsize * cmp_chunk_f32 - 1) / (bsize * cmp_chunk_f32); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f32; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + // cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(float)); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f32<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr_f32(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size_f32; + int gsize = (nbEle + bsize * dec_chunk_f32 - 1) / (bsize * dec_chunk_f32); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f32<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Free memoy that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu new file mode 100644 index 00000000..926406c2 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_entry_f64.cu @@ -0,0 +1,149 @@ +#include "cuSZp_entry_f64.h" +#include "cuSZp_f64.h" + +void SZp_compress_hostptr_f64(double* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound) +{ + // Data blocking. + int bsize = cmp_tblock_size_f64; + int gsize = (nbEle + bsize * cmp_chunk_f64 - 1) / (bsize * cmp_chunk_f64); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f64; + + // Initializing global memory for GPU compression. 
+ double* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_oriData, sizeof(double)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(double)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(double)*pad_nbEle); + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(double)); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f64<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr_f64(double* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, double errorBound) +{ + // Data blocking. + int bsize = dec_tblock_size_f64; + int gsize = (nbEle + bsize * dec_chunk_f64 - 1) / (bsize * dec_chunk_f64); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk_f64; + + // Initializing global memory for GPU compression. + double* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(double)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(double)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(double)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f64<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(double)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. + cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr_f64(double* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, double errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = cmp_tblock_size_f64; + int gsize = (nbEle + bsize * cmp_chunk_f64 - 1) / (bsize * cmp_chunk_f64); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk_f64; + + // Initializing global memory for GPU compression. 
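+    // Device-pointer path: the input and compressed-output buffers are owned by the caller,
+    // so only the per-warp offset array and synchronization flags are allocated here;
+    // the padded tail of d_oriData is zeroed below so the last block compresses defined values.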
+ unsigned int* d_cmpOffset; + int* d_flag; + cudaMallocManaged((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_oriData + nbEle, 0, (pad_nbEle - nbEle) * sizeof(double)); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel_f64<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Obtain compression ratio and move data back to CPU. + *cmpSize = (size_t)d_cmpOffset[cmpOffSize-1] + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr_f64(double* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, double errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size_f64; + int gsize = (nbEle + bsize * dec_chunk_f64 - 1) / (bsize * dec_chunk_f64); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel_f64<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + + // Free memoy that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu new file mode 100644 index 00000000..1f18bfc0 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f32.cu @@ -0,0 +1,335 @@ +#include "cuSZp_f32.h" + +__device__ inline int quantization_f32(float data, float recipPrecision) +{ + float dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5f?0:1; + return (int)(dataRecip+0.5f) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel_f32(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk_f32/32; + const int start_idx = idx * cmp_chunk_f32; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + const float recipPrecision = 0.5f/eb; + + int temp_start_idx, temp_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk_f32]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j nbEle ? 0 : quantization_f32(oriData[i], recipPrecision); + lorenQuant = currQuant - prevQuant; + prevQuant = currQuant; + sign_ofs = i % 32; + sign_flag[j] |= (lorenQuant < 0) << (31 - sign_ofs); + absQuant[quant_chunk_idx] = abs(lorenQuant); + maxQuant = maxQuant > absQuant[quant_chunk_idx] ? 
maxQuant : absQuant[quant_chunk_idx]; + } + + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? (32+fixed_rate[j]*32) : 0; + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + if(warp==gridDim.x-1) cmpOffset[warp+1] += cmpOffset[warp]; + __threadfence(); + flag[warp+1] = 2; + } + + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. 
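+            // Per-block payload: 4 sign-flag bytes (written above), then fixed_rate[j] bit planes;
+            // plane i packs bit i of the 32 |lorenQuant| values into tmp_char0..tmp_char3.
+            // E.g. maxQuant = 5 gives fixed_rate = 3, so the block body is 4 + 3*4 = 16 bytes,
+            // matching the 32 + fixed_rate[j]*32 bits accumulated into thread_ofs earlier.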
+ cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + } +} + + +__global__ void SZp_decompress_kernel_f32(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk_f32/32; + const int start_idx = idx * dec_chunk_f32; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + + int temp_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + prevQuant = 0; + for(int i=0; i<32; 
i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + if(temp_start_idx+i < nbEle){ + decData[temp_start_idx+i] = currQuant * eb * 2; + } + prevQuant = currQuant; + } + } + } +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu new file mode 100644 index 00000000..30cdfbff --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_f64.cu @@ -0,0 +1,333 @@ +#include "cuSZp_f64.h" + +__device__ inline int quantization_f64(double data, double recipPrecision) +{ + double dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5?0:1; + return (int)(dataRecip+0.5) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel_f64(const double* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk_f64/32; + const int start_idx = idx * cmp_chunk_f64; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + const double recipPrecision = 0.5/eb; + + int temp_start_idx, temp_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk_f64]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; + } + + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? 
(32+fixed_rate[j]*32) : 0; + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + if(warp==gridDim.x-1) cmpOffset[warp+1] += cmpOffset[warp]; + __threadfence(); + flag[warp+1] = 2; + } + + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. 
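+ // Each pass of the mask loop below stores one bit-plane of the block's 32 quantized magnitudes: four bytes per pass, eight values per byte. A block with fixed_rate[j] significant bit-planes therefore adds 4*fixed_rate[j] bytes on top of the 4-byte sign bitmap emitted just above.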
+ cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + } +} + + +__global__ void SZp_decompress_kernel_f64(double* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const double eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk_f64/32; + const int start_idx = idx * dec_chunk_f64; + const int start_block_idx = start_idx/32; + const int rate_ofs = (nbEle+31)/32; + + int temp_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + __threadfence(); + if(warp==0) + { + flag[1] = 2; + __threadfence(); + } + else + { + flag[warp+1] = 1; + __threadfence(); + } + } + __syncthreads(); + + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + __syncthreads(); + + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + unsigned int prev_thread = __shfl_up_sync(0xffffffff, thread_ofs, 1); + unsigned int cmp_byte_ofs; + if(!lane) cmp_byte_ofs = base_idx; + else cmp_byte_ofs = base_idx + prev_thread / 8; + + for(int j=0; j> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + prevQuant = 0; + for(int i=0; 
i<32; i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + decData[temp_start_idx+i] = currQuant * eb * 2; + prevQuant = currQuant; + } + } + } +} diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu new file mode 100644 index 00000000..5148af98 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_timer.cu @@ -0,0 +1,31 @@ +#include "cuSZp_timer.h" + +TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } + +TimingGPU::~TimingGPU() { } + +void TimingGPU::StartCounter() +{ + cudaEventCreate(&((*privateTimingGPU).start)); + cudaEventCreate(&((*privateTimingGPU).stop)); + cudaEventRecord((*privateTimingGPU).start,0); +} + +void TimingGPU::StartCounterFlags() +{ + int eventflags = cudaEventBlockingSync; + + cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); + cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); + cudaEventRecord((*privateTimingGPU).start,0); +} + +// Gets the counter in ms +float TimingGPU::GetCounter() +{ + float time; + cudaEventRecord((*privateTimingGPU).stop, 0); + cudaEventSynchronize((*privateTimingGPU).stop); + cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); + return time; +} diff --git a/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu b/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu new file mode 100644 index 00000000..d72c17a0 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp/src/cuSZp_utility.cu @@ -0,0 +1,614 @@ +// +// Created by Yafan Huang on 5/31/22. +// Copied from SZ2, QCAT, and SZx. +// +#include +#include +#include +#include +#include +#include "cuSZp_utility.h" + +/*Macro Definition for Processing Data*/ +#define SZ_SCES 0 //successful +#define SZ_NSCS -1 //Not successful +#define SZ_FERR -2 //Failed to open input file +#define SZ_TERR -3 //wrong data type (should be only float or double) +#define RW_SCES 0 +#define RW_FERR 1 +#define RW_TERR 2 +#define LITTLE_ENDIAN_SYSTEM 0 +#define QCAT_BUFS 64 + + +/*Global Varaibles for Processing Data*/ +int dataEndianType_Yafan = 0; +int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian + + +typedef union llfloat +{ + float value; + unsigned int ivalue; + unsigned char byte[4]; +} llfloat; + + +typedef union lldouble +{ + double value; + uint64_t lvalue; + unsigned char byte[8]; +} lldouble; + + +/** ************************************************************************ + * @brief Reverse 4-bit-length unsigned char array. + * + * @param data[4] 4-bit-length unsigned char array. + * *********************************************************************** */ +void symTransForm_4Bytes(unsigned char data[4]) +{ + unsigned char tmp = data[0]; + data[0] = data[3]; + data[3] = tmp; + + tmp = data[1]; + data[1] = data[2]; + data[2] = tmp; +} + + +/** ************************************************************************ + * @brief Reverse 8-bit-length unsigned char array. + * + * @param data[8] 8-bit-length unsigned char array. 
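+ * Reversing the byte order in place performs an endianness swap for a single 8-byte (double) value.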
+ * *********************************************************************** */ +void symTransform_8bytes(unsigned char data[8]) +{ + unsigned char tmp = data[0]; + data[0] = data[7]; + data[7] = tmp; + + tmp = data[1]; + data[1] = data[6]; + data[6] = tmp; + + tmp = data[2]; + data[2] = data[5]; + data[5] = tmp; + + tmp = data[3]; + data[3] = data[4]; + data[4] = tmp; +} + + +/** ************************************************************************ + * @brief Read byte data from path to source binary format file. + * Usually used for decompressing data from input file. + * Variables byteLength and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param byteLength the length of byte array + * @param status data processing states (macro definitions) + * + * @return byteBuf unsigned char array with length byteLength + * *********************************************************************** */ +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) +{ + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return 0; + } + fseek(pFile, 0, SEEK_END); + *byteLength = ftell(pFile); + fclose(pFile); + + unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return 0; + } + fread(byteBuf, 1, *byteLength, pFile); + fclose(pFile); + *status = RW_SCES; + return byteBuf; +} + + +/** ************************************************************************ + * @brief Read float data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/4; + fclose(pFile); + + if(inSize<=0) + { + printf("Error: input file is wrong!\n"); + *status = RW_FERR; + } + + float *daBuf = (float *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return NULL; + } + fread(daBuf, 4, *nbEle, pFile); + fclose(pFile); + *status = RW_SCES; + return daBuf; +} + + +/** ************************************************************************ + * @brief Read float data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. 
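+ * When the data and system endianness differ, the file is read as raw bytes and every 4-byte element is byte-swapped with symTransForm_4Bytes before being returned.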
+ * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = RW_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state == RW_FERR) + { + *status = RW_FERR; + return NULL; + } + float *daBuf = (float *)malloc(byteLength); + *nbEle = byteLength/4; + + llfloat buf; + for(i = 0;i<*nbEle;i++) + { + j = i*4; + memcpy(buf.byte, bytes+j, 4); + symTransForm_4Bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + +/** ************************************************************************ + * @brief Read double data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of double array + * @param status data processing states (macro definitions) + * + * @return daBuf double array with length nbEle + * *********************************************************************** */ +double *readDoubleData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = SZ_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/8; //only support double in this version + fclose(pFile); + + double *daBuf = (double *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = SZ_FERR; + return NULL; + } + fread(daBuf, 8, *nbEle, pFile); + fclose(pFile); + *status = SZ_SCES; + return daBuf; +} + + +/** ************************************************************************ + * @brief Read double data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. 
+ * + * @param srcFilePath input source file path + * @param nbEle the length of double array + * @param status data processing states (macro definitions) + * + * @return daBuf double array with length nbEle + * *********************************************************************** */ +double *readDoubleData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = SZ_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + double *daBuf = readDoubleData_systemEndian_Yafan(srcFilePath, nbEle,&state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state==SZ_FERR) + { + *status = SZ_FERR; + return NULL; + } + double *daBuf = (double *)malloc(byteLength); + *nbEle = byteLength/8; + + lldouble buf; + for(i = 0;i<*nbEle;i++) + { + j = i*8; + memcpy(buf.byte, bytes+j, 8); + symTransform_8bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + + +/** ************************************************************************ + * @brief Write byte data to binary format file. + * Usually used for writing compressed data. + * Variable status can be obtained/switched through this function. + * + * @param bytes unsigned char array (compressed data) + * @param byteLength the length of unsigned char array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) +{ + FILE *pFile = fopen(tgtFilePath, "wb"); + if (pFile == NULL) + { + printf("Failed to open input file. 3\n"); + *status = RW_FERR; + return; + } + + fwrite(bytes, 1, byteLength, pFile); //write outSize bytes + fclose(pFile); + *status = RW_SCES; +} + + +/** ************************************************************************ + * @brief Write float data to binary format file. + * Usually used for writing decompressed (reconstructed) data. + * Variable status can be obtained/switched through this function. 
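+ * Note: the first parameter is the float array to write (not pre-encoded bytes); each value is serialized to 4 bytes through the llfloat union before the buffer is written out.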
+ * + * @param bytes unsigned char array (compressed data) + * @param nbEle the length of float array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) +{ + size_t i = 0; + int state = RW_SCES; + llfloat buf; + unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); + for(i=0;idata[index]) + xMin=data[index]; + if(xMaxother[index]) + yMin=other[index]; + if(yMaxsize0) { + printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); + } + if(windowSize1>size1) { + printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); + } + if(windowSize2>size2) { + printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); + } + //offsetInc0=windowSize0/2; + //offsetInc1=windowSize1/2; + //offsetInc2=windowSize2/2; + offsetInc0=windowShift0; + offsetInc1=windowShift1; + offsetInc2=windowShift2; + for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW + for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW + for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW + nw++; + ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); + } + } + } + return ssimSum/nw; +} + +/** ************************************************************************ + * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. + * API for computing PSNR. + * + * @param nbEle the length of float array + * @param ori_data original float array + * @param dec_data decompressed (reconstructed) float array + * + * @return result 6-length double array, which contains: + * 0. *Mean Square Error (MSE)* + * 1. *Value Range (Max-Min)* + * 2. *Peak Signal-to-noise Ratio (PSNR)* + * 3. Squared Error + * 4. Normalized Squared Error + * 5. Normalized Squared MSE + * *********************************************************************** */ +double *computePSNR(size_t nbEle, float *ori_data, float *data) { + size_t i = 0; + double Max = 0, Min = 0, diffMax = 0; + Max = ori_data[0]; + Min = ori_data[0]; + diffMax = data[0] > ori_data[0] ? 
data[0] - ori_data[0] : ori_data[0] - data[0]; + + //diffMax = fabs(data[0] - ori_data[0]); + double sum1 = 0, sum2 = 0, sum22 = 0; + + for (i = 0; i < nbEle; i++) { + sum1 += ori_data[i]; + sum2 += data[i]; + sum22 += data[i] * data[i]; + } + double mean1 = sum1 / nbEle; + double mean2 = sum2 / nbEle; + + double sum3 = 0, sum4 = 0; + double sum = 0, prodSum = 0, relerr = 0; + + double maxpw_relerr = 0; + for (i = 0; i < nbEle; i++) { + if (Max < ori_data[i]) Max = ori_data[i]; + if (Min > ori_data[i]) Min = ori_data[i]; + + float err = fabs(data[i] - ori_data[i]); + if (ori_data[i] != 0) { + relerr = err / fabs(ori_data[i]); + if (maxpw_relerr < relerr) + maxpw_relerr = relerr; + } + + if (diffMax < err) + diffMax = err; + prodSum += (ori_data[i] - mean1) * (data[i] - mean2); + sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); + sum4 += (data[i] - mean2) * (data[i] - mean2); + sum += err * err; + } + double std1 = sqrt(sum3 / nbEle); + double std2 = sqrt(sum4 / nbEle); + double ee = prodSum / nbEle; + double acEff = ee / std1 / std2; + + double mse = sum / nbEle; + double range = Max - Min; + double psnr = 20 * log10(range) - 10 * log10(mse); + double normErr = sqrt(sum); + double normErr_norm = normErr / sqrt(sum22); + double nrmse = sqrt(mse) / range; + double *result = (double *) malloc(sizeof(double) * 6); + result[0] = mse; + result[1] = range; + result[2] = psnr; + result[3] = normErr; + result[4] = normErr_norm; + result[5] = nrmse; + + return result; +} \ No newline at end of file diff --git a/qtensor/compression/cuszp/cuSZp_interface.cpp b/qtensor/compression/cuszp/cuSZp_interface.cpp new file mode 100644 index 00000000..5d241b18 --- /dev/null +++ b/qtensor/compression/cuszp/cuSZp_interface.cpp @@ -0,0 +1,137 @@ +#include +#include +// #include +// #include +// #include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +torch::Tensor compress(torch::Tensor input, float error_bound, + std::string mode) { + CHECK_INPUT(input); + // Get the input tensor's data pointer and size + float *d_input_data = input.data_ptr(); + int64_t num_elements = input.numel(); + size_t compressed_size = 0; + + // Cuda allocate memory for the compressed output + unsigned char *d_compressed_data; + cudaMalloc((void **)&d_compressed_data, num_elements * sizeof(float)); + cudaMemset(d_compressed_data, 0, num_elements * sizeof(float)); + printf("f ptr %p\n", d_input_data); + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // Just a warmup. 
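+ // Despite the note above, this is the only call in this function: it compresses d_input_data into d_compressed_data and fills compressed_size (in bytes), which is used below to size the output tensor.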
+ SZp_compress_deviceptr_f32(d_input_data, d_compressed_data, num_elements, + &compressed_size, error_bound, stream); + // Ensure on a 4096 boundary + // compressed_size = (compressed_size + 4095) / 4096 * 4096; + // Create a new tensor on the GPU from the compressed output + + cudaStreamSynchronize(stream); + + cudaError_t err = cudaGetLastError(); + printf("after comp\n"); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + + // torch::Tensor test_t = torch::zeros(5); + err = cudaGetLastError(); + printf("after comp\n"); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + + torch::Tensor output = torch::empty( + {compressed_size}, torch::TensorOptions() + .dtype(torch::kUInt8) + .device(torch::kCUDA) + .layout(at::kStrided) + .memory_format(torch::MemoryFormat::Contiguous)); + // write from d_compressed_data + cudaMemcpy(output.data_ptr(), d_compressed_data, + compressed_size, cudaMemcpyDeviceToDevice); + // Sync free + cudaStreamSynchronize(stream); + + printf("after comp2\n"); + err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + + // cudaMemGetInfo(&free_byte, &total_byte); + // printf("GPU memory usage before output: used = %f, free = %f MB, total = %f + // MB\n", + // (double)(total_byte - free_byte) / 1024.0 / 1024.0, (double)free_byte + // / 1024.0 / 1024.0, (double)total_byte / 1024.0 / 1024.0); + cudaFree(d_compressed_data); + cudaStreamDestroy(stream); + CHECK_INPUT(output); + return output; +} + +torch::Tensor decompress(torch::Tensor compressed_data, int64_t num_elements, + size_t compressed_size, float error_bound, + std::string mode) { + CHECK_INPUT(compressed_data); + // Get the input tensor's data pointer and size + unsigned char *d_compressed_data = compressed_data.data_ptr(); + + // torch::Tensor decompressed_data = torch::empty( + // , torch::TensorOptions() + // .dtype(torch::kFloat32) + // .device(torch::kCUDA) + // .memory_format(torch::MemoryFormat::Contiguous)); + torch::Tensor decompressed_data = torch::zeros( + {num_elements}, torch::TensorOptions() + .dtype(torch::kFloat32) + .device(torch::kCUDA) + .memory_format(torch::MemoryFormat::Contiguous)); + float *d_decompressed_data = decompressed_data.data_ptr(); + + // Initializing CUDA Stream. 
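+ // Decompression writes directly into decompressed_data's CUDA buffer; num_elements, compressed_size and error_bound are expected to match the values produced by compress() above.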
+ cudaStream_t stream; + cudaStreamCreate(&stream); + + SZp_decompress_deviceptr_f32(d_decompressed_data, d_compressed_data, + num_elements, compressed_size, error_bound, + stream); + cudaStreamSynchronize(stream); + // Check cuda errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + cudaStreamDestroy(stream); + CHECK_INPUT(decompressed_data); + return decompressed_data; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("compress", &compress, "Compress a PyTorch tensor using cuSZp"); + m.def("decompress", &decompress, "Decompress a PyTorch tensor using cuSZp"); +} diff --git a/qtensor/compression/cuszp/cuszp_wrapper.py b/qtensor/compression/cuszp/cuszp_wrapper.py new file mode 100644 index 00000000..6d0f8ff4 --- /dev/null +++ b/qtensor/compression/cuszp/cuszp_wrapper.py @@ -0,0 +1,113 @@ +import numpy as np +import ctypes +from ctypes import * +import random +#from qtensor.tools.lazy_import import cupy as cp +import cupy as cp +import time +import torch + +import cuszp + +from pathlib import Path + +def cuszp_device_compress(oriData, absErrBound,threshold): + + oriData = oriData.flatten() + x = torch.as_tensor(oriData, device='cuda') + + ori_real = x.real + ori_imag = x.imag + x = x.contiguous() + x = torch.cat((ori_real, ori_imag)) + x = torch.flatten(x) + bitmap = None + d = torch.max(x) - torch.min(x) + d = d.item() + absErrBound = float(absErrBound*(d)) + threshold = threshold*(d) + truth_values = torch.abs(x)<=threshold + x[truth_values] = 0.0 + + o_bytes = cuszp.compress(x, absErrBound, "rel") + outSize = o_bytes.numel()*o_bytes.element_size() + + return (o_bytes,bitmap, absErrBound), outSize + + +def cuszp_device_decompress(nbEle, cmpBytes): + + (cmpBytes, bitmap, absErrBound) = cmpBytes + + newData = cuszp.decompress( + cmpBytes, + nbEle, + cmpBytes.numel()*cmpBytes.element_size(), + absErrBound, + "rel", + ) + + arr = cp.asarray(newData) + res = arr + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + #cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024*64) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.01 + r2r_error = 0.01 + ranga_vr = RANGE + in_vector = np.zeros((DATA_SIZE,)) + for i in range(0,int(DATA_SIZE/4)): + in_vector[i] = 0.0 + for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + in_vector[i] = 5.0 + for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + in_vector[i] = random.uniform(MIN_D, MAX_D) + for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + in_vector[i] = -7.0 + for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + in_vector[i] = 0.001 + + print(DATA_SIZE) + in_vector = in_vector.astype('complex64') + in_vector_gpu = cp.asarray(in_vector) + + #in_vector_gpu = cp.asarray(in_vector) + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(2): + s_time = time.time() + o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize) + print("Compress Success...starting decompress ") + comp = Comp() + + 
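+ # DATA_SIZE*2 below reflects that cuszp_device_compress flattened the complex64 input into concatenated real and imaginary float planes; cuszp_device_decompress splits the halves back into a single complex64 array.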
s_time = time.time() + (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE*2, o_bytes) + #free_compressed(o_bytes[0]) + #cp.cuda.runtime.free(d_bytes) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/cuszp/gnncuszp.py b/qtensor/compression/cuszp/gnncuszp.py new file mode 100644 index 00000000..76bd8197 --- /dev/null +++ b/qtensor/compression/cuszp/gnncuszp.py @@ -0,0 +1,347 @@ +import cuszp +import torch +from statcollector import StatCollector +# Create a class that performs compression and decompression on a tensor + + +class Compressor(torch.nn.Module): + def __init__(self, err_mode, err_bound, device, num_nodes,statcollector:StatCollector): + super(Compressor, self).__init__() + self.err_mode = err_mode + self.err_bound = err_bound + self.device = device + self.compressor = cuszp + self.num_nodes = num_nodes + self.sc = statcollector + + def compress(self, x): + # Ensure float32 type + if not x.dtype == torch.float32: + raise TypeError("x must be of type torch.float32") + x = x.contiguous() + if self.err_mode == "rel" or self.err_mode == "relative": + # Value-range error bound + x_max = torch.max(x) + x_min = torch.min(x) + # Compute the err_bound + err_bound = (x_max - x_min) * self.err_bound + # print("min =", x_min, "max =", x_max, "err_bound =", err_bound) + self.sc.add_tensor_stat("Min Value", x_min.item()) + self.sc.add_tensor_stat("Max Value", x_max.item()) + + elif self.err_mode == "abs" or self.err_mode == "absolute": + err_bound = self.err_bound + else: + raise ValueError("err_mode must be 'rel / relative' or 'abs / absolute'") + self.sc.add_tensor_stat("Absolute Error Bound", err_bound.item()) + + return CompressedElement(x, self.compressor.compress(x, err_bound, self.err_mode), err_bound, self.device) + + def decompress(self, comp_element): + if not isinstance(comp_element, CompressedElement): + raise TypeError("comp_element must be an instance of CompressedElement") + compressed_size = ( + comp_element.compressed_data.numel() + * comp_element.compressed_data.element_size() + ) + decompressed = self.compressor.decompress( + comp_element.compressed_data, + comp_element.uncompressed_elements, + compressed_size, + comp_element.err_bound, + self.err_mode, + ) + # Reshape decompressed to match original shape + decompressed = decompressed.reshape(comp_element.original_shape) + return decompressed + + def pack_hook(self, x): + if ( + x.dtype == torch.float32 + and x.requires_grad + and not x.is_sparse + and isinstance(x, torch.Tensor) + and x.shape[0] == self.num_nodes + ): + # print("Packing", x.shape) + t0 = self.sc.new_clock() + self.sc.sync_start_time(t0) + + compressed = self.compress(x) + + self.sc.sync_end_time(t0) + self.sc.increment_epoch_stat("Total Compression Time (s)",self.sc.get_elapsed_time(t0)) + + # print("Uncompressed size =", (x.numel() * x.element_size()) / 1024 / 1024) + # print( + # "Compressed size =", + # ( + # compressed.compressed_data.numel() + # * compressed.compressed_data.element_size() + # ) + # / 1024 + # / 1024, + # ) + # print( + # "Compression Ratio = ", + # (x.numel() * x.element_size()) + # / ( + # compressed.compressed_data.numel() + # * compressed.compressed_data.element_size() + # ), + # ) + csize = compressed.compressed_data.numel()*compressed.compressed_data.element_size() + osize = x.numel() * x.element_size() + self.sc.add_tensor_stat("Uncompressed Size (bytes)", osize) + self.sc.add_tensor_stat("Compressed Size (bytes)", csize) + 
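+ # "Average CR" accumulates the per-tensor osize/csize ratio; together with "Compressed Tensor Count" (incremented below) the stat collector can presumably derive a mean ratio per epoch.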
self.sc.increment_epoch_stat("Average CR", osize/csize) + self.sc.increment_epoch_stat("Aggregate Uncompressed Tensor Size (bytes)", osize) + self.sc.increment_epoch_stat("Aggregate Compressed Tensor Size (bytes)", csize) + # print( "Data Saved", ((x.numel() * x.element_size()) - (compressed.compressed_data.numel() * compressed.compressed_data.element_size()))/1024/1024) + # print("Testing decompress,", decompressed) + # print("Compressed data", compressed.compressed_data) + # print("Decompressed shape =", decompressed.shape) + # print("X shape = ", x.shape) + # abs_error = torch.abs(x - decompressed) + # max_error = torch.max(abs_error) + # if max_error > self.err_bound * 1.1: + # # Print the location of the max error and the values + # print("Max error location =", torch.argmax(torch.abs(x - decompressed))) + # print("Max error value =", max_error) + # location = torch.argmax(torch.abs(x - decompressed)) + # # Print row and column of max error + # print("Row =", int(location / x.shape[1])) + # print("Column =", location % x.shape[1]) + # # Count the number of elements that are > self.err_bound * 1.1 + # bound_err_cnt = torch.sum(abs_error > self.err_bound * 1.1) + # print("Number of elements > err_bound * 1.1 =", bound_err_cnt) + # print("X value =", x[int(location / x.shape[1])][location % x.shape[1]]) + # print( + # "Decompressed value =", + # decompressed[int(location / x.shape[1])][location % x.shape[1]], + # ) + # raise ValueError( + # "Error bound exceeded! Max error = ", max_error + # ) + # # Ensure max_error <= err_bound + + # print("Max error =", max_error) + # Ensure x is freed + # delete x + self.sc.increment_epoch_stat("Compressed Tensor Count",1) + self.sc.register_tensor_row_and_update() + + + del x + # empty cache + torch.cuda.empty_cache() + return compressed + else: + return x + + def unpack_hook(self, x): + if isinstance(x, CompressedElement): + # print("Unpacking", x.name) + # print("Unpacking") + t0 = self.sc.new_clock() + self.sc.sync_start_time(t0) + + decompressed = self.decompress(x) + + self.sc.sync_end_time(t0) + self.sc.increment_epoch_stat("Total Decompression Time (s)",self.sc.get_elapsed_time(t0)) + + # print("Unpacked") + # print("Unpacked to", decompressed) + return decompressed + else: + return x + + +# Create class for a compressed element that is used by the Compressor class + + +class CompressedElement(torch.nn.Module): + def __init__(self, x, compressed, err_bound, device): + super(CompressedElement, self).__init__() + self.device = device + # self.compressor = cuszp + self.compressed_data = compressed + self.uncompressed_elements = x.numel() + self.original_shape = x.shape + self.err_bound = err_bound + +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + + + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = 
cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + + scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + + q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.dequantize(q_tensor) + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in 
range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") \ No newline at end of file diff --git a/qtensor/compression/cuszp/setup.py b/qtensor/compression/cuszp/setup.py new file mode 100644 index 00000000..33ab8839 --- /dev/null +++ b/qtensor/compression/cuszp/setup.py @@ -0,0 +1,28 @@ +from setuptools import setup, Extension +from torch.utils import cpp_extension +import os + +cuSZp_install = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cuSZp') +cuSZp_include = os.path.join(cuSZp_install, 'include') +cuSZp_src = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cuSZp', 'src') +# Retrieve list of source files +cuSZp_src_files = [] +for root, dirs, files in os.walk(cuSZp_src): + for file in files: + if file.endswith('.cu'): + cuSZp_src_files.append(os.path.join(root, file)) +cuSZp_src_files.append('cuSZp_interface.cpp') + +# define the extension module +cuSZp_extension = cpp_extension.CUDAExtension( + name='cuszp', + sources=cuSZp_src_files, + include_dirs=[cuSZp_include], +) + +# build the extension module +setup( + name='cuszp', + ext_modules=[cuSZp_extension], + cmdclass={'build_ext': cpp_extension.BuildExtension} +) diff --git a/qtensor/compression/newsz/README_NEWSZ.md b/qtensor/compression/newsz/README_NEWSZ.md new file mode 100644 index 00000000..e6dbcda2 --- /dev/null +++ b/qtensor/compression/newsz/README_NEWSZ.md @@ -0,0 +1,15 @@ +# Building newSZ + +1. Clone the NVCOMP repository from https://github.com/NVIDIA/nvcomp.git + +2. Change to 'branch-2.2' branch. (`git checkout branch-2.2`) + +3. Follow build instructions in NVCOMP repository (you can ignore -DNVCOMP_EXTS_ROOT flag) + +4. Copy shared library `nvcomp/build/lib/libnvcomp.so` to current directory (`qtensor/compression/newsz/`) + +5. 
Run the following command: `nvcc --shared --compiler-options '-fPIC' -lnvcomp -o libnewsz_wrapper.so *.cu --library-path= --library=nvcomp -I/PATH_TO_NVCOMP/nvcomp/build/include/` + +# Running newSZ + +- Specify --compress=newsz when running main.py diff --git a/qtensor/compression/newsz/newsz.cu b/qtensor/compression/newsz/newsz.cu new file mode 100644 index 00000000..00a394b6 --- /dev/null +++ b/qtensor/compression/newsz/newsz.cu @@ -0,0 +1,248 @@ +#include +#include "newsz.h" +#include +#include +#include +// #include "cuCompactor.cuh" + +#include "nvcomp/lz4.hpp" +#include "nvcomp.hpp" +#include "nvcomp/nvcompManagerFactory.hpp" + +#define BLKS 40 +#define THDS 128 +#define FULL_MASK 0xffffffff + +__device__ int g_ints; + +struct int_predicate +{ + + __host__ __device__ + bool operator()(const int x) + { + return x>0; + } +}; + +struct to_copy +{ + __host__ __device__ + bool operator()(const uint8_t x) + { + return x==1; + } +}; + + + + +__global__ void compress(float *data, float *scales, float *zeropts, int8_t *out){ + int bid = blockIdx.x; + int tid = threadIdx.x; + extern __shared__ float scratchpad[]; + __shared__ float min; + __shared__ float max; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + + float item = data[blockIdx.x*blockDim.x+threadIdx.x]; + + float tmax = BlockReduce(temp_storage1).Reduce(item, cub::Max()); + float tmin = BlockReduce(temp_storage1).Reduce(item, cub::Min()); + + if (threadIdx.x==0) + { + max = tmax; + min = tmin; + } + + __syncthreads(); + + float vrange = max - min; + float scale = vrange/((2^8) - 1); + int zeropt = -1*lrintf(min*scale) - (2^7); + + int q_item = lrintf(item/scale) + zeropt; + + // Clamp quantized value + if(q_item>127)q_item = 127; + if(q_item <-128)q_item = -128; + int8_t q_val = (int8_t)(0xff & q_item); + out[blockIdx.x*blockDim.x+threadIdx.x] = q_val; + if (threadIdx.x==0) + { + scales[blockIdx.x] = scale; + zeropts[blockIdx.x]= zeropt; + } + +} + +__global__ void decompress(int8_t *q_data, float *scales, float *zeropts, float *out){ + int bid = blockIdx.x; + int tid = threadIdx.x; + extern __shared__ float scratchpad[]; + __shared__ float min; + __shared__ float max; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + + int8_t q_val = q_data[blockIdx.x*blockDim.x+threadIdx.x]; + + out[blockIdx.x*blockDim.x+threadIdx.x] = (q_val - zeropts[bid])*scales[bid]; +} + +__global__ void p_ints(){ + printf("codebook entries used: %d\n", g_ints); +} + +unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize){ + float *scales, *zeropts; + int8_t *q_out; + unsigned char *cmpbytes; + int num_blocks = num_elements/blocksize; + + cudaMalloc(&scales, sizeof(float)*num_blocks); + cudaMalloc(&zeropts,sizeof(float)*num_blocks); + cudaMalloc(&q_out, num_elements); + + using namespace nvcomp; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + const int chunk_size = 1 << 16; + nvcompType_t data_type = NVCOMP_TYPE_CHAR; + + + + compress<<>>(data, scales, zeropts, q_out); + cudaDeviceSynchronize(); + + LZ4Manager nvcomp_manager{chunk_size, data_type, stream}; + CompressionConfig comp_config = nvcomp_manager.configure_compression(num_elements); + + uint8_t* comp_buffer; + cudaMalloc(&comp_buffer, comp_config.max_compressed_buffer_size); + + nvcomp_manager.compress((const uint8_t *)q_out, comp_buffer, comp_config); + + size_t c_size = nvcomp_manager.get_compressed_output_size(comp_buffer); + 
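+ // The buffer assembled below is laid out as [num_blocks scales | num_blocks zero-points | c_size bytes of LZ4-compressed int8 values]; outsize is the sum of the three regions.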
cudaFree(q_out); + + *outsize = sizeof(float)*(num_blocks+num_blocks)+c_size; + cudaMalloc(&cmpbytes, *outsize); + + cudaMemcpy(cmpbytes, (unsigned char *)scales, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + cudaMemcpy(cmpbytes+sizeof(float)*num_blocks, (unsigned char *)zeropts, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + cudaMemcpy(cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, comp_buffer, c_size, cudaMemcpyDeviceToDevice); + + float h_firstscale; + cudaMemcpy(&h_firstscale, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(scales); + cudaFree(zeropts); + cudaFree(comp_buffer); + return cmpbytes; +} + +float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize){ + float *scales, *zeropts; + uint8_t *q_cmp; + int8_t *q_vals; + float *out; + int num_blocks = num_elements/blocksize; + size_t c_size = *cmpsize-(2*sizeof(float)*num_blocks); + + float first_val, *d_first; + + cudaMalloc(&d_first, sizeof(float)); + cudaMemcpy((unsigned char *)&first_val, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); + + + + cudaMalloc((void **)&scales, sizeof(float)*num_blocks); + cudaMalloc((void **)&zeropts,sizeof(float)*num_blocks); + cudaMalloc((void **)&q_cmp, c_size); + cudaMemcpy((unsigned char *)scales, cmpbytes, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + + cudaMemcpy((unsigned char *)zeropts, cmpbytes+sizeof(float)*num_blocks, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + + cudaMemcpy(q_cmp, cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, c_size, cudaMemcpyDeviceToDevice); + cudaStream_t stream; + cudaStreamCreate(&stream); + + const int chunk_size = 1 << 16; + + + nvcompType_t data_type = NVCOMP_TYPE_CHAR; + + auto decomp_manager = nvcomp::create_manager(q_cmp, stream); + + nvcomp::DecompressionConfig decomp_config = decomp_manager->configure_decompression((uint8_t *)q_cmp); + cudaMalloc(&q_vals, num_elements); + + decomp_manager->decompress((uint8_t*)q_vals, (uint8_t*)q_cmp, decomp_config); + cudaFree(q_cmp); + + cudaMalloc(&out, sizeof(float)*num_elements); + + decompress<<>>(q_vals, scales, zeropts, out); + cudaDeviceSynchronize(); + + cudaFree(scales); + cudaFree(zeropts); + cudaFree(q_vals); + + return out; +} + +int main(int argc, char** argv){ + char oriFilePath[640], outputFilePath[645]; + float* data; + size_t nbEle; + if(argc < 3) + { + printf("Usage: testfloat_compress_fastmode2 [srcFilePath] [block size] [err bound] [--cuda]\n"); + printf("Example: testfloat_compress_fastmode2 testfloat_8_8_128.dat 64 1E-3 --cuda\n"); + exit(0); + } + + sprintf(oriFilePath, "%s", argv[1]); + int blockSize = atoi(argv[2]); + float errBound = atof(argv[3]); + nbEle = atoi(argv[4]); + + data = (float*)malloc(sizeof(float)*nbEle); + sprintf(outputFilePath, "%s.sznew", oriFilePath); + + FILE *in_file; + in_file = fopen(oriFilePath, "rb"); + + fread(data, sizeof(float), nbEle, in_file); + fclose(in_file); + + float max = data[0]; + float min = data[0]; + for(int i=0;i=max){ + max = data[i]; + } + if(data[i]<=min){ + min = data[i]; + } + } + errBound = errBound*(max-min); + + // Move to device + float *d_data; + unsigned char *cmpbytes; + size_t outsize; + cudaMalloc(&d_data, sizeof(float)*nbEle); + cudaMemcpy(d_data, data, sizeof(float)*nbEle, cudaMemcpyHostToDevice); + //SZ_device_compress(d_data, nbEle, errBound, blockSize, cmpbytes, &outsize); + + cudaFree(d_data); + +} diff --git a/qtensor/compression/newsz/newsz.h b/qtensor/compression/newsz/newsz.h new file mode 100644 index 
00000000..1022e20a --- /dev/null +++ b/qtensor/compression/newsz/newsz.h @@ -0,0 +1,3 @@ + +unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize); +float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize); diff --git a/qtensor/compression/newsz/newsz_wrapper.cu b/qtensor/compression/newsz/newsz_wrapper.cu new file mode 100644 index 00000000..a692af9b --- /dev/null +++ b/qtensor/compression/newsz/newsz_wrapper.cu @@ -0,0 +1,21 @@ +#include "newsz.h" +#include + +extern "C"{ + + unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize){ + //unsigned char* cmpbytes; + return SZ_device_compress(oriData, nbEle, blockSize, outSize); + //printf("in wrap cmpbytes: %p\n", cmpbytes); + //return cmpbytes; + } + + float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize){ + size_t *cmpsize_ptr; + *cmpsize_ptr = cmpsize; + + float *res = SZ_device_decompress(cmpBytes, nbEle, blocksize, cmpsize_ptr); + return res; + } + +} diff --git a/qtensor/compression/newsz/newsz_wrapper.py b/qtensor/compression/newsz/newsz_wrapper.py new file mode 100644 index 00000000..4cbc2692 --- /dev/null +++ b/qtensor/compression/newsz/newsz_wrapper.py @@ -0,0 +1,161 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libnewsz_wrapper.so') + +NVCOMP_PATH = str(Path(__file__).parent/'libnvcomp.so') +#NVCOMP_PATH= './libnvcomp.so' +#LIB_PATH = './libnewsz_wrapper.so' + +# unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) +def get_device_compress(): + dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.newSZ_device_compress + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_size_t, c_int] + func.restype = POINTER(c_ubyte) + return func + +# float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) +def get_device_decompress(): + + dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.newSZ_device_decompress + func.argtypes = [c_size_t, POINTER(c_ubyte), c_int, c_size_t] + func.restype = POINTER(c_float) + return func + + +def newsz_device_compress(oriData, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + d = cp.amax(oriData) - cp.amin(oriData) + d = d.get() + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + # newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) + o_bytes = __cuszx_device_compress(oriData_p, outSize, np.ulonglong(nbEle), np.int32(blockSize)) + #print("testing") + #print(o_bytes.value) + return (o_bytes,outSize.contents.value, blockSize), outSize + + +def newsz_device_decompress(nbEle, cmpBytes, owner, dtype): + 
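+ # Counterpart to newsz_device_compress: unpacks the (cmpBytes, cmpsize, blockSize) tuple, calls the C wrapper, wraps the returned device pointer in an unowned CuPy array, and rebuilds a complex64 result from the concatenated real/imag halves, returning it together with the raw pointer for later freeing.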
__cuszx_device_decompress=get_device_decompress() + (cmpBytes, cmpsize, blockSize) = cmpBytes + + nbEle_p = ctypes.c_size_t(nbEle) + # float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) + newData = __cuszx_device_decompress(nbEle_p, cmpBytes, np.int32(blockSize), ctypes.c_size_t(cmpsize)) + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + # res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + # cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + #print(in_vector[0:16]) + for i in range(200): + s_time = time.time() + #o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + + o_bytes, outSize = newsz_device_compress(in_vector_gpu, DATA_SIZE, 256,r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + #(d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + (d_bytes, ptr) = newsz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + free_compressed(o_bytes[0]) + cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/newsz/nvcomp 
b/qtensor/compression/newsz/nvcomp new file mode 160000 index 00000000..a6e4e64a --- /dev/null +++ b/qtensor/compression/newsz/nvcomp @@ -0,0 +1 @@ +Subproject commit a6e4e64a177e07cd2e5c8c5e07bb66ffefceae84 diff --git a/qtensor/compression/pytest.ini b/qtensor/compression/pytest.ini new file mode 100644 index 00000000..c24fe5bb --- /dev/null +++ b/qtensor/compression/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning diff --git a/qtensor/compression/szp/include/cuSZp.h b/qtensor/compression/szp/include/cuSZp.h new file mode 100644 index 00000000..d94e2943 --- /dev/null +++ b/qtensor/compression/szp/include/cuSZp.h @@ -0,0 +1,12 @@ +#ifndef CUSZP_INCLUDE_CUSZP_H +#define CUSZP_INCLUDE_CUSZP_H + +static const int cmp_tblock_size = 32; // 32 should be the best, not need to modify. +static const int dec_tblock_size = 32; // 32 should be the best, not need to modify. +static const int cmp_chunk = 8192; +static const int dec_chunk = 8192; + +__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); +__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); + +#endif // CUSZP_INCLUDE_CUSZP_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_entry.h b/qtensor/compression/szp/include/cuSZp_entry.h new file mode 100644 index 00000000..fcdcb420 --- /dev/null +++ b/qtensor/compression/szp/include/cuSZp_entry.h @@ -0,0 +1,12 @@ +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_H + +#include + +void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); +void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); +extern "C" void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +void SZp_dev_new(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +extern "C" void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); + +#endif // CUSZP_INCLUDE_CUSZP_ENTRY_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_timer.h b/qtensor/compression/szp/include/cuSZp_timer.h new file mode 100644 index 00000000..2777a919 --- /dev/null +++ b/qtensor/compression/szp/include/cuSZp_timer.h @@ -0,0 +1,31 @@ +#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H +#define CUSZP_INCLUDE_CUSZP_TIMER_H + +#include +#include + +struct PrivateTimingGPU { + cudaEvent_t start; + cudaEvent_t stop; +}; + +class TimingGPU +{ + private: + PrivateTimingGPU *privateTimingGPU; + + public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + + void StartCounterFlags(); + + float GetCounter(); + +}; + +#endif // CUSZP_INCLUDE_CUSZP_TIMER_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_utility.h b/qtensor/compression/szp/include/cuSZp_utility.h new file mode 100644 index 00000000..32af7040 --- /dev/null +++ b/qtensor/compression/szp/include/cuSZp_utility.h @@ -0,0 +1,14 
@@ +#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H +#define CUSZP_INCLUDE_CUSZP_UTILITY_H + +void symTransForm_4Bytes(unsigned char data[4]); +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); +double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); +double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); +double *computePSNR(size_t nbEle, float *ori_data, float *data); + +#endif // CUSZP_INCLUDE_CUSZP_UTILITY_H \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp.cu b/qtensor/compression/szp/src/cuSZp.cu new file mode 100644 index 00000000..f506ee97 --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp.cu @@ -0,0 +1,393 @@ +#include "cuSZp.h" + +__device__ inline int quantization(float data, float recipPrecision) +{ + float dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5f?0:1; + return (int)(dataRecip+0.5f) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int idx = bid * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk/32; + const int rate_ofs = (nbEle+31)/32; + const float recipPrecision = 0.5f/eb; + + int base_start_idx; + int base_block_start_idx, base_block_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + // Prequantization + Lorenzo Prediction + Fixed-length encoding + store fixed-length to global memory. + base_start_idx = warp * cmp_chunk * 32; + for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; + } + + // Record block info. + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? (32+fixed_rate[j]*32) : 0; + // Write block fixed rate to compressed data. + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + // Write warp(i.e. thread-block)-level prefix-sum to global-memory. + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + if(warp==0) + flag[1] = 2; + else + flag[warp+1] = 1; + } + __syncthreads(); + + // Global-level prefix-sum (exclusive). + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + else + { + if(!lane) cmpOffset[0] = 0; + } + __syncthreads(); + + // Assigning compression bytes by given prefix-sum results. 
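+    // The payload region starts after the (nbEle+31)/32 per-block rate bytes; each
+    // non-zero block then occupies 4 sign bytes plus fixed_rate[j]*4 bit-plane bytes
+    // (e.g. fixed_rate[j]=5 -> 4 + 5*4 = 24 bytes for one 32-value block), while
+    // all-zero blocks contribute nothing beyond their rate byte.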
+ if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + // Bit shuffle for each index, also storing data to global memory. + unsigned int base_cmp_byte_ofs = base_idx; + unsigned int cmp_byte_ofs; + unsigned int tmp_byte_ofs = 0; + unsigned int cur_byte_ofs = 0; + for(int j=0; j= i) tmp_byte_ofs += tmp; + } + unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); + if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; + else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; + + // Operation for each block, if zero block then do nothing. + if(fixed_rate[j]) + { + // Assign sign information for one block. + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + // Assign quant bit information for one block by bit-shuffle. + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + // Get ith bit in 8~15 quant, and store to tmp_char1. + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + // Get ith bit in 16~23 quant, and store to tmp_char2. + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + // Get ith bit in 24-31 quant, and store to tmp_char3. + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. + cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + + // Index updating across different iterations. 
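+        // tmp_byte_ofs is an inclusive prefix sum across the warp, so lane 31 holds
+        // the total bytes written for block j; broadcasting it advances every lane's
+        // base offset for the next iteration.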
+ cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); + } +} + + + +__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int idx = bid * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk/32; + const int rate_ofs = (nbEle+31)/32; + + int base_start_idx; + int base_block_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + // Obtain fixed rate information for each block. + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + // Write warp(i.e. thread-block)-level prefix-sum to global-memory. + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + if(warp==0) + flag[1] = 2; + else + flag[warp+1] = 1; + } + __syncthreads(); + + // Global-level prefix-sum (exclusive). + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + else + { + if(!lane) cmpOffset[0] = 0; + } + __syncthreads(); + + // Retrieving compression bytes and reconstruct decompression data. + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + // Restore bit-shuffle for each block. + unsigned int base_cmp_byte_ofs = base_idx; + unsigned int cmp_byte_ofs; + unsigned int tmp_byte_ofs = 0; + unsigned int cur_byte_ofs = 0; + base_start_idx = warp * dec_chunk * 32; + for(int j=0; j= i) tmp_byte_ofs += tmp; + } + unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); + if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; + else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; + + // Operation for each block, if zero block then do nothing. + if(fixed_rate[j]) + { + // Retrieve sign information for one block. + sign_flag = (0xff000000 & (cmpData[cmp_byte_ofs++] << 24)) | + (0x00ff0000 & (cmpData[cmp_byte_ofs++] << 16)) | + (0x0000ff00 & (cmpData[cmp_byte_ofs++] << 8)) | + (0x000000ff & cmpData[cmp_byte_ofs++]); + + // Retrieve quant data for one block. + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + for(int i=0; i<32; i++) absQuant[i] = 0; + for(int i=0; i> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + // Get ith bit in 8~15 abs quant from global memory. + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + // Get ith bit in 16-23 abs quant from global memory. 
+ absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + // // Get ith bit in 24-31 abs quant from global memory. + absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + + // Delorenzo and store data back to decompression data. + prevQuant = 0; + for(int i=0; i<32; i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + decData[base_block_start_idx+i] = currQuant * eb * 2; + prevQuant = currQuant; + } + } + + // Index updating across different iterations. + cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); + } +} \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp_entry.cu b/qtensor/compression/szp/src/cuSZp_entry.cu new file mode 100644 index 00000000..e92e669a --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_entry.cu @@ -0,0 +1,147 @@ +#include "cuSZp_entry.h" +#include "cuSZp.h" + +void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) +{ + // Data blocking. + int bsize = cmp_tblock_size; + int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk; + + // Initializing global memory for GPU compression. + float* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + unsigned int glob_sync; + cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + + // Obtain compression ratio and move data back to CPU. + cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); + *cmpSize = (size_t)glob_sync + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) +{ + // Data blocking. 
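+    // One thread block (a single 32-thread warp) decodes bsize*dec_chunk =
+    // 32*8192 = 262,144 values, so e.g. nbEle = 1,000,000 gives gsize = 4 and a
+    // padded length pad_nbEle = 1,048,576; d_decData is allocated at the padded size.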
+ int bsize = dec_tblock_size; + int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk; + + // Initializing global memory for GPU compression. + float* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. + cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = cmp_tblock_size; + int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + unsigned int glob_sync; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + // Obtain compression ratio and move data back to CPU. + cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); + *cmpSize = (size_t)glob_sync + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size; + int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_decData, 0, sizeof(float)*nbEle); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + // Free memoy that is used. 
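+    // Unlike the hostptr variants, the deviceptr functions only allocate the
+    // cmpOffset/flag scratch buffers internally; d_decData and d_cmpBytes are
+    // allocated and freed by the caller.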
+ cudaFree(d_cmpOffset); + cudaFree(d_flag); +} diff --git a/qtensor/compression/szp/src/cuSZp_timer.cu b/qtensor/compression/szp/src/cuSZp_timer.cu new file mode 100644 index 00000000..5148af98 --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_timer.cu @@ -0,0 +1,31 @@ +#include "cuSZp_timer.h" + +TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } + +TimingGPU::~TimingGPU() { } + +void TimingGPU::StartCounter() +{ + cudaEventCreate(&((*privateTimingGPU).start)); + cudaEventCreate(&((*privateTimingGPU).stop)); + cudaEventRecord((*privateTimingGPU).start,0); +} + +void TimingGPU::StartCounterFlags() +{ + int eventflags = cudaEventBlockingSync; + + cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); + cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); + cudaEventRecord((*privateTimingGPU).start,0); +} + +// Gets the counter in ms +float TimingGPU::GetCounter() +{ + float time; + cudaEventRecord((*privateTimingGPU).stop, 0); + cudaEventSynchronize((*privateTimingGPU).stop); + cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); + return time; +} diff --git a/qtensor/compression/szp/src/cuSZp_utility.cu b/qtensor/compression/szp/src/cuSZp_utility.cu new file mode 100644 index 00000000..ac4006d7 --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_utility.cu @@ -0,0 +1,493 @@ +// +// Created by Yafan Huang on 5/31/22. +// Copied from SZx. +// +#include +#include +#include +#include +#include +#include "cuSZp_utility.h" + +/*Macro Definition for Processing Data*/ +// #define SZ_SCES 0 //successful +#define RW_SCES 0 +#define RW_FERR 1 +#define RW_TERR 2 +#define LITTLE_ENDIAN_SYSTEM 0 +#define QCAT_BUFS 64 + +/*Global Varaibles for Processing Data*/ +int dataEndianType_Yafan = 0; +int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian + +typedef union lint32 +{ + int ivalue; + unsigned int uivalue; + unsigned char byte[4]; +} lint32; + +typedef union llfloat +{ + float value; + unsigned int ivalue; + unsigned char byte[4]; +} llfloat; + +/** ************************************************************************ + * @brief Reverse 4-bit-length unsigned char array. + * + * @param data[4] 4-bit-length unsigned char array. + * *********************************************************************** */ +void symTransForm_4Bytes(unsigned char data[4]) +{ + unsigned char tmp = data[0]; + data[0] = data[3]; + data[3] = tmp; + + tmp = data[1]; + data[1] = data[2]; + data[2] = tmp; +} + +/** ************************************************************************ + * @brief Read byte data from path to source binary format file. + * Usually used for decompressing data from input file. + * Variables byteLength and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param byteLength the length of byte array + * @param status data processing states (macro definitions) + * + * @return byteBuf unsigned char array with length byteLength + * *********************************************************************** */ +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) +{ + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 
1\n"); + *status = RW_FERR; + return 0; + } + fseek(pFile, 0, SEEK_END); + *byteLength = ftell(pFile); + fclose(pFile); + + unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return 0; + } + fread(byteBuf, 1, *byteLength, pFile); + fclose(pFile); + *status = RW_SCES; + return byteBuf; +} + +/** ************************************************************************ + * @brief Read float data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/4; + fclose(pFile); + + if(inSize<=0) + { + printf("Error: input file is wrong!\n"); + *status = RW_FERR; + } + + float *daBuf = (float *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return NULL; + } + fread(daBuf, 4, *nbEle, pFile); + fclose(pFile); + *status = RW_SCES; + return daBuf; +} + +/** ************************************************************************ + * @brief Read float data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = RW_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state == RW_FERR) + { + *status = RW_FERR; + return NULL; + } + float *daBuf = (float *)malloc(byteLength); + *nbEle = byteLength/4; + + llfloat buf; + for(i = 0;i<*nbEle;i++) + { + j = i*4; + memcpy(buf.byte, bytes+j, 4); + symTransForm_4Bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + +/** ************************************************************************ + * @brief Write byte data to binary format file. + * Usually used for writing compressed data. + * Variable status can be obtained/switched through this function. 
+ * + * @param bytes unsigned char array (compressed data) + * @param byteLength the length of unsigned char array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) +{ + FILE *pFile = fopen(tgtFilePath, "wb"); + if (pFile == NULL) + { + printf("Failed to open input file. 3\n"); + *status = RW_FERR; + return; + } + + fwrite(bytes, 1, byteLength, pFile); //write outSize bytes + fclose(pFile); + *status = RW_SCES; +} + +/** ************************************************************************ + * @brief Write float data to binary format file. + * Usually used for writing decompressed (reconstructed) data. + * Variable status can be obtained/switched through this function. + * + * @param bytes unsigned char array (compressed data) + * @param nbEle the length of float array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) +{ + size_t i = 0; + int state = RW_SCES; + llfloat buf; + unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); + for(i=0;idata[index]) + xMin=data[index]; + if(xMaxother[index]) + yMin=other[index]; + if(yMaxsize0) { + printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); + } + if(windowSize1>size1) { + printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); + } + if(windowSize2>size2) { + printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); + } + //offsetInc0=windowSize0/2; + //offsetInc1=windowSize1/2; + //offsetInc2=windowSize2/2; + offsetInc0=windowShift0; + offsetInc1=windowShift1; + offsetInc2=windowShift2; + for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW + for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW + for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW + nw++; + ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); + } + } + } + return ssimSum/nw; +} + + +/** ************************************************************************ + * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. + * API for computing PSNR. + * + * @param nbEle the length of float array + * @param ori_data original float array + * @param dec_data decompressed (reconstructed) float array + * + * @return result 6-length double array, which contains: + * 0. *Mean Square Error (MSE)* + * 1. *Value Range (Max-Min)* + * 2. *Peak Signal-to-noise Ratio (PSNR)* + * 3. Squared Error + * 4. Normalized Squared Error + * 5. Normalized Squared MSE + * *********************************************************************** */ +double *computePSNR(size_t nbEle, float *ori_data, float *data) { + size_t i = 0; + double Max = 0, Min = 0, diffMax = 0; + Max = ori_data[0]; + Min = ori_data[0]; + diffMax = data[0] > ori_data[0] ? 
data[0] - ori_data[0] : ori_data[0] - data[0]; + + //diffMax = fabs(data[0] - ori_data[0]); + double sum1 = 0, sum2 = 0, sum22 = 0; + + for (i = 0; i < nbEle; i++) { + sum1 += ori_data[i]; + sum2 += data[i]; + sum22 += data[i] * data[i]; + } + double mean1 = sum1 / nbEle; + double mean2 = sum2 / nbEle; + + double sum3 = 0, sum4 = 0; + double sum = 0, prodSum = 0, relerr = 0; + + double maxpw_relerr = 0; + for (i = 0; i < nbEle; i++) { + if (Max < ori_data[i]) Max = ori_data[i]; + if (Min > ori_data[i]) Min = ori_data[i]; + + float err = fabs(data[i] - ori_data[i]); + if (ori_data[i] != 0) { + relerr = err / fabs(ori_data[i]); + if (maxpw_relerr < relerr) + maxpw_relerr = relerr; + } + + if (diffMax < err) + diffMax = err; + prodSum += (ori_data[i] - mean1) * (data[i] - mean2); + sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); + sum4 += (data[i] - mean2) * (data[i] - mean2); + sum += err * err; + } + double std1 = sqrt(sum3 / nbEle); + double std2 = sqrt(sum4 / nbEle); + double ee = prodSum / nbEle; + double acEff = ee / std1 / std2; + + double mse = sum / nbEle; + double range = Max - Min; + double psnr = 20 * log10(range) - 10 * log10(mse); + double normErr = sqrt(sum); + double normErr_norm = normErr / sqrt(sum22); + double nrmse = sqrt(mse) / range; + double *result = (double *) malloc(sizeof(double) * 6); + result[0] = mse; + result[1] = range; + result[2] = psnr; + result[3] = normErr; + result[4] = normErr_norm; + result[5] = nrmse; + + return result; +} \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.cu b/qtensor/compression/szp/src/cuSZp_wrapper.cu new file mode 100644 index 00000000..803dbbe1 --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_wrapper.cu @@ -0,0 +1,37 @@ +#include "cuSZp_entry.h" +#include "cuSZp_timer.h" +#include "cuSZp_utility.h" +#include "cuSZp.h" + + +extern "C"{ + /** Before entering SZp_compress, must allocate on device: + * - d_cmpBytes + */ + unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ + unsigned char *d_cmpBytes, *d_finalCmpBytes; + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaMalloc((void**)&d_cmpBytes, sizeof(float)*nbEle); + SZp_compress_deviceptr(oriData, d_cmpBytes, nbEle, outSize, absErrBound, stream); + cudaMalloc((void**)&d_finalCmpBytes, *outSize); + cudaMemcpy(d_finalCmpBytes, d_cmpBytes, *outSize, cudaMemcpyDeviceToDevice); + cudaFree(d_cmpBytes); + //cudaFree(oriData); + return d_finalCmpBytes; + } + + /** Before entering SZp_decompress, must allocate on device: + * - d_decData + */ + float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ + float *d_decData; + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaMalloc((void**)&d_decData, sizeof(float)*nbEle); + SZp_decompress_deviceptr(d_decData, cmpBytes, nbEle, cmpSize, errorBound, stream); + cudaFree(cmpBytes); + return d_decData; + } + +} diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.py b/qtensor/compression/szp/src/cuSZp_wrapper.py new file mode 100644 index 00000000..4e887a3b --- /dev/null +++ b/qtensor/compression/szp/src/cuSZp_wrapper.py @@ -0,0 +1,190 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') +#LIB_PATH = '/home/mkshah5/QTensor/qtensor/compression/szp/src/libcuszp_wrapper.so' +# unsigned char* 
cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ + +def get_device_compress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZp_device_compress + # Returns: unsigned char *bytes + # Needs: float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_float, c_size_t] + func.restype = POINTER(c_ubyte) + return func + +# float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ + +def get_device_decompress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZp_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [c_size_t, POINTER(c_ubyte), c_size_t, c_float] + func.restype = POINTER(c_float) + return func + + + +def cuszp_device_compress(oriData, absErrBound, nbEle,threshold): + __cuszp_device_compress = get_device_compress() + + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + #ori_real = oriData.real + #ori_imag = oriData.imag + #oriData = cp.concatenate((ori_real, ori_imag)) + #sample = oriData[::2] + + + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) + d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real + absErrBound = absErrBound*(d) + threshold = threshold*(d) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = cp.absolute(oriData)<=threshold + #oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + # oriData = oriData[truth_values] + bitmap = truth_values + nbEle = oriData.shape[0]*2 + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") + # float *oriData, size_t *outSize, float absErrBound, size_t nbEle + o_bytes = __cuszp_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle)) + + mempool = cp.get_default_memory_pool() + pinned_mempool = cp.get_default_pinned_memory_pool() + #del oriData + + #print("tg and max time (s): "+str(time.time()-s_1)) + #print("bitmap shape: "+str(bitmap.shape[0])) + #print("percent nonzero bytes: "+str(bitmap[cp.nonzero(bitmap)].shape[0]/bitmap.shape[0])) + #print("CR") + #print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) + return (o_bytes,bitmap, absErrBound), outSize + + +def cuszp_device_decompress(nbEle, cmpBytes, cmpSize, owner, dtype): + __cuszp_device_decompress=get_device_decompress() + (cmpBytes, bitmap, absErrBound) = cmpBytes + #print("bitmap len:" +str(len(bitmap))) + #print(nbEle) + #tmp_nbEle = nbEle + # tmp_nbEle = cp.count_nonzero(bitmap).item() +# print(tmp_nbEle) + nbEle_p = ctypes.c_size_t(nbEle) + # size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound + newData = __cuszp_device_decompress(nbEle_p,cmpBytes, np.ulonglong(cmpSize), np.float32(absErrBound)) + + # decompressed_ptr = self.cuszp_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = 
cp.cuda.UnownedMemory(decompressed_int.value, nbEle, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=nbEle, dtype=cp.float32, memptr=mem_ptr) +# print("attempt alloc") + # res = cp.zeros(nbEle,dtype=cp.float32) +# print("alloc passed") + ## need to convert newData to cupy + # cp.putmask(res,bitmap,arr) + mempool = cp.get_default_memory_pool() + pinned_mempool = cp.get_default_pinned_memory_pool() + #del arr + + #print(res[0]) + #print(res[int(nbEle/2)]) + #reshaped_data = arr.reshape(-1,2) + reshaped_data = arr.reshape(-1,2) + #c_res = arr + c_res = reshaped_data.view(dtype=np.complex64) + #print(c_res[0]) + #c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = res[0:int(nbEle/2)] + #c_res.imag = res[int(nbEle/2):] + #del res + #del bitmap + #mempool.free_all_blocks() + #pinned_mempool.free_all_blocks() + + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("real_sample.bin", dtype=np.float32) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(30): + s_time = time.time() + o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, DATA_SIZE,r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE, o_bytes,outSize[0], comp, in_vector_gpu.dtype) + + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/szx/cuda-samples b/qtensor/compression/szx/cuda-samples new file mode 160000 index 00000000..e4789153 --- /dev/null +++ b/qtensor/compression/szx/cuda-samples @@ -0,0 +1 @@ +Subproject commit e4789153d539b2d2f3976050057a52a1518abcf0 diff --git a/qtensor/compression/szx/include/DynamicByteArray.h b/qtensor/compression/szx/include/DynamicByteArray.h new file mode 100644 index 00000000..9f50a2ac --- /dev/null +++ b/qtensor/compression/szx/include/DynamicByteArray.h @@ -0,0 +1,36 @@ +/** + * @file DynamicByteArray.h + * @author Sheng Di + * @date April, 2016 + * @brief Header file for Dynamic Byte Array. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
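+ * A DynamicByteArray is an append-only byte buffer with size/capacity bookkeeping:
+ * new_DBA reserves an initial capacity, addDBA_Data/memcpyDBA_Data append data,
+ * and convertDBAtoBytes exposes the accumulated bytes as a plain unsigned char array.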
+ */ + +#ifndef _DynamicByteArray_H +#define _DynamicByteArray_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +typedef struct DynamicByteArray +{ + unsigned char* array; + size_t size; + size_t capacity; +} DynamicByteArray; + +void new_DBA(DynamicByteArray **dba, size_t cap); +void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes); +void free_DBA(DynamicByteArray *dba); +unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos); +extern void addDBA_Data(DynamicByteArray *dba, unsigned char value); +extern void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _DynamicByteArray_H ----- */ diff --git a/qtensor/compression/szx/include/DynamicDoubleArray.h b/qtensor/compression/szx/include/DynamicDoubleArray.h new file mode 100644 index 00000000..9a3ef4b6 --- /dev/null +++ b/qtensor/compression/szx/include/DynamicDoubleArray.h @@ -0,0 +1,36 @@ +/** + * @file DynamicDoubleArray.h + * @author Sheng Di + * @date April, 2016 + * @brief Header file for Dynamic Double Array. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _DynamicDoubleArray_H +#define _DynamicDoubleArray_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct DynamicDoubleArray +{ + double* array; + size_t size; + double capacity; +} DynamicDoubleArray; + +void new_DDA(DynamicDoubleArray **dda, size_t cap); +void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data); +void free_DDA(DynamicDoubleArray *dda); +double getDDA_Data(DynamicDoubleArray *dda, size_t pos); +void addDDA_Data(DynamicDoubleArray *dda, double value); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _DynamicDoubleArray_H ----- */ diff --git a/qtensor/compression/szx/include/DynamicFloatArray.h b/qtensor/compression/szx/include/DynamicFloatArray.h new file mode 100644 index 00000000..2770f786 --- /dev/null +++ b/qtensor/compression/szx/include/DynamicFloatArray.h @@ -0,0 +1,35 @@ +/** + * @file DynamicFloatArray.h + * @author Sheng Di + * @date April, 2016 + * @brief Header file for Dynamic Float Array. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _DynamicFloatArray_H +#define _DynamicFloatArray_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +typedef struct DynamicFloatArray +{ + float* array; + size_t size; + size_t capacity; +} DynamicFloatArray; + +void new_DFA(DynamicFloatArray **dfa, size_t cap); +void convertDFAtoFloats(DynamicFloatArray *dfa, float **data); +void free_DFA(DynamicFloatArray *dfa); +float getDFA_Data(DynamicFloatArray *dfa, size_t pos); +void addDFA_Data(DynamicFloatArray *dfa, float value); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _DynamicFloatArray_H ----- */ diff --git a/qtensor/compression/szx/include/DynamicIntArray.h b/qtensor/compression/szx/include/DynamicIntArray.h new file mode 100644 index 00000000..b9c0a4f3 --- /dev/null +++ b/qtensor/compression/szx/include/DynamicIntArray.h @@ -0,0 +1,35 @@ +/** + * @file DynamicIntArray.h + * @author Sheng Di + * @date April, 2016 + * @brief Header file for Dynamic Int Array. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _DynamicIntArray_H +#define _DynamicIntArray_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +typedef struct DynamicIntArray +{ + unsigned char* array; //char* (one byte) is enough, don't have to be int* + size_t size; + size_t capacity; +} DynamicIntArray; + +void new_DIA(DynamicIntArray **dia, size_t cap); +void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data); +void free_DIA(DynamicIntArray *dia); +int getDIA_Data(DynamicIntArray *dia, size_t pos); +extern void addDIA_Data(DynamicIntArray *dia, int value); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _DynamicIntArray_H ----- */ diff --git a/qtensor/compression/szx/include/MultiLevelCacheTableWideInterval.h b/qtensor/compression/szx/include/MultiLevelCacheTableWideInterval.h new file mode 100644 index 00000000..853d14bc --- /dev/null +++ b/qtensor/compression/szx/include/MultiLevelCacheTableWideInterval.h @@ -0,0 +1,54 @@ +/** + * @file MultiLevelCacheTableWideInterval.h + * @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao + * @date Jan, 2019 + * @brief Header file for MultiLevelCacheTableWideInterval.c. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + + +#ifndef _MULTILEVELCACHETABLEWIDEINTERVAL_H +#define _MULTILEVELCACHETABLEWIDEINTERVAL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include "stdio.h" + +typedef struct SubLevelTableWideInterval{ + uint64_t baseIndex; + uint64_t topIndex; + uint16_t* table; + uint16_t expoIndex; +} SubLevelTableWideInterval; + +typedef struct TopLevelTableWideInterval{ + uint16_t bits; + uint16_t baseIndex; + uint16_t topIndex; + struct SubLevelTableWideInterval* subTables; + double bottomBoundary; + double topBoundary; +} TopLevelTableWideInterval; + +void freeTopLevelTableWideInterval(struct TopLevelTableWideInterval* topTable); + +uint16_t MLCTWI_GetExpoIndex(double value); +uint16_t MLCTWI_GetRequiredBits(double precision); +uint64_t MLCTWI_GetMantiIndex(double value, int bits); + +double MLTCWI_RebuildDouble(uint16_t expo, uint64_t manti, int bits); +void MultiLevelCacheTableWideIntervalBuild(struct TopLevelTableWideInterval* topTable, double* precisionTable, int count, double precision, int plus_bits); +uint32_t MultiLevelCacheTableWideIntervalGetIndex(double value, struct TopLevelTableWideInterval* topLevelTable); +void MultiLevelCacheTableWideIntervalFree(struct TopLevelTableWideInterval* table); + +#ifdef __cplusplus +} +#endif + +#endif //_MULTILEVELCACHETABLEWIDEINTERVAL_H diff --git a/qtensor/compression/szx/include/cuszx_entry.h b/qtensor/compression/szx/include/cuszx_entry.h new file mode 100644 index 00000000..34638319 --- /dev/null +++ b/qtensor/compression/szx/include/cuszx_entry.h @@ -0,0 +1,18 @@ +#ifndef CUSZX_ENTRY_H +#define CUSZX_ENTRY_H + +#include +#include "cuszx_float.h" +#include "cuszxd_float.h" + +#define GPU + +extern "C" unsigned char* cuSZx_fast_compress_args_unpredictable_blocked_float(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold); + +extern "C" void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes); + +extern "C" unsigned char* device_ptr_cuSZx_compress_float(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold); + +extern "C" float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes); + 
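+/* Sketch of a host-side round trip with the host-pointer entry points (error
+ * handling and CUDA setup omitted; the data/absErrBound/nbEle/threshold variables
+ * and the blockSize of 256 are illustrative, not prescribed values):
+ *
+ *   size_t outSize = 0;
+ *   unsigned char *cmp = cuSZx_fast_compress_args_unpredictable_blocked_float(
+ *       data, &outSize, absErrBound, nbEle, 256, threshold);
+ *   float *dec = NULL;
+ *   cuSZx_fast_decompress_args_unpredictable_blocked_float(&dec, nbEle, cmp);
+ *
+ * The device_ptr_* variants follow the same pairing but, as the names suggest,
+ * exchange device pointers instead of host buffers.
+ */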
+#endif /* ----- #ifndef CUSZX_ENTRY_H ----- */ diff --git a/qtensor/compression/szx/include/cuszx_float.h b/qtensor/compression/szx/include/cuszx_float.h new file mode 100644 index 00000000..a933c2db --- /dev/null +++ b/qtensor/compression/szx/include/cuszx_float.h @@ -0,0 +1,22 @@ +#ifndef CUSZX_FLOAT_H +#define CUSZX_FLOAT_H + +#include + +// Utilities and system includes +#include // helper function CUDA error checking and initialization +#include // helper for shared functions common to CUDA Samples + +#define FULL_MASK 0xffffffff + +__device__ +void reduction(double sum1, double sum2, + double minDiff, double maxDiff, double sumDiff, double sumOfDiffSquare, + double minErr, double maxErr, double sumErr, double sumErrSqr); + +__global__ void compress_float(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, float absErrBound, int bs, size_t nb, size_t mSize, float sparsity_level, uint32_t *blk_idx, uint8_t *blk_subidx,float *blk_vals,float threshold, uint8_t *blk_sig); + +__global__ void get_numsig(uint64_t *num_sig); + +__global__ void apply_threshold(float *data, float threshold, size_t length); +#endif /* ----- #ifndef CUSZX_COMPRESS_FLOAT_H ----- */ diff --git a/qtensor/compression/szx/include/cuszxd_float.h b/qtensor/compression/szx/include/cuszxd_float.h new file mode 100644 index 00000000..b203f707 --- /dev/null +++ b/qtensor/compression/szx/include/cuszxd_float.h @@ -0,0 +1,14 @@ +#ifndef CUSZXD_FLOAT_H +#define CUSZXD_FLOAT_H + +#include + +// Utilities and system includes +#include // helper function CUDA error checking and initialization +#include // helper for shared functions common to CUDA Samples + +__global__ void decompress_float(unsigned char *data, int bs, size_t nc, size_t mSize); + +__global__ void decompress_state2(float *out, unsigned char* stateArray, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx,uint32_t blockSize, uint8_t *blk_sig); + +#endif /* ----- #ifndef CUSZX_DECOMPRESS_FLOAT_H ----- */ diff --git a/qtensor/compression/szx/include/szx.h b/qtensor/compression/szx/include/szx.h new file mode 100644 index 00000000..a6872950 --- /dev/null +++ b/qtensor/compression/szx/include/szx.h @@ -0,0 +1,92 @@ +/** + * @file szx.h + * @author Sheng Di + * @date April, 2022 + * @brief Header file for the whole compressor. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
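+ * Declares the generic SZ_fast_compress_args / SZ_fast_decompress entry points,
+ * which take a fastMode and data type together with up to five dimension sizes
+ * (r5..r1), plus the lint*/lfloat/ldouble byte unions.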
+ */ + +#ifndef _SZX_H +#define _SZX_H + +#include +#include +#include /* For gettimeofday(), in microseconds */ +#include /* For time(), in seconds */ +#include "szx_float.h" +#include "szx_rw.h" +#include "szx_utility.h" + +#ifdef _WIN32 +#define PATH_SEPARATOR ';' +#else +#define PATH_SEPARATOR ':' +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include "szx_defines.h" +#include "szx_double.h" +#include "szxd_double.h" +#include "szx_float.h" +#include "szxd_float.h" +#include "szx_TypeManager.h" + +typedef union lint16 +{ + unsigned short usvalue; + short svalue; + unsigned char byte[2]; +} lint16; + +typedef union lint32 +{ + int ivalue; + unsigned int uivalue; + unsigned char byte[4]; +} lint32; + +typedef union lint64 +{ + long lvalue; + unsigned long ulvalue; + unsigned char byte[8]; +} lint64; + +typedef union ldouble +{ + double value; + unsigned long lvalue; + unsigned char byte[8]; +} ldouble; + +typedef union lfloat +{ + float value; + unsigned int ivalue; + unsigned char byte[4]; +} lfloat; + + +extern int versionNumber[4]; + +//-------------------key global variables-------------- +extern int dataEndianType; //*endian type of the data read from disk +extern int sysEndianType; //*sysEndianType is actually set automatically. + +int computeDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); +size_t computeDataLength(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); +int filterDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t* correctedDimension); +unsigned char* SZ_fast_compress_args(int fastMode, int dataType, void *data, size_t *outSize, int errBoundMode, float absErrBound, +float relBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); +void* SZ_fast_decompress_pred(int dataType, float* preData, unsigned char *curBytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); +void* SZ_fast_decompress(int fastMode, int dataType, unsigned char *bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_H ----- */ diff --git a/qtensor/compression/szx/include/szx_BytesToolkit.h b/qtensor/compression/szx/include/szx_BytesToolkit.h new file mode 100644 index 00000000..027afe97 --- /dev/null +++ b/qtensor/compression/szx/include/szx_BytesToolkit.h @@ -0,0 +1,75 @@ +/** + * @file szx_ByteToolkit.h + * @author Sheng Di + * @date Feb, 2022 + * @brief Header file for the ByteToolkit.c. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
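+ * Declares endian-aware conversions between raw bytes and integer/floating-point
+ * values, plus the sz_writeBits_Fast_* helpers for bit-level writes into a byte buffer.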
+ */ + +#ifndef _SZX_ByteToolkit_H +#define _SZX_ByteToolkit_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +void sz_writeBits_Fast_int8(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, unsigned char data); +void sz_writeBits_Fast_int32(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, int32_t data); +void sz_writeBits_Fast_int64(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, int64_t data); +unsigned short bytesToUInt16_bigEndian(unsigned char* bytes); +unsigned int bytesToUInt32_bigEndian(unsigned char* bytes); +unsigned long bytesToUInt64_bigEndian(unsigned char* b); +short bytesToInt16_bigEndian(unsigned char* bytes); +int bytesToInt32_bigEndian(unsigned char* bytes); +long bytesToInt64_bigEndian(unsigned char* b); +int bytesToInt_bigEndian(unsigned char* bytes); +void intToBytes_bigEndian(unsigned char *b, unsigned int num); +void int64ToBytes_bigEndian(unsigned char *b, uint64_t num); +void int32ToBytes_bigEndian(unsigned char *b, uint32_t num); +void int16ToBytes_bigEndian(unsigned char *b, uint16_t num); +long bytesToLong_bigEndian(unsigned char* b); +void longToBytes_bigEndian(unsigned char *b, unsigned long num) ; +long doubleToOSEndianLong(double value); +int floatToOSEndianInt(float value); +short getExponent_float(float value); +short getPrecisionReqLength_float(float precision); +short getExponent_double(double value); +short getPrecisionReqLength_double(double precision); +unsigned char numberOfLeadingZeros_Int(int i); +unsigned char numberOfLeadingZeros_Long(long i); +unsigned char getLeadingNumbers_Int(int v1, int v2); +unsigned char getLeadingNumbers_Long(long v1, long v2); +short bytesToShort(unsigned char* bytes); +void shortToBytes(unsigned char* b, short value); +int bytesToInt(unsigned char* bytes); +long bytesToLong(unsigned char* bytes); +float bytesToFloat(unsigned char* bytes); +void floatToBytes(unsigned char *b, float num); +double bytesToDouble(unsigned char* bytes); +void doubleToBytes(unsigned char *b, double num); +int getMaskRightCode(int m); +int getLeftMovingCode(int kMod8); +int getRightMovingSteps(int kMod8, int resiBitLength); +int getRightMovingCode(int kMod8, int resiBitLength); +short* convertByteDataToShortArray(unsigned char* bytes, size_t byteLength); +unsigned short* convertByteDataToUShortArray(unsigned char* bytes, size_t byteLength); +void convertShortArrayToBytes(short* states, size_t stateLength, unsigned char* bytes); +void convertUShortArrayToBytes(unsigned short* states, size_t stateLength, unsigned char* bytes); +void convertIntArrayToBytes(int* states, size_t stateLength, unsigned char* bytes); +void convertUIntArrayToBytes(unsigned int* states, size_t stateLength, unsigned char* bytes); +void convertLongArrayToBytes(int64_t* states, size_t stateLength, unsigned char* bytes); +void convertULongArrayToBytes(uint64_t* states, size_t stateLength, unsigned char* bytes); +size_t bytesToSize(unsigned char* bytes); +void sizeToBytes(unsigned char* outBytes, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_ByteToolkit_H ----- */ + diff --git a/qtensor/compression/szx/include/szx_TypeManager.h b/qtensor/compression/szx/include/szx_TypeManager.h new file mode 100644 index 00000000..f4409104 --- /dev/null +++ b/qtensor/compression/szx/include/szx_TypeManager.h @@ -0,0 +1,35 @@ +/** + * @file TypeManager.h + * @author Sheng Di + * @date July, 2017 + * @brief Header file for the TypeManager.c. 
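+ *        Declares helpers that pack arrays of small integer flags into 1-bit or
+ *        2-bit-per-entry byte streams and unpack them again
+ *        (convertIntArray2ByteArray_fast_1b/2b and their byte-to-int counterparts).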
+ * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _SZX_TypeManager_H +#define _SZX_TypeManager_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +size_t convertIntArray2ByteArray_fast_1b_args(unsigned char* intArray, size_t intArrayLength, unsigned char *result); +size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArrayLength, unsigned char **result); +size_t convertIntArray2ByteArray_fast_1b_to_result(unsigned char* intArray, size_t intArrayLength, unsigned char *result); +void convertByteArray2IntArray_fast_1b_args(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char* intArray); +void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray); +size_t convertIntArray2ByteArray_fast_2b_args(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result); +size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result); +void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray); +int getLeftMovingSteps(size_t k, unsigned char resiBitLength); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_TypeManager_H ----- */ + diff --git a/qtensor/compression/szx/include/szx_dataCompression.h b/qtensor/compression/szx/include/szx_dataCompression.h new file mode 100644 index 00000000..afce931b --- /dev/null +++ b/qtensor/compression/szx/include/szx_dataCompression.h @@ -0,0 +1,67 @@ +/** + * @file szx_dataCompression.h + * @author Sheng Di + * @date July, 2022 + * @brief Header file for the dataCompression.c. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _SZX_DataCompression_H +#define _SZX_DataCompression_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "szx.h" +#include +#include + +#define computeMinMax(data) \ + for(i=1;idata_)\ + min = data_;\ + else if(max + +#ifndef _SZ_Double_H +#define _SZ_Double_H + +#ifdef __cplusplus +extern "C" { +#endif + +void SZ_fast_compress_args_unpredictable_one_block_double(double *oriData, size_t nbEle, float absErrBound, + unsigned char *outputBytes, int *outSize, + unsigned char *leadNumberArray_int, float medianValue, + float radius); + +size_t computeStateMedianRadius_double(double *oriData, size_t nbEle, float absErrBound, int blockSize, + unsigned char *stateArray, float *medianArray, float *radiusArray) ; + +void max_min_double(double *x, int n, double *tmp_max, double *tmp_min); + +void simd_max_min_double(double *x, int n, double *tmp_max, double *tmp_min); + +void computeStateMedianRadius_double2(double *oriData, size_t nbEle, float absErrBound, + unsigned char *state, float *median, float *radius) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_double(double *oriData, size_t *outSize, float absErrBound, size_t nbEle, + int blockSize) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_double_openmp(double *oriData, size_t *outSize, + float absErrBound, size_t nbEle, int blockSize) ; + + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_double(double *oriData, size_t *outSize, + float absErrBound, size_t nbEle, int blockSize) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_double(double *data, size_t *outSize, float absErrBound, size_t r5, size_t r4, + size_t r3, size_t r2, size_t r1, float mValue, float radius); + +unsigned char *SZ_skip_compress_double(double *data, size_t dataLength, size_t *outSize) ; + +void computeReqLength_double(float realPrecision, short radExpo, int *reqLength, float *medianValue) ; + + + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZ_Double_H ----- */ + diff --git a/qtensor/compression/szx/include/szx_float.h b/qtensor/compression/szx/include/szx_float.h new file mode 100644 index 00000000..57e6388f --- /dev/null +++ b/qtensor/compression/szx/include/szx_float.h @@ -0,0 +1,63 @@ +/** + * @file sz_float.h + * @author Sheng Di + * @date July, 2017 + * @brief Header file for the sz_float.c. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _SZ_Float_H +#define _SZ_Float_H + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned char * SZ_fast_compress_args_with_prediction_float(float *pred, float *data, size_t *outSize, float absErrBound, size_t r5, + size_t r4, size_t r3, size_t r2, size_t r1, float medianValue, float radius); + +void SZ_fast_compress_args_unpredictable_one_block_float(float *oriData, size_t nbEle, float absErrBound, + unsigned char *outputBytes, int *outSize, + unsigned char *leadNumberArray_int, float medianValue, + float radius); + +size_t computeStateMedianRadius_float(float *oriData, size_t nbEle, float absErrBound, int blockSize, + unsigned char *stateArray, float *medianArray, float *radiusArray) ; + +void max_min_float(float *x, int n, float *tmp_max, float *tmp_min); + +void simd_max_min_float(float *x, int n, float *tmp_max, float *tmp_min); + +void computeStateMedianRadius_float2(float *oriData, size_t nbEle, float absErrBound, + unsigned char *state, float *median, float *radius) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_float(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, + int blockSize) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_float_openmp(float *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) ; + + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_float(float *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) ; + +unsigned char * +SZ_fast_compress_args_unpredictable_float(float *data, size_t *outSize, float absErrBound, size_t r5, size_t r4, + size_t r3, size_t r2, size_t r1, float mValue, float radius); + +unsigned char *SZ_skip_compress_float(float *data, size_t dataLength, size_t *outSize) ; + +void computeReqLength_float(double realPrecision, short radExpo, int *reqLength, float *medianValue) ; + + + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZ_Float_H ----- */ + diff --git a/qtensor/compression/szx/include/szx_rw.h b/qtensor/compression/szx/include/szx_rw.h new file mode 100644 index 00000000..551dea0f --- /dev/null +++ b/qtensor/compression/szx/include/szx_rw.h @@ -0,0 +1,89 @@ +/** + * @file szx_rw.h + * @author Sheng Di + * @date Jan, 2022 + * @brief Header file for the whole io interface. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _SZX_RW_H +#define _SZX_RW_H + +#include +#include + +#ifdef _WIN32 +#define PATH_SEPARATOR ';' +#else +#define PATH_SEPARATOR ':' +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +int checkFileExistance(char* filePath); + +float** create2DArray_float(size_t m, size_t n); +void free2DArray_float(float** data, size_t m); +float*** create3DArray_float(size_t p, size_t m, size_t n); +void free3DArray_float(float*** data, size_t p, size_t m); +double** create2DArray_double(size_t m, size_t n); +void free2DArray_double(double** data, size_t m); +double*** create3DArray_double(size_t p, size_t m, size_t n); +void free3DArray_double(double*** data, size_t p, size_t m); +size_t checkFileSize(char *srcFilePath, int *status); + +unsigned char *readByteData(char *srcFilePath, size_t *byteLength, int *status); +double *readDoubleData(char *srcFilePath, size_t *nbEle, int *status); +int8_t *readInt8Data(char *srcFilePath, size_t *nbEle, int *status); +int16_t *readInt16Data(char *srcFilePath, size_t *nbEle, int *status); +uint16_t *readUInt16Data(char *srcFilePath, size_t *nbEle, int *status); +int32_t *readInt32Data(char *srcFilePath, size_t *nbEle, int *status); +uint32_t *readUInt32Data(char *srcFilePath, size_t *nbEle, int *status); +int64_t *readInt64Data(char *srcFilePath, size_t *nbEle, int *status); +uint64_t *readUInt64Data(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData(char *srcFilePath, size_t *nbEle, int *status); +unsigned short* readShortData(char *srcFilePath, size_t *dataLength, int *status); + +double *readDoubleData_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +int8_t *readInt8Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +int16_t *readInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +uint16_t *readUInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +int32_t *readInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +uint32_t *readUInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +int64_t *readInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +uint64_t *readUInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData_systemEndian(char *srcFilePath, size_t *nbEle, int *status); + +void writeByteData(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); +void writeDoubleData(double *data, size_t nbEle, char *tgtFilePath, int *status); +void writeFloatData(float *data, size_t nbEle, char *tgtFilePath, int *status); +void writeData(void *data, int dataType, size_t nbEle, char *tgtFilePath, int *status); +void writeFloatData_inBytes(float *data, size_t nbEle, char* tgtFilePath, int *status); +void writeDoubleData_inBytes(double *data, size_t nbEle, char* tgtFilePath, int *status); +void writeShortData_inBytes(short *states, size_t stateLength, char *tgtFilePath, int *status); +void writeUShortData_inBytes(unsigned short *states, size_t stateLength, char *tgtFilePath, int *status); +void writeIntData_inBytes(int *states, size_t stateLength, char *tgtFilePath, int *status); +void writeUIntData_inBytes(unsigned int *states, size_t stateLength, char *tgtFilePath, int *status); +void writeLongData_inBytes(int64_t *states, size_t stateLength, char *tgtFilePath, int *status); +void writeULongData_inBytes(uint64_t *states, size_t stateLength, char *tgtFilePath, int *status); + +void writeStrings(int nbStr, char *str[], char *tgtFilePath, int *status); + +//void 
convertToPFM_float(float *data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int endianType, char *tgtFilePath, int *status); + +void checkfilesizec_(char *srcFilePath, int *len, size_t *filesize); +void readbytefile_(char *srcFilePath, int *len, unsigned char *bytes, size_t *byteLength); +void readdoublefile_(char *srcFilePath, int *len, double *data, size_t *nbEle); +void readfloatfile_(char *srcFilePath, int *len, float *data, size_t *nbEle); +void writebytefile_(unsigned char *bytes, size_t *byteLength, char *tgtFilePath, int *len); +void writedoublefile_(double *data, size_t *nbEle, char *tgtFilePath, int *len); +void writefloatfile_(float *data, size_t *nbEle, char *tgtFilePath, int *len); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_RW_H ----- */ diff --git a/qtensor/compression/szx/include/szx_utility.h b/qtensor/compression/szx/include/szx_utility.h new file mode 100644 index 00000000..133c2816 --- /dev/null +++ b/qtensor/compression/szx/include/szx_utility.h @@ -0,0 +1,37 @@ +/** + * @file szx_utility.h + * @author Sheng Di + * @date Feb, 2022 + * @brief Header file for the utility.c. + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _SZX_UTILITY_H +#define _SZX_UTILITY_H + +#include "szx.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//sihuan added: use a assistant struct to do sorting and swap that are easy to implement: should +//consider optimizing the performance later. +typedef struct sort_ast_particle{ + int64_t id; + float var[6]; +} sort_ast_particle; + +extern struct timeval sz_costStart; /*only used for recording the cost*/ +extern double sz_totalCost; + +void sz_cost_start(); +void sz_cost_end(); +void sz_cost_end_msg(char *); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZX_UTILITY_H ----- */ diff --git a/qtensor/compression/szx/include/szxd_double.h b/qtensor/compression/szx/include/szxd_double.h new file mode 100644 index 00000000..4ea4be11 --- /dev/null +++ b/qtensor/compression/szx/include/szxd_double.h @@ -0,0 +1,29 @@ +/** + * @file szxd_double.h + * @author Sheng Di + * @date Feb, 2022 + * @brief Header file for the szd_double.c. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#ifndef _SZXD_Double_H +#define _SZXD_Double_H + +#ifdef __cplusplus +extern "C" { +#endif + +int SZ_fast_decompress_args_unpredictable_one_block_double(double* newData, size_t blockSize, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_double(double** newData, size_t nbEle, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_randomaccess_double(double** newData, size_t nbEle, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_randomaccess_double_openmp(double** newData, size_t nbEle, unsigned char* cmpBytes); + +void SZ_fast_decompress_args_unpredictable_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, +size_t cmpSize); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZXD_Double_H ----- */ diff --git a/qtensor/compression/szx/include/szxd_float.h b/qtensor/compression/szx/include/szxd_float.h new file mode 100644 index 00000000..fbe0219d --- /dev/null +++ b/qtensor/compression/szx/include/szxd_float.h @@ -0,0 +1,30 @@ +/** + * @file szxd_float.h + * @author Sheng Di + * @date Feb, 2022 + * @brief Header file for the szd_float.c. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#ifndef _SZXD_Float_H +#define _SZXD_Float_H + +#ifdef __cplusplus +extern "C" { +#endif + +void SZ_fast_decompress_args_with_prediction_float(float** newData, float* pred, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); +int SZ_fast_decompress_args_unpredictable_one_block_float(float* newData, size_t blockSize, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_randomaccess_float(float** newData, size_t nbEle, unsigned char* cmpBytes); +void SZ_fast_decompress_args_unpredictable_blocked_randomaccess_float_openmp(float** newData, size_t nbEle, unsigned char* cmpBytes); + +void SZ_fast_decompress_args_unpredictable_float(float** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, +size_t cmpSize); + +#ifdef __cplusplus +} +#endif + +#endif /* ----- #ifndef _SZXD_Float_H ----- */ diff --git a/qtensor/compression/szx/include/timingGPU.h b/qtensor/compression/szx/include/timingGPU.h new file mode 100644 index 00000000..c6081682 --- /dev/null +++ b/qtensor/compression/szx/include/timingGPU.h @@ -0,0 +1,31 @@ +#ifndef __TIMING_CUH__ +#define __TIMING_CUH__ + +/**************/ +/* TIMING GPU */ +/**************/ + +// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5 +// microsecond precision. 
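+// The CUDA-event bookkeeping sits behind the PrivateTimingGPU struct, which is only
+// forward-declared in this header. Usage sketch, mirroring how timer_GPU is used in
+// cuszx_entry.cu later in this patch (someKernel is just a placeholder):
+//
+//   TimingGPU timer;
+//   timer.StartCounter();              // begin GPU timing
+//   someKernel<<<grid, block>>>(...);  // work to be measured
+//   float ms = timer.GetCounter();     // elapsed time in milliseconds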
+ +struct PrivateTimingGPU; + +class TimingGPU +{ + private: + PrivateTimingGPU *privateTimingGPU; + + public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + void StartCounterFlags(); + + float GetCounter(); + +}; // TimingGPU class + +#endif diff --git a/qtensor/compression/szx/src/MultiLevelCacheTableWideInterval.c b/qtensor/compression/szx/src/MultiLevelCacheTableWideInterval.c new file mode 100644 index 00000000..d137115f --- /dev/null +++ b/qtensor/compression/szx/src/MultiLevelCacheTableWideInterval.c @@ -0,0 +1,125 @@ +/** + * @file MultiLevelCacheTableWideInterval.h + * @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao + * @date Jan, 2019 + * @brief Header file. + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#include +#include "MultiLevelCacheTableWideInterval.h" + +void freeTopLevelTableWideInterval(struct TopLevelTableWideInterval* topTable) +{ + for(int i=topTable->topIndex-topTable->baseIndex; i>=0; i--) + { + struct SubLevelTableWideInterval* processingSubTable = &topTable->subTables[i]; + free(processingSubTable->table); + } + free(topTable->subTables); +} + +uint16_t MLCTWI_GetExpoIndex(double value){ + uint64_t* ptr = (uint64_t*)&value; + return (*ptr) >> 52; +} + +uint16_t MLCTWI_GetRequiredBits(double precision){ + uint64_t* ptr = (uint64_t*)&precision; + return -(((*ptr) >> 52) - 1023); +} + +uint64_t MLCTWI_GetMantiIndex(double value, int bits){ + uint64_t* ptr = (uint64_t*)&value; + (*ptr) = (*ptr) << 12 >> 12; + int shift = 64 - 12 - bits; + if(shift > 0){ + return (*ptr) >> shift; + }else{ + return (*ptr); + } +} + +double MLTCWI_RebuildDouble(uint16_t expo, uint64_t manti, int bits){ + double result = 0; + uint64_t *ptr = (uint64_t*)&result; + *ptr = expo; + (*ptr) = (*ptr) << 52; + (*ptr) += (manti << (52-bits)); + return result; +} + +void MultiLevelCacheTableWideIntervalBuild(struct TopLevelTableWideInterval* topTable, double* precisionTable, int count, double precision, int plus_bits){ + uint16_t bits = MLCTWI_GetRequiredBits(precision) + plus_bits; + topTable->bits = bits; + topTable->bottomBoundary = precisionTable[1]/(1+precision); + topTable->topBoundary = precisionTable[count-1]/(1-precision); + topTable->baseIndex = MLCTWI_GetExpoIndex(topTable->bottomBoundary); + topTable->topIndex = MLCTWI_GetExpoIndex(topTable->topBoundary); + int subTableCount = topTable->topIndex - topTable->baseIndex + 1; + topTable->subTables = (struct SubLevelTableWideInterval*)malloc(sizeof(struct SubLevelTableWideInterval) * subTableCount); + memset(topTable->subTables, 0, sizeof(struct SubLevelTableWideInterval) * subTableCount); + + for(int i=topTable->topIndex-topTable->baseIndex; i>=0; i--){ + struct SubLevelTableWideInterval* processingSubTable = &topTable->subTables[i]; + + uint32_t maxIndex = 0; + for(int j=0; jtopIndex = maxIndex; + processingSubTable->baseIndex = 0; + + uint64_t subTableLength = processingSubTable->topIndex - processingSubTable-> baseIndex+ 1; + processingSubTable->table = (uint16_t*)malloc(sizeof(uint16_t) * subTableLength); + memset(processingSubTable->table, 0, sizeof(uint16_t) * subTableLength); + processingSubTable->expoIndex = topTable->baseIndex + i; + } + + + uint32_t index = 0; + bool flag = false; + for(uint16_t i = 0; i<=topTable->topIndex-topTable->baseIndex; i++){ + struct SubLevelTableWideInterval* processingSubTable = &topTable->subTables[i]; + uint16_t expoIndex = i+topTable->baseIndex; + for(uint32_t j 
= 0; j<=processingSubTable->topIndex - processingSubTable->baseIndex; j++){ + uint64_t mantiIndex = j + processingSubTable->baseIndex; + double sampleBottom = MLTCWI_RebuildDouble(expoIndex, mantiIndex, topTable->bits); + double sampleTop = MLTCWI_RebuildDouble(expoIndex, mantiIndex+1, topTable->bits); + double bottomBoundary = precisionTable[index] / (1+precision); + double topBoundary = precisionTable[index] / (1-precision); + if(sampleTop < topBoundary && sampleBottom > bottomBoundary){ + processingSubTable->table[j] = index; + flag = true; + }else{ + if(flag && index < count-1){ + index++; + processingSubTable->table[j] = index; + }else{ + processingSubTable->table[j] = 0; + } + } + } + } + +} + +uint32_t MultiLevelCacheTableWideIntervalGetIndex(double value, struct TopLevelTableWideInterval* topLevelTable){ + uint16_t expoIndex = MLCTWI_GetExpoIndex(value); + if(expoIndex <= topLevelTable->topIndex && expoIndex >= topLevelTable->baseIndex){ + struct SubLevelTableWideInterval* subLevelTable = &topLevelTable->subTables[expoIndex-topLevelTable->baseIndex]; + uint64_t mantiIndex = MLCTWI_GetMantiIndex(value, topLevelTable->bits); + return subLevelTable->table[mantiIndex - subLevelTable->baseIndex]; + + } + return 0; +} + +void MultiLevelCacheTableWideIntervalFree(struct TopLevelTableWideInterval* table){ + for(int i=0; itopIndex - table->baseIndex + 1; i++){ + free(table->subTables[i].table); + } + free(table->subTables); +} + diff --git a/qtensor/compression/szx/src/README_python.md b/qtensor/compression/szx/src/README_python.md new file mode 100644 index 00000000..0754950d --- /dev/null +++ b/qtensor/compression/szx/src/README_python.md @@ -0,0 +1,30 @@ +# Using the Python Wrapper for QC Compression +### Steps to Build: +1. Clone the repository, switch to threshold_integrate branch + +2. Change directory to "SZx/szx/src/" + +3. 
Run the following NVCC command: +nvcc --shared --compiler-options '-fPIC' -I ../include/ -I $CUDA_SAMPLES_PATH -o libcuszx_wrapper.so *.cu *.c + + - $CUDA_SAMPLES_PATH should be the path to the include/ directory of CUDA's samples + +### Using the Python API: +**def cuszx_device_compress(oriData, outSize, absErrBound, nbEle, blockSize,threshold)** +- Parameters: + - oriData: CUPY array to be compressed, should be flattened to 1-D + - outSize: CTypes size_t pointer, will store the resulting compressed data size in bytes + - absErrBound: Float, the relative-to-value-range error bound for compression + - nbEle: Integer, number of data elements + - blockSize: Integer, cuSZx runtime parameter (recommended value = 256) + - threshold: Float, the relative-to-value-range threshold for compression +- Returns: + - o_bytes: GPU device pointer to compressed bytes + - outSize: See 'Parameters' + +**def cuszx_device_decompress(nbEle, cmpBytes)** +- Parameters: + - nbEle: Integer, number of data elements + - cmpBytes: GPU device pointer to compressed bytes +- Returns: + - newData: GPU float pointer (CTypes) to decompressed data diff --git a/qtensor/compression/szx/src/cuszx_entry.cu b/qtensor/compression/szx/src/cuszx_entry.cu new file mode 100644 index 00000000..213cb689 --- /dev/null +++ b/qtensor/compression/szx/src/cuszx_entry.cu @@ -0,0 +1,1961 @@ +#include "cuszx_entry.h" +#include "szx_defines.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include "timingGPU.h" +#include "szx.h" +#include +#include +#include +#include +#include +#include +#include + +#define SPARSITY_LEVEL 0.25 +#define BLOCKS 40 +#define THREADS_PER_BLOCK 256 + +TimingGPU timer_GPU; +void bin(unsigned n) +{ + unsigned i; + for (i = 1 << 31; i > 0; i = i / 2) + (n & i) ? printf("1") : printf("0"); +} + +__host__ __device__ size_t convert_state_to_out(unsigned char* meta, size_t length, unsigned char *result){ + size_t out_length; + + if(length%4==0) + out_length = length/4; + else + out_length = length/4+1; + + for (size_t i = 0; i < out_length; i++) + { + uint8_t tmp = 0; + + for (size_t j = 0; j < 4; j++) + { + if (i*4 + j < length) + { + tmp |= (0x03 & meta[i*4+j]) << 2*j; + } + + } + result[i] = tmp; + } + return out_length; +} + +__global__ void convert_state_to_out_kernel(unsigned char* meta, size_t length, unsigned char *result, size_t out_length){ + + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < out_length; i += blockDim.x*gridDim.x){ + uint8_t tmp = 0; + + for (size_t j = 0; j < 4; j++) + { + if (i*4 + j < length) + { + tmp |= (0x03 & meta[i*4+j]) << 2*j; + } + + } + result[i] = tmp; + } +} + +__global__ void convert_out_to_state_kernel(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state, size_t state_length, int *num_state2blks, int *ncBlocks){ + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < state_length; i += blockDim.x*gridDim.x){ + for (size_t j = 0; j < 4; j++) + { + if (4*i + j < nbBlocks) + { + out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; + if (out_state[4*i+j] == 2) + { + atomicAdd(num_state2blks, 1); + }else if(out_state[4*i+j]==3){ + atomicAdd(ncBlocks, 1); + } + + } + + } + } +} + +// nbBlocks, r, stateNBBytes, stateArray +__host__ __device__ size_t convert_out_to_state(size_t nbBlocks, unsigned char* cmp, unsigned char* out_state){ + size_t state_length; + if(nbBlocks%4==0) + state_length = nbBlocks/4; + else + state_length = nbBlocks/4+1; + + for (size_t i = 0; i < state_length; i++) + { + for (size_t j = 0; j < 4; j++) + { + if (4*i + j < nbBlocks) + { + 
out_state[4*i + j]= (cmp[i] >> 2*j) & 0x03; + } + + } + } + return nbBlocks; +} + +__host__ __device__ size_t convert_block2_to_out(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + + memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + memcpy(result+out_length, blk_subidx, num_sig*sizeof(uint8_t)); + out_length += num_sig*sizeof(uint8_t); + memcpy(result+out_length, blk_sig, numBlocks*sizeof(uint8_t)); + out_length+= numBlocks*sizeof(uint8_t); + + return out_length; +} + +__global__ void convert_block2_to_out_kernel(unsigned char *result, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + + size_t out_length = 0; + unsigned char *tmp_result = result; + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + uint32_t local_blkidx = blk_idx[i]; + tmp_result[4*i] = (local_blkidx) & 0xff; + tmp_result[4*i+1] = (local_blkidx >> (8*1)) & 0xff; + tmp_result[4*i+2] = (local_blkidx >> (8*2)) & 0xff; + tmp_result[4*i+3] = (local_blkidx >> (8*3)) & 0xff; + } + // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + float value = blk_vals[i]; + memcpy(&tmp_result[4*i], &value, sizeof(float)); + //unsigned char *v = () + //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_subidx[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = result+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + tmp_result[i] = blk_sig[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); + + // return out_length; +} + +__global__ void convert_out_to_block2_kernel(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + + unsigned char *tmp_result = in_cmp; + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + + uint32_t local_blkidx = (tmp_result[4*i] & 0xff) | ((tmp_result[4*i+1] & 0xff) << (8*1)) + | ((tmp_result[4*i+2] & 0xff) << (8*2)) | ((tmp_result[4*i+3] & 0xff) << (8*3)); + blk_idx[i] = local_blkidx; + } + // memcpy(result, blk_idx, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + float value = 0.0; + memcpy(&value, &tmp_result[4*i], sizeof(float)); + blk_vals[i] = value; + + //unsigned char *v = () + //tmp_result[(int)4*i] = (unsigned char)((value) & 0xff); + //tmp_result[(int)4*i+1] = (unsigned char)((value >> (8*1)) & 0xff); + //tmp_result[(int)4*i+2] = (unsigned char)((value >> (8*2)) & 
0xff); + //tmp_result[(int)4*i+3] = (unsigned char)((value >> (8*3)) & 0xff); + } + // memcpy(result+out_length, blk_vals, num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < num_sig; i += blockDim.x*gridDim.x){ + blk_subidx[i] = tmp_result[i]; + + } + + out_length += num_sig*sizeof(uint8_t); + tmp_result = in_cmp+out_length; + + for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < numBlocks; i += blockDim.x*gridDim.x){ + blk_sig[i] = tmp_result[i]; + + } + out_length+= numBlocks*sizeof(uint8_t); +} + +__host__ __device__ size_t convert_out_to_block2(unsigned char *in_cmp, uint32_t numBlocks, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig){ + size_t out_length = 0; + memcpy(blk_idx, in_cmp, numBlocks*sizeof(uint32_t)); + out_length += numBlocks*4; + memcpy(blk_vals, in_cmp+out_length,num_sig*sizeof(float)); + out_length += num_sig*sizeof(float); + memcpy(blk_subidx, in_cmp+out_length, num_sig*sizeof(uint8_t)); + out_length += num_sig*sizeof(uint8_t); + memcpy(blk_sig, in_cmp+out_length, numBlocks*sizeof(uint8_t)); + out_length += numBlocks*sizeof(uint8_t); +// printf("outlength: %d\n",out_length); + return out_length; +} + +int _post_proc(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, unsigned char *outBytes, size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. + out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU compression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + 
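+    // Copy the last sparse-block descriptor array (blk_sig) back to the host. Together with
+    // meta/offsets/midBytes and the blk_idx/blk_vals/blk_subidx arrays copied above, it
+    // feeds the host-side serializer _post_proc() below, which packs the final compressed
+    // stream into outBytes and returns its size.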
checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + memset(outBytes, 0, maxPreservedBufferSize); + + outSize = (size_t *)malloc(sizeof(size_t)); + //outSize[0] = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + + *outSize = _post_proc(oriData, meta, offsets, midBytes, outBytes, nbEle, blockSize, *num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); +// printf("Beginning free\n"); + // printf("outsize %p \n", outBytes); + free(blk_idx); + free(blk_subidx); + free(blk_vals); + free(meta); + free(offsets); + free(midBytes); + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + checkCudaErrors(cudaFree(d_midBytes)); + return outBytes; +} + +void cuSZx_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) +{ + uint32_t *blk_idx, *d_blk_idx; + uint8_t *blk_subidx, *d_blk_subidx; + uint8_t *blk_sig, *d_blk_sig; + float *blk_vals, *d_blk_vals; + size_t num_sig, *d_num_sig; + + *newData = (float*)malloc(sizeof(float)*nbEle); + memset(*newData, 0, sizeof(float)*nbEle); + + unsigned char* r = cmpBytes; + r += 4; + int blockSize = r[0]; //get block size + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. 
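+    // At this point the stream header has been parsed: 4 version/flag bytes, a 1-byte block
+    // size, then nbConstantBlocks and num_sig as big-endian size_t values. What follows is a
+    // 2-bit-per-block state array; judging by the loops below, states 0/1 mark constant
+    // blocks (reconstructed from a stored median), state 2 marks sparse blocks
+    // (reconstructed from blk_idx/blk_vals/blk_subidx), and state 3 marks regular
+    // non-constant blocks whose payload is decoded by decompress_float.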
+ unsigned char* stateArray = (unsigned char*)malloc(nbBlocks); + unsigned char* d_stateArray; + cudaMalloc(&d_stateArray, nbBlocks); + float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + + + blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + blk_vals= (float *)malloc((num_sig)*sizeof(float)); + blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + // printf("Converting state array\n"); + convert_out_to_state(nbBlocks, r, stateArray); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + unsigned char* data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // printf("converting block vals\n"); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + // checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); + // num_sig = (uint64_t *)malloc(sizeof(uint64_t)); + checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); + // blk_idx = malloc() + checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, num_sig*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_vals, num_sig*sizeof(float))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); + + checkCudaErrors(cudaMemcpy(d_blk_idx, blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_vals, blk_vals, (num_sig)*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_subidx, blk_subidx, (num_sig)*sizeof(uint8_t), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_stateArray, stateArray, nbBlocks, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_blk_sig, blk_sig, nbBlocks*sizeof(uint8_t), cudaMemcpyHostToDevice)); + + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + memcpy((*newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + float* fr = (float*)r; //fr is the starting address of constant median values. 
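+    // The tail of the input (nbEle % blockSize elements) was stored uncompressed and has just
+    // been copied into newData. Next come one float median per constant block, then a 2-byte
+    // size entry for each non-constant block, and finally the variable-length block payloads,
+    // which are unpacked into `data` for the GPU decompression kernels.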
+ for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = fr[i]; + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + exit(0); + } + memcpy(data+i*blockSize*sizeof(float), p, leng); + p += leng; + } + + unsigned char* d_data; + float *d_newdata; + checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks*blockSize*sizeof(float))); + + timer_GPU.StartCounter(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, d_stateArray,d_blk_idx, d_blk_vals, d_blk_subidx,blockSize, d_blk_sig); + decompress_float<<>>(d_data, blockSize, ncBlocks, mSize); + cudaError_t err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + + int nb=0, nc=0; + for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j=0;j>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +inline void longToBytes_bigEndian_memset(unsigned char *b, unsigned long num) +{ + checkCudaErrors(cudaMemset(&b[0], (unsigned char)(num>>56), sizeof(char))); + checkCudaErrors(cudaMemset(&b[1], (unsigned char)(num>>48), sizeof(char))); + checkCudaErrors(cudaMemset(&b[2], (unsigned char)(num>>40), sizeof(char))); + checkCudaErrors(cudaMemset(&b[3], (unsigned char)(num>>32), sizeof(char))); + checkCudaErrors(cudaMemset(&b[4], (unsigned char)(num>>24), sizeof(char))); + checkCudaErrors(cudaMemset(&b[5], (unsigned char)(num>>16), sizeof(char))); + checkCudaErrors(cudaMemset(&b[6], (unsigned char)(num>>8), sizeof(char))); + checkCudaErrors(cudaMemset(&b[7], (unsigned char)(num), sizeof(char))); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + +__device__ inline void shortToBytes_d(unsigned char* b, short value) +{ + lint16 buf; + buf.svalue = value; + memcpy(b, buf.byte, 2); +} + + + +__global__ void getNumNonConstantBlocks(size_t nbBlocks, short *offsets, unsigned char *meta, int blockSize, int *nonconstant, int *out_size){ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += blockDim.x*gridDim.x){ + if (meta[tid] == 3){ + atomicAdd(nonconstant, 1); + atomicAdd(out_size,1+(blockSize/4)+offsets[tid]); + } + } +} + +__global__ void generateFlags(unsigned char *states, uint64_t *cBlk_flags, uint64_t *ncBlk_flags,uint64_t* offset_indices,short* offsets, size_t nbBlocks){ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < nbBlocks; tid += 
blockDim.x*gridDim.x){ + if (states[tid] == 0 || states[tid] == 1) + { + cBlk_flags[tid] = 1; + ncBlk_flags[tid] = 0; + offset_indices[tid] = 0; + }else if(states[tid]==3){ + ncBlk_flags[tid] = 1; + cBlk_flags[tid] = 0; + offset_indices[tid] = (uint64_t) offsets[tid]; + }else{ + cBlk_flags[tid] = 0; + ncBlk_flags[tid] = 0; + offset_indices[tid] = 0; + } + + } +} + +__global__ void nccopy_kernel2(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize, uint64_t *cBlk_indices, uint64_t *ncBlk_indices, uint64_t* offset_indices){ + // printf("blockdim %d blockidx %d threadidx %d griddim %d\n", blockDim.x, blockIdx.x, threadIdx.x, gridDim.x); + int i; + int num_threads = (blockDim.x*gridDim.x); + int tid = blockDim.x*blockIdx.x + threadIdx.x; + int blocks_per_thread = nbBlocks/num_threads; + int start_idx = tid*blocks_per_thread; + int end_idx = start_idx+blocks_per_thread; + + if (tid == num_threads-1) + { + end_idx = nbBlocks; + } + + unsigned char* tmp_o = o+(sizeof(short)*ncBlk_indices[start_idx]); + unsigned char* tmp_nc= nc+(mSize*ncBlk_indices[i] + offset_indices[i]*ncBlk_indices[i]); + for (i=start_idx; i>>(meta, cBlk_indices, ncBlk_indices, offset_indices, offsets, nbBlocks); + cudaDeviceSynchronize(); + + thrust::exclusive_scan(thrust::device, cBlk_indices, cBlk_indices + nbBlocks, cBlk_indices, 0); + thrust::exclusive_scan(thrust::device, ncBlk_indices, ncBlk_indices + nbBlocks, ncBlk_indices, 0); + thrust::exclusive_scan(thrust::device, offset_indices, offset_indices + nbBlocks, offset_indices, 0); + + nccopy_kernel<<>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices,final_nc); + // nccopy_kernel2<<<1,1>>>(c, o, nc, midBytes, meta, nbBlocks, blockSize, offsets, mSize, cBlk_indices,ncBlk_indices,offset_indices); + + cudaDeviceSynchronize(); + + //printf("nc: %p\n", nc); + // printf("%s\n", cudaGetErrorString(cudaGetLastError())); + // set_nc<<<1,1>>>(nc, offsets, offset_indices, ncBlk_indices, mSize, nbBlocks); + // cudaDeviceSynchronize(); + // printf("ncblockcpy: %f ms\n", timer2.GetCounter()); + checkCudaErrors(cudaFree(cBlk_indices)); + checkCudaErrors(cudaFree(ncBlk_indices)); + checkCudaErrors(cudaFree(offset_indices)); +} + +void ncblkCopy_h(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize){ + unsigned char *tmp_states; + unsigned char *ncold = nc; + uint64_t col_off = 0; + short *tmp_offsets; + tmp_offsets = (short*)malloc(sizeof(short)*nbBlocks); + tmp_states = (unsigned char *)malloc(sizeof(char)*nbBlocks); + checkCudaErrors(cudaMemcpy(tmp_states, meta, sizeof(char)*nbBlocks, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(tmp_offsets,offsets,sizeof(short)*nbBlocks,cudaMemcpyDeviceToHost)); + cudaStream_t stream[3]; + cudaStreamCreate(&stream[0]); + cudaStreamCreate(&stream[1]); + cudaStreamCreate(&stream[2]); + + //printf("here\n"); + //checkCudaErrors(cudaMemcpy((void**)&d_offsets, nbBlocks*sizeof(short))); + for (int i = 0; i < nbBlocks; i++) + { + if(tmp_states[i]==3){ + // shortToBytes_d(o, offsets[i]); + // buf = (unsigned char*) + +// printf("here2\n"); + cudaMemcpyAsync(o, offsets+i, 2, cudaMemcpyDeviceToDevice, stream[0]); + o += sizeof(short); + + // printf("here2.1\n"); + // printf("offsets %ld\n", col_off); + cudaMemcpyAsync(nc, meta+(nbBlocks+i*mSize), mSize, 
cudaMemcpyDeviceToDevice, stream[1]); + // memcpy(nc, meta+(nbBlocks+i*mSize), mSize); + + nc += mSize; + + // printf("here2.2\n"); + //checkCudaErrors(cudaMemcpy(buf, offsets+i, sizeof(short), cudaMemcpyDeviceToHost)); + + // //printf("here2.3 %d\n", buf); + cudaMemcpyAsync(nc, midBytes+(i*blockSize*sizeof(float)), (int)tmp_offsets[i], cudaMemcpyDeviceToDevice, stream[2]); + // memcpy(nc, midBytes+(i*blockSize*sizeof(float)), offsets[i]); + nc += tmp_offsets[i]; + col_off+=tmp_offsets[i]; + +/// printf("here2.4\n"); + } + } + cudaStreamDestroy(stream[0]); + cudaStreamDestroy(stream[1]); + cudaStreamDestroy(stream[2]); + + free(tmp_states); + free(tmp_offsets); +} + +__global__ void ncblkCopy(unsigned char * c, unsigned char* o, unsigned char *nc, unsigned char* midBytes, unsigned char* meta, + size_t nbBlocks, int blockSize, short *offsets, size_t mSize) +{ + for (int i=blockDim.x*blockIdx.x + threadIdx.x; i>>(nbBlocks, offsets, meta, blockSize, nonconstant_d, out_size_d); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(&nonconstant_h, nonconstant_d, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&tmp_outsize, out_size_d, sizeof(int), cudaMemcpyDeviceToHost)); + + nbConstantBlocks = nbBlocks - nonconstant_h; + out_size_h+=tmp_outsize; + + out_size_h += (nbBlocks-nbConstantBlocks)*sizeof(short)+(nbEle%blockSize)*sizeof(float); + + //outBytes = (unsigned char*)malloc(out_size); + unsigned char* r = outBytes; + unsigned char* r_old = outBytes; + // cudaDeviceSynchronize(); printf("%s\n",cudaGetLastError()); + checkCudaErrors(cudaMemset(r, SZx_VER_MAJOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+1, SZx_VER_MINOR, sizeof(char))); + checkCudaErrors(cudaMemset(r+2, 1, sizeof(char))); + checkCudaErrors(cudaMemset(r+3, 0, sizeof(char))); + checkCudaErrors(cudaMemset(r+4, blockSize, sizeof(char))); + + r=r+5; //1 byte + //sizeToBytes(r, nbConstantBlocks); + longToBytes_bigEndian_memset(r, nbConstantBlocks); + r += sizeof(size_t); + //sizeToBytes(r, (size_t) num_sig); + longToBytes_bigEndian_memset(r, (unsigned long)num_sig); + r += sizeof(size_t); + size_t out_length; + + if(nbBlocks%4==0) + out_length = nbBlocks/4; + else + out_length = nbBlocks/4+1; + + convert_state_to_out_kernel<<>>(meta, nbBlocks, r, out_length); + r+=out_length; + convert_block2_to_out_kernel<<>>(r, nbBlocks,num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r += nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + + checkCudaErrors(cudaMemcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + // memcpy(r, oriData+nbBlocks*blockSize, (nbEle%blockSize)*sizeof(float)); + r += (nbEle%blockSize)*sizeof(float); + unsigned char* c = r; + unsigned char* o = c+nbConstantBlocks*sizeof(float); + unsigned char* nc = o+(nbBlocks-nbConstantBlocks)*sizeof(short); + // ncblkCopy<<<1,1>>>(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + + // ncblkCopy_h(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize); + ncblkCopy_fast(c, o, nc, midBytes, meta,nbBlocks, blockSize, offsets, mSize, nc_diff); + // cudaDeviceSynchronize(); + size_t h_nc_diff; + cudaMemcpy(&h_nc_diff,nc_diff, sizeof(size_t),cudaMemcpyDeviceToHost); + return (size_t) (nc+h_nc_diff-r_old); + // checkCudaErrors(cudaMemcpy(outSize, (size_t)(nc-r_old), sizeof(size_t))); + // *outSize = (size_t) (nc-r_old); + // return outBytes; +} + +__global__ void device_post_proc(size_t *outSize, float *oriData, unsigned char *meta, + short *offsets, 
unsigned char *midBytes, unsigned char *outBytes, + size_t nbEle, int blockSize, uint64_t num_sig, uint32_t *blk_idx, + float *blk_vals, uint8_t *blk_subidx, uint8_t *blk_sig) +{ + int out_size = 0; + + size_t nbConstantBlocks = 0; + size_t nbBlocks = nbEle/blockSize; + size_t ncBytes = blockSize/4; + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. + out_size += 5+sizeof(size_t)+sizeof(float)*nbBlocks; + if (nbBlocks%8==0) + out_size += nbBlocks/8; + else + out_size += nbBlocks/8+1; + int s0 = 0; + int s1 = 0; + int s2 = 0; + int s3 = 0; + for (int i=0; i()); +// // dmin = thrust::reduce(oriData, oriData+nbEle, 1, thrust::minimum()); +// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); +// cudaMalloc(&d_temp_storage, temp_storage_bytes); +// cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, oriData, dmax, nbEle); + +// cudaFree(d_temp_storage); +// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); +// cudaMalloc(&d_temp_storage, temp_storage_bytes); +// cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, oriData, dmin, nbEle); + +// cudaFree(d_temp_storage); +// // thrust::pair result = thrust::minmax_element(thrust::device, oriData,oriData+nbEle); +// //printf("here\n"); +// cudaMemcpy(hmin, dmin, sizeof(float), cudaMemcpyDeviceToHost); +// cudaMemcpy(hmax, dmax,sizeof(float), cudaMemcpyDeviceToHost); +// absErrBound = absErrBound*(hmax-hmin); +// threshold = threshold*(hmax-hmin); + // // printf("%f\n",absErrBound); + // cudaFree(dmin); + // cudaFree(dmax); + float sparsity_level = SPARSITY_LEVEL; + + // Set the input data as the function parameter, this should be a device pointer + + float* d_oriData = oriData; + // cudaMalloc((void**)&d_oriData, sizeof(float)*nbEle); + // cudaMemcpy(d_oriData, oriData, sizeof(float)*nbEle, cudaMemcpyHostToDevice); + + size_t nbBlocks = nbEle/blockSize; + size_t remainCount = nbEle%blockSize; + size_t actualNBBlocks = remainCount==0 ? nbBlocks : nbBlocks+1; + + size_t ncBytes = blockSize/4; + //ncBytes = (blockSize+1)%4==0 ? ncBytes : ncBytes+1; //Bytes to store one non-constant block data. + size_t mSize = sizeof(float)+1+ncBytes; //Number of bytes for each data block's metadata. 
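+    // Working-buffer sizing: msz reserves one state byte plus mSize metadata bytes per block
+    // (state bytes first, then the per-block metadata records), and mbsz is a worst-case,
+    // uncompressed-size buffer for the per-block encoded bytes. Both are allocated on the
+    // device only, since this compression path keeps all intermediate data on the GPU.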
+ size_t msz = (1+mSize) * nbBlocks * sizeof(unsigned char); + size_t mbsz = sizeof(float) * nbEle * sizeof(unsigned char); + + // These are host pointers and do not need to be allocated + + // unsigned char *meta = (unsigned char*)malloc(msz); + // short *offsets = (short*)malloc(nbBlocks*sizeof(short)); + // unsigned char *midBytes = (unsigned char*)malloc(mbsz); + + unsigned char* d_meta; + unsigned char* d_midBytes; + short* d_offsets; + + uint32_t *blk_idx, *d_blk_idx; + uint8_t *blk_sig, *d_blk_sig; + uint8_t *blk_subidx, *d_blk_subidx; + float *blk_vals, *d_blk_vals; + uint64_t *num_sig, *d_num_sig; + + checkCudaErrors(cudaMalloc((void **)&d_num_sig, sizeof(uint64_t))); + num_sig = (uint64_t *)malloc(sizeof(uint64_t)); + checkCudaErrors(cudaMalloc((void **)&d_blk_idx, nbBlocks*sizeof(uint32_t))); + // blk_idx = malloc() + checkCudaErrors(cudaMalloc((void **)&d_blk_subidx, nbEle*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_vals, nbEle*sizeof(float))); + + checkCudaErrors(cudaMalloc((void **)&d_blk_sig, nbBlocks*sizeof(uint8_t))); + + checkCudaErrors(cudaMalloc((void**)&d_meta, msz)); + //checkCudaErrors(cudaMemcpy(d_meta, meta, msz, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(d_meta, 0, msz)); + checkCudaErrors(cudaMalloc((void**)&d_offsets, nbBlocks*sizeof(short))); + checkCudaErrors(cudaMemset(d_offsets, 0, nbBlocks*sizeof(short))); + checkCudaErrors(cudaMalloc((void**)&d_midBytes, mbsz)); + checkCudaErrors(cudaMemset(d_midBytes, 0, mbsz)); + + + // apply_threshold<<<80,256>>>(d_oriData, threshold, nbEle); + // cudaDeviceSynchronize(); + dim3 dimBlock(32, blockSize/32); + dim3 dimGrid(65536, 1); + const int sMemsize = blockSize * sizeof(float) + dimBlock.y * sizeof(int); + //printf("Malloc end timestamp: %f ms\n", timer_GPU.GetCounter()); + compress_float<<>>(d_oriData, d_meta, d_offsets, d_midBytes, absErrBound, blockSize, nbBlocks, mSize, sparsity_level, d_blk_idx, d_blk_subidx,d_blk_vals, threshold, d_blk_sig); + cudaError_t err = cudaGetLastError(); // Get error code + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU compression timestamp: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + get_numsig<<<1,1>>>(d_num_sig); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaMemcpy(num_sig, d_num_sig, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + // These are allocations and memcpys to host pointers, do not need them + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((*num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((*num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + // checkCudaErrors(cudaMemcpy(meta, d_meta, msz, cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(offsets, d_offsets, nbBlocks*sizeof(short), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(midBytes, d_midBytes, mbsz, cudaMemcpyDeviceToHost)); + + + // checkCudaErrors(cudaMemcpy(blk_idx, d_blk_idx, nbBlocks*sizeof(uint32_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_vals,d_blk_vals, (*num_sig)*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_subidx,d_blk_subidx, (*num_sig)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(blk_sig,d_blk_sig, (nbBlocks)*sizeof(uint8_t), cudaMemcpyDeviceToHost)); + + + size_t maxPreservedBufferSize = sizeof(float)*nbEle; + unsigned char *d_outBytes; + // unsigned char* outBytes = (unsigned char*)malloc(maxPreservedBufferSize); + // 
memset(outBytes, 0, maxPreservedBufferSize); + checkCudaErrors(cudaMalloc(&d_outBytes, maxPreservedBufferSize)); + + size_t *d_outSize; + + checkCudaErrors(cudaMalloc(&d_outSize, sizeof(size_t))); + + // device_post_proc<<<1,1>>>(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + *outSize = better_post_proc(d_outSize, d_oriData, d_meta, d_offsets, d_midBytes, d_outBytes, nbEle, blockSize, *num_sig, d_blk_idx, d_blk_vals, d_blk_subidx, d_blk_sig); + //cudaDeviceSynchronize(); + + //checkCudaErrors(cudaMemcpy(outSize, d_outSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + + // printf("completed compression\n"); + //free(blk_idx); + //free(blk_subidx); + //free(blk_vals); + // free(meta); + // free(offsets); + // free(midBytes); + checkCudaErrors(cudaFree(d_num_sig)); + checkCudaErrors(cudaFree(d_blk_idx)); + checkCudaErrors(cudaFree(d_blk_subidx)); + checkCudaErrors(cudaFree(d_blk_vals)); + checkCudaErrors(cudaFree(d_blk_sig)); + + checkCudaErrors(cudaFree(d_meta)); + checkCudaErrors(cudaFree(d_offsets)); + checkCudaErrors(cudaFree(d_midBytes)); + + unsigned char *d_newout; + + *outSize = *outSize; + size_t os = *outSize; + + checkCudaErrors(cudaMalloc(&d_newout, os)); + //fin_copy<<<40,256>>>(d_outBytes, d_newout,os); + checkCudaErrors(cudaMemcpy(d_newout, d_outBytes, os, cudaMemcpyDeviceToDevice)); + cudaDeviceSynchronize(); + + checkCudaErrors(cudaFree(d_outBytes)); + printf("Compression end timestamp: %f ms\n", timer_GPU.GetCounter()); + + err = cudaGetLastError(); // Get error code + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + return d_newout; + //return d_outBytes; +} + +__device__ inline long bytesToLong_bigEndian(unsigned char* b) { + long temp = 0; + long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +__device__ inline size_t bytesToSize(unsigned char* bytes) +{ + size_t result = bytesToLong_bigEndian(bytes);//8 + return result; +} + +__device__ inline short bytesToShort(unsigned char* bytes) +{ + lint16 buf; + memcpy(buf.byte, bytes, 2); + + return buf.svalue; +} + +__global__ void decompress_get_stats(float *newData, size_t nbEle, unsigned char* cmpBytes, + size_t *numSigValues, int *bs, + size_t *numConstantBlks, size_t *numBlks, + size_t *mSizeptr, unsigned char *newCmpBytes +){ + unsigned char* r = cmpBytes; + + size_t num_sig; + r += 4; + int blockSize = (int) r[0]; //get block size + + if(blockSize == 0)blockSize = 256; + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + num_sig = bytesToSize(r); + + r += sizeof(size_t); + size_t nbBlocks = nbEle/blockSize; + size_t ncBlocks = 0; + size_t num_state2_blks = 0; + // size_t ncBlocks = nbBlocks - nbConstantBlocks; //get number of constant blocks + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t ncLeading = blockSize/4; + size_t mSize = sizeof(float)+1+ncLeading; //Number of bytes for each data block's metadata. 
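+    // This kernel only parses the header of the device-resident compressed stream; the parsed
+    // sizes are reported through the output pointers below so the caller can size its
+    // allocations before the real decompression kernels run. Note that newCmpBytes is a
+    // pointer passed by value, so the trailing `newCmpBytes = r;` is not visible to the caller.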
+ + *mSizeptr = mSize; + + *numConstantBlks = nbConstantBlocks; + *numBlks = nbBlocks; + *numSigValues = num_sig; + *bs = blockSize; + newCmpBytes = r; + +} + + void setup_data_stateArray_better(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + + //printf("ma\n"); + // blockSize = 256; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + int ncBlocks, *ncBlocks_d; + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + int num_state2_blks, *num_state2_d; + checkCudaErrors(cudaMalloc((void **)&num_state2_d, sizeof(int))); + checkCudaErrors(cudaMalloc((void **)&ncBlocks_d, sizeof(int))); + checkCudaErrors(cudaMemset(num_state2_d, 0, sizeof(int))); + checkCudaErrors(cudaMemset(ncBlocks_d, 0, sizeof(int))); + + //printf("ma2\n"); +// printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + // convert_out_to_state(nbBlocks, r, stateArray); + convert_out_to_state_kernel<<>>(nbBlocks,r,stateArray,stateNBBytes, + num_state2_d, ncBlocks_d); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + cudaDeviceSynchronize(); + + //printf("ma3\n"); + r += stateNBBytes; + newR = r; + cudaMemcpy(&ncBlocks, ncBlocks_d, sizeof(int), cudaMemcpyDeviceToHost); + + //printf("ma4\n"); + *ncBlks = ncBlocks; + + //printf("ma4\n"); + } + +__global__ void setup_data_stateArray(float *newData, size_t nbEle, unsigned char* r, + size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t *ncBlks, + unsigned char *stateArray, unsigned char *newR +){ + // blockSize = 256; + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + size_t ncBlocks = 0; + size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + size_t num_state2_blks = 0; +// printf("Converting state array\n"); + // printf("cmp %d\n", (int)r[0]); + // printf("state %d\n", (int)stateArray[0]); + convert_out_to_state(nbBlocks, r, stateArray); + // convert_out_to_state_kernel<<<40,256>>>(nbBlocks,r,stateArray,stateNBBytes); + // printf("state %d\n", (int)stateArray[0]); + // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + for (size_t i = 0; i < nbBlocks; i++) + { + if (stateArray[i] == 2) + { + num_state2_blks++; + }else if(stateArray[i] == 3){ + ncBlocks++; + } + } + + r += stateNBBytes; + newR = r; + *ncBlks = ncBlocks; +} + +__global__ void decomp_startup_kernel(unsigned char* r, size_t nbConstantBlocks, +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ + unsigned char * fr = r; //fr is the starting address of constant median values. 
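+    // Grid-stride setup pass: each thread decodes median floats for its share of the constant
+    // blocks into constantMedianArray, and decodes each non-constant block's payload length
+    // (its 2-byte size entry plus mSize) into g_leng. The caller then runs
+    // thrust::exclusive_scan over g_leng so that decompress_ncblk_kernel can copy every
+    // variable-length payload in parallel, avoiding the serial pointer walk flagged in the
+    // "PROBLEM AREA" comment below.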
+ int i = 0, j = 0, k = 0; + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=fr[0]; + tmp_r[1]=fr[1]; + tmp_r[2]=fr[2]; + tmp_r[3]=fr[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); +// nbConstantBlocks + for(i = blockDim.x*blockIdx.x + threadIdx.x; i < nbConstantBlocks; i += blockDim.x*gridDim.x){ //get the median values for constant-value blocks + + tmp_r[0]=fr[4*i]; + tmp_r[1]=fr[4*i+1]; + tmp_r[2]=fr[4*i+2]; + tmp_r[3]=fr[4*i+3]; + float tmp = ((float*)tmp_r)[0]; + constantMedianArray[i] = tmp; + //printf("%d %f\n", i, tmp); + } + + +/** PROBLEM AREA, CAN FIX WITH PARALLELIZATION BUT WATCH *FR and *P **/ + + // if(threadIdx.x==0 && blockIdx.x==0){ + fr += nbConstantBlocks*sizeof(float); + unsigned char* p = fr + ncBlocks * sizeof(short); + unsigned char* basefr = fr; + unsigned char* basep = p; + for(i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ + fr = basefr+(sizeof(short)*i); + int leng = (int)bytesToShort(fr)+mSize; + g_leng[i] = (uint64_t)leng; + // fr += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } + // memcpy(data+i*blockSize*sizeof(float), p, leng); + + // p += leng; + } + + // } +} + +__global__ void decompress_ncblk_kernel(unsigned char* r, size_t nbConstantBlocks, +unsigned char *data, int blockSize, size_t mSize, size_t ncBlocks, float *constantMedianArray, uint64_t* g_leng){ + unsigned char * fr = r; + fr += nbConstantBlocks*sizeof(float); + unsigned char* p = fr + ncBlocks * sizeof(short); + unsigned char* basefr = fr; + unsigned char* basep = p; + + for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < ncBlocks;i+=blockDim.x*gridDim.x){ + fr = basefr+(sizeof(short)*i); + int leng = (int)bytesToShort(fr)+mSize; + + + // g_leng[i] = leng; + // // fr += sizeof(short); + // if (leng > blockSize*sizeof(float)) + // { + // printf("Warning: compressed block is larger than the original block!\n"); + // return; + // // exit(0); + // } + p = basep + g_leng[i]; + + memcpy(data+i*blockSize*sizeof(float), p, leng); + + // p += leng; + } +} + +void decompress_startup_better(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + // blockSize = 256; + size_t nb_tmp = (int) nbEle/blockSize; + uint64_t* g_leng; + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + + size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; + + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + + r += stateNBBytes; + + convert_out_to_block2_kernel<<>>(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + size_t to_add = nbBlocks*4 + num_sig*sizeof(float) + num_sig*sizeof(uint8_t) + nbBlocks*sizeof(uint8_t); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + checkCudaErrors(cudaMemcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float), cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMalloc(&g_leng, sizeof(uint64_t)*ncBlocks)); + // memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + //printf("before mallocs in kernel %p\n", r); + r += (nbEle%blockSize)*sizeof(float); + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + decomp_startup_kernel<<>>(r, nbConstantBlocks,data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); + cudaDeviceSynchronize(); + + thrust::exclusive_scan(thrust::device, g_leng, g_leng + ncBlocks, g_leng, 0); + + decompress_ncblk_kernel<<>>(r, nbConstantBlocks, data, blockSize, mSize, ncBlocks, constantMedianArray, g_leng); + cudaDeviceSynchronize(); + + // cudaError_t err = cudaGetLastError(); // Get error code + + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(g_leng); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA Error: %s\n", cudaGetErrorString(err)); + r += nbConstantBlocks*sizeof(float); + + newCmpBytes = r; + +} + +__global__ void decompress_startup(float *newData, size_t nbEle, unsigned char* r, + uint32_t *blk_idx, uint8_t *blk_subidx, uint8_t *blk_sig, + float *blk_vals, size_t num_sig, int blockSize, + size_t nbConstantBlocks, size_t nbBlocks, size_t ncBlocks, + unsigned char *stateArray, float* constantMedianArray, unsigned char *data, + size_t mSize, unsigned char *newCmpBytes +){ + // blockSize = 256; + size_t nb_tmp = (int) nbEle/blockSize; + /** + * Structures to return: + * blk_idx, blk_subidx, blk_sig, blk_vals, numSigValues (pointer) + * bs (pointer to blockSize), numConstantBlks (pointer), numBlks (pointer) + * ncBlks (pointer), stateArray, constantMedianArray + */ + + // size_t ncBlocks = 0; + // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + // size_t num_state2_blks = 0; + // printf("Converting state array\n"); + // convert_out_to_state(nbBlocks, r, stateArray); + // printf("state %d\n", (int)stateArray[0]); + // // convertByteArray2IntArray_fast_1b_args(nbBlocks, r, stateNBBytes, stateArray); //get the stateArray + // for (size_t i = 0; i < nbBlocks; i++) + // { + // if (stateArray[i] == 2) + // { + // num_state2_blks++; + // }else if(stateArray[i] == 3){ + // ncBlocks++; + // } + // } + // size_t stateNBBytes = nbBlocks%4==0 ? nbBlocks/4 : nbBlocks/4+1; + + size_t stateNBBytes = nb_tmp%4==0 ? 
nb_tmp/4 : nb_tmp/4+1; + //printf("%p\n", r); + r += 4; + r++; + r += sizeof(size_t); + r += sizeof(size_t); + //printf("statenb %d %d\n", stateNBBytes, nb_tmp); + r += stateNBBytes; + // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // printf("converting block vals %d\n", data[0]); + size_t to_add = convert_out_to_block2(r, nbBlocks, (uint64_t)num_sig, blk_idx, blk_vals, blk_subidx, blk_sig); + r+= to_add; + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + + // printf("before mallocs in kernel\n"); + + memcpy((newData)+nbBlocks*blockSize, r, (nbEle%blockSize)*sizeof(float)); + + //printf("before mallocs in kernel %p\n", r); + r += (nbEle%blockSize)*sizeof(float); + //printf("r: %p\n", r); + //printf("%d, %d, %d\n",nbEle, 256, nbEle%256); + unsigned char * fr = r; //fr is the starting address of constant median values. + + // printf("%p\n", r); + unsigned char tmp_r[4]; + tmp_r[0]=r[0]; + tmp_r[1]=r[1]; + tmp_r[2]=r[2]; + tmp_r[3]=r[3]; + + +// printf("nbconstant: %f\n", ((float*)tmp_r)[0]); + for(i = 0;i < nbConstantBlocks;i++, j+=4){ //get the median values for constant-value blocks + + tmp_r[0]=r[j]; + tmp_r[1]=r[j+1]; + tmp_r[2]=r[j+2]; + tmp_r[3]=r[j+3]; + float tmp = ((float*)tmp_r)[0]; +// printf("median: %f\n", tmp); + constantMedianArray[i] = tmp; + + // printf("%d %f\n", i, tmp); + } + //printf("after constantmedian\n"); + r += nbConstantBlocks*sizeof(float); + unsigned char* p = r + ncBlocks * sizeof(short); + for(i = 0;i < ncBlocks;i++){ + int leng = (int)bytesToShort(r)+mSize; + r += sizeof(short); + if (leng > blockSize*sizeof(float)) + { + printf("Warning: compressed block is larger than the original block!\n"); + return; + // exit(0); + } +// printf("before memcpy\n"); + memcpy(data+i*blockSize*sizeof(float), p, leng); + // printf("after memcpy\n"); + p += leng; + } + + newCmpBytes = r; +// printf("before mallocs in kernel\n"); + + // printf("nb blocks: %d\n", nbBlocks); +} + +__global__ void cBlkCopy_decompress(int nb, float* constantMedianArray, float *newData, int blockSize, int i){ + int j; + float Median = constantMedianArray[nb]; + // j = threadIdx.x; j < blockSize; j += blockDim.x + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; +} + +__global__ void ncBlkCopy_decompress(int blockSize, float *newData, int nc, float *fdata, int i){ + int j; + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; +} + +void decompress_post_proc_better(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + //printf("h1\n"); + for (i=0;i>>(nb, constantMedianArray, newData, blockSize, i); + nb++; + }else if(state==3){ + ncBlkCopy_decompress<<<1,256>>>(blockSize, newData, nc, fdata, i); + nc++; + } + } + cudaDeviceSynchronize(); + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } +} + +__global__ void print_newdata(float *newData, size_t nbBlocks, int blockSize){ + for (size_t i = 0; i < nbBlocks*blockSize; i++) + { + printf("%f\n", newData[i]); + } + +} + +__global__ void 
generateNbNc(size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, uint64_t* nbs, uint64_t* ncs){ + for(int i = blockDim.x*blockIdx.x + threadIdx.x;i < nbBlocks;i+=blockDim.x*gridDim.x){ + unsigned char state = stateArray[i]; + if(state==0||state==1){ + nbs[i] = 1; + ncs[i] = 0; + }else if(state==3){ + nbs[i] = 0; + ncs[i] = 1; + }else{ + nbs[i] = 0; + ncs[i] = 0; + } + } +} + +__global__ void decompress_final_set(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray, uint64_t* nb, uint64_t* nc){ + float* fdata = (float*)data; + for (int i = blockIdx.x;i < nbBlocks;i+=gridDim.x){ + if (stateArray[i]==0 || stateArray[i]==1){ + float Median = constantMedianArray[nb[i]]; + // if (Median>1) printf("data%i:%f\n",i, Median); + for (int j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; + // nb++; + }else if(stateArray[i]==3){ + for (int j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc[i]*blockSize+j]; + // nc++; + } + __syncthreads(); + } +} + +void decompress_post_proc_fast(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + + int i,j; + uint64_t *nb, *nc; + checkCudaErrors(cudaMalloc(&nb, sizeof(uint64_t)*nbBlocks)); + checkCudaErrors(cudaMalloc(&nc, sizeof(uint64_t)*nbBlocks)); + + generateNbNc<<>>(nbBlocks, ncBlocks, stateArray, nb,nc); + cudaDeviceSynchronize(); + thrust::exclusive_scan(thrust::device, nb, nb + nbBlocks, nb, 0); + thrust::exclusive_scan(thrust::device, nc, nc + nbBlocks, nc, 0); + + decompress_final_set<<>>(data, newData, blockSize,nbBlocks, ncBlocks, stateArray,constantMedianArray, nb, nc); + cudaDeviceSynchronize(); + cudaFree(nb); + cudaFree(nc); +} + +__global__ void decompress_post_proc(unsigned char *data, float *newData, int blockSize, + size_t nbBlocks, size_t ncBlocks, unsigned char *stateArray, + float *constantMedianArray +){ + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + // checkCudaErrors(cudaMemcpy(*newData, d_newdata, nbBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + float* fdata = (float*)data; + int i,j; + int nb=0, nc=0; + // if (blockIdx.x == 0) + // { + // for (i=0;i1) printf("data%i:%f\n",i, Median); + // for (j = threadIdx.x; j < blockSize; j += blockDim.x) + // *((newData)+i*blockSize+j) = Median; + // nb++; + // } + // } + // }else{ + // for (i=0;i1) printf("data%i:%f\n",i, Median); + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = Median; + nb++; + }else if(stateArray[i]==3){ + for (j = threadIdx.x; j < blockSize; j += blockDim.x) + *((newData)+i*blockSize+j) = fdata[nc*blockSize+j]; + nc++; + } + } + + //for(int k = 0; k < nbBlocks*blockSize;k++){ +// printf("%f\n", newData[k]); + // } +} + +float* device_ptr_cuSZx_decompress_float(size_t nbEle, unsigned char* cmpBytes) +{ + /** + * Assume the following are device pointers + * + * unsigned char* cmpBytes + * float** newData + * + */ + + uint32_t *blk_idx; + uint8_t *blk_subidx; + uint8_t *blk_sig; + float *blk_vals, *constantMedianArray; + size_t *num_sig, *mSize, mSize_h, num_sig_h; + int *blockSize, bs; + 
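+    /*
+     * Driver outline, as implemented by the calls below: a one-thread
+     * decompress_get_stats kernel parses the header into the device-side scalars
+     * declared here (block size, block counts, number of significant values, mSize);
+     * those scalars are copied back to the host so the per-block work buffers can be
+     * cudaMalloc'd, after which setup_data_stateArray_better, decompress_startup_better,
+     * the decompress_state2/decompress_float kernels and decompress_post_proc_fast
+     * reconstruct the output entirely in device memory.
+     *
+     * Minimal call-site sketch (illustrative; d_cmp must be a device pointer, e.g. the
+     * buffer produced by device_ptr_cuSZx_compress_float):
+     *
+     *   size_t nbEle = ...;           // number of floats in the original array
+     *   unsigned char *d_cmp = ...;   // compressed bytes resident on the GPU
+     *   float *d_out = device_ptr_cuSZx_decompress_float(nbEle, d_cmp);
+     *   // d_out is a freshly cudaMalloc'd device array of nbEle floats
+     */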
size_t *nbConstantBlocks, *nbBlocks, *ncBlocks, nbBlocks_h, ncBlocks_h, nbConstantBlocks_h; + unsigned char *stateArray, *data; + float *newData; + timer_GPU.StartCounter(); + unsigned char *oldCmpBytes = cmpBytes; + //*newData = (float*)malloc(sizeof(float)*nbEle); +// printf("cmpbytes check %d\n", (int)cmpBytes[0]); +// printf("new check %f\n", *newData[0]); + // printf("malloc\n"); + checkCudaErrors(cudaMalloc((void**)&num_sig, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&blockSize, sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&nbConstantBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&nbBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&ncBlocks, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&mSize, sizeof(size_t))); + checkCudaErrors(cudaMalloc((void**)&newData, sizeof(float)*nbEle)); + + decompress_get_stats<<<1,1>>>(newData, nbEle, cmpBytes, + num_sig, blockSize, + nbConstantBlocks, nbBlocks, + mSize, cmpBytes + ); + cudaDeviceSynchronize(); + + cudaError_t err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + checkCudaErrors(cudaMemcpy(&nbBlocks_h, nbBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&nbConstantBlocks_h, nbConstantBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&bs, blockSize, sizeof(int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&mSize_h, mSize, sizeof(size_t), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&num_sig_h, num_sig, sizeof(size_t), cudaMemcpyDeviceToHost)); + + + checkCudaErrors(cudaMalloc((void**)&stateArray, nbBlocks_h)); + checkCudaErrors(cudaMalloc((void**)&constantMedianArray, nbConstantBlocks_h*sizeof(float))); + + checkCudaErrors(cudaMalloc((void**)&blk_idx, nbBlocks_h*sizeof(uint32_t))); + checkCudaErrors(cudaMalloc((void**)&blk_vals, num_sig_h*sizeof(float))); + checkCudaErrors(cudaMalloc((void**)&blk_subidx, num_sig_h*sizeof(uint8_t))); + checkCudaErrors(cudaMalloc((void**)&blk_sig, nbBlocks_h*sizeof(uint8_t))); + + unsigned char* tmp_r = cmpBytes; + unsigned char* newR; + setup_data_stateArray_better(newData, nbEle, tmp_r, + num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, &ncBlocks_h, + stateArray, newR); + + + + // setup_data_stateArray<<<1,1>>>(newData, nbEle, cmpBytes, + // num_sig_h, bs, + // nbConstantBlocks_h, nbBlocks_h, ncBlocks, + // stateArray, cmpBytes + // ); + // cudaDeviceSynchronize(); + + // printf("%s\n", cudaGetErrorString(cudaGetLastError())); + // checkCudaErrors(cudaMemcpy(&ncBlocks_h, ncBlocks, sizeof(size_t), cudaMemcpyDeviceToHost)); + + checkCudaErrors(cudaMalloc((void**)&data, ncBlocks_h*bs*sizeof(float))); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); + // cmpBytes = newCmpBytes; + // data = (unsigned char*)malloc(ncBlocks*blockSize*sizeof(float)); + // memset(data, 0, ncBlocks*blockSize*sizeof(float)); + // stateArray = (unsigned char*)malloc(nbBlocks); + + // // unsigned char* d_stateArray; + // // cudaMalloc(&d_stateArray, nbBlocks); + // constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + // blk_idx = (uint32_t *)malloc(nbBlocks*sizeof(uint32_t)); + // blk_vals= (float *)malloc((num_sig)*sizeof(float)); + // blk_subidx = (uint8_t *)malloc((num_sig)*sizeof(uint8_t)); + // blk_sig = (uint8_t *)malloc(nbBlocks*sizeof(uint8_t)); + + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + //test_nbBlks = (size_t *)malloc(sizeof(size_t)); + 
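+    /*
+     * State-2 ("sparse") block metadata that decompress_startup_better unpacks into
+     * the buffers allocated above, mirroring how compress_float writes them:
+     *
+     *   blk_idx[b]    (uint32)  offset of block b's entries inside blk_vals/blk_subidx
+     *   blk_sig[b]    (uint8)   number of significant values kept for block b
+     *   blk_vals[k]   (float)   the significant values, all sparse blocks concatenated
+     *   blk_subidx[k] (uint8)   position of each value inside its block
+     *
+     * Reconstructing one sparse block is then a scatter into a zeroed block, which is
+     * what the decompress_state2 kernel does (illustrative sketch):
+     *
+     *   uint32_t base = blk_idx[b];
+     *   for (int s = 0; s < blk_sig[b]; s++)
+     *       out[blk_subidx[base + s]] = blk_vals[base + s];   // out[] pre-zeroed
+     */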
// printf("malloc\n"); + + + tmp_r = cmpBytes; + decompress_startup_better(newData, nbEle, tmp_r, + blk_idx, blk_subidx, blk_sig, + blk_vals, num_sig_h, bs, + nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + stateArray, constantMedianArray, data, + mSize_h, newR); + + + // err = cudaGetLastError(); // Get error code + // printf("CUDA start Error: %s\n", cudaGetErrorString(err)); + //decompress_startup<<<1,1>>>(newData, nbEle, cmpBytes, + // blk_idx, blk_subidx, blk_sig, + // blk_vals, num_sig_h, bs, + // nbConstantBlocks_h, nbBlocks_h, ncBlocks_h, + // stateArray, constantMedianArray, data, mSize_h, cmpBytes); + //cudaDeviceSynchronize(); + // cmpBytes = newCmpBytes; + + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + // unsigned char* d_data; + float *d_newdata; + // checkCudaErrors(cudaMalloc((void**)&d_data, ncBlocks*blockSize*sizeof(float))); + // checkCudaErrors(cudaMemcpy(d_data, data, ncBlocks*blockSize*sizeof(float), cudaMemcpyHostToDevice)); + // printf("nblocks: %d bs: %d ncblock %d\n", nbBlocks_h, bs, ncBlocks_h); + checkCudaErrors(cudaMalloc(&d_newdata, nbBlocks_h*bs*sizeof(float))); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); + + dim3 dimBlock(32, bs/32); + dim3 dimGrid(65536, 1); + const int sMemsize = bs * sizeof(float) + dimBlock.y * sizeof(int); + decompress_state2<<>>(d_newdata, stateArray,blk_idx, blk_vals, blk_subidx, bs, blk_sig); + cudaDeviceSynchronize(); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); + decompress_float<<>>(data, bs, ncBlocks_h, mSize_h); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + cudaDeviceSynchronize(); + + // err = cudaGetLastError(); // Get error code + // printf("CUDA dec main Error: %s\n", cudaGetErrorString(err)); + + // checkCudaErrors(cudaMemcpy(data, d_data, ncBlocks*blockSize*sizeof(float), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(newData, d_newdata, nbBlocks_h*bs*sizeof(float), cudaMemcpyDeviceToDevice)); + cudaFree(d_newdata); + + // decompress_post_proc<<<1,1>>>(data, newData, bs, + // nbBlocks_h, ncBlocks_h, stateArray, + // constantMedianArray); + // cudaDeviceSynchronize(); + decompress_post_proc_fast(data, newData, bs, + nbBlocks_h, ncBlocks_h, stateArray, + constantMedianArray); + err = cudaGetLastError(); // Get error code + //printf("CUDA Error: %s\n", cudaGetErrorString(err)); + //printf("GPU decompression timing: %f ms\n", timer_GPU.GetCounter()); + // print_newdata<<<1,1>>>(newData, nbBlocks_h, bs); + cudaFree(stateArray); + cudaFree(constantMedianArray); + cudaFree(data); + cudaFree(blk_idx); + cudaFree(blk_subidx); + cudaFree(blk_vals); + cudaFree(blk_sig); + return newData; + +} + diff --git a/qtensor/compression/szx/src/cuszx_float.cu b/qtensor/compression/szx/src/cuszx_float.cu new file mode 100644 index 00000000..da6022f1 --- /dev/null +++ b/qtensor/compression/szx/src/cuszx_float.cu @@ -0,0 +1,393 @@ +#include +#include +#include "cuszx_float.h" + +#include + +namespace cg = cooperative_groups; + +#define MAX_BLK_SIZE 256 + +__device__ uint32_t num_state2; +__device__ uint64_t total_sig; + +__device__ +void gridReduction_cg(double *results) +{ + int tidx = threadIdx.x; + int tidy = threadIdx.y; + int bid = blockIdx.x; + + if (bid==0){ + double data = results[tidy*gridDim.x+tidx]; + + for (int i=(tidx+blockDim.x); i 0; offset /= 2) + { + if (tidy<2) data = min(data, __shfl_xor_sync(FULL_MASK, data, offset)); + else if 
(tidy<4) data = max(data, __shfl_xor_sync(FULL_MASK, data, offset)); + else data += __shfl_down_sync(FULL_MASK, data, offset); + } + + if (tidx==0) results[tidy*gridDim.x] = data; + } +} + +__device__ void _IntArray2ByteArray(int leadingNum, int mbase, unsigned char* meta) +{ + leadingNum = leadingNum << (3-threadIdx.x%4)*2; + for (int i = 1; i < 4; i *= 2) { + unsigned int mask = 0xffffffff; + leadingNum |= __shfl_down_sync(mask, leadingNum, i); + } + + if (threadIdx.x%4==0) + meta[mbase+threadIdx.y*8+threadIdx.x/4] = (unsigned char)leadingNum; + __syncthreads(); + + +} + +__device__ int _compute_reqLength(int redius, int absErrBound) +{ + int radExpo = (redius & 0x7F800000) >> 23; + radExpo -= 127; + int reqExpo = (absErrBound & 0x7F800000) >> 23; + reqExpo -= 127; + return 9+radExpo-reqExpo+1; +} + +__device__ int _shfl_scan(int lznum, int *sums) +{ + // Below is the basic structure of using a shfl instruction + // for a scan. + // Record "value" as a variable - we accumulate it along the way + int value = lznum; + + // Now accumulate in log steps up the chain + // compute sums, with another thread's value who is + // distance delta away (i). Note + // those threads where the thread 'i' away would have + // been out of bounds of the warp are unaffected. This + // creates the scan sum. + +#pragma unroll + for (int i = 1; i <= warpSize; i *= 2) { + unsigned int mask = 0xffffffff; + int n = __shfl_up_sync(mask, value, i); + + if (threadIdx.x >= i) value += n; + + } + + // value now holds the scan value for the individual thread + // next sum the largest values for each warp + + // write the sum of the warp to smem + if (threadIdx.x == warpSize - 1) { + sums[threadIdx.y] = value; + } + __syncthreads(); + + // + // scan sum the warp sums + // the same shfl scan operation, but performed on warp sums + // + if (threadIdx.y == 0 && threadIdx.x < blockDim.y) { + int warp_sum = sums[threadIdx.x]; + + int mask = (1 << blockDim.y) - 1; + for (int i = 1; i <= blockDim.y; i *= 2) { + //int n = __shfl_up_sync(mask, warp_sum, i, blockDim.y); + int n = __shfl_up_sync(mask, warp_sum, i); + if (threadIdx.x >= i) warp_sum += n; + } + + sums[threadIdx.x] = warp_sum; + } + __syncthreads(); + + // perform a uniform add across warps in the block + // read neighbouring warp's sum and add it to threads value + int blockSum = 0; + if (threadIdx.y > 0) { + blockSum = sums[threadIdx.y - 1]; + } + value += blockSum; + + return value; +} + +__device__ void _compute_oneBlock(unsigned long bbase, int mbase, int obase, int reqLength, float *value, int *ivalue, uchar4 *cvalue, int *sums, unsigned char *meta, short *offsets, unsigned char *midBytes) +{ + int reqBytesLength; + int rightShiftBits; + + + if (reqLength%8 != 0) + { + reqBytesLength = reqLength/8+1; + rightShiftBits = 8 - reqLength%8; + }else{ + reqBytesLength = reqLength/8; + rightShiftBits = 0; + } + + int cur_ivalue = (ivalue[threadIdx.y*blockDim.x+threadIdx.x] >> rightShiftBits) & ((1<<(32-rightShiftBits))-1); + ivalue[threadIdx.y*blockDim.x+threadIdx.x] = cur_ivalue; + __syncthreads(); + + int pre_ivalue = 0; + if (threadIdx.x!=0 || threadIdx.y!=0) pre_ivalue = ivalue[threadIdx.y*blockDim.x+threadIdx.x-1]; + pre_ivalue = cur_ivalue ^ pre_ivalue; + __syncthreads(); + + int leadingNum = 0; + if (reqBytesLength == 2) + { + if (pre_ivalue >> 16 == 0) leadingNum = 2; + else if (pre_ivalue >> 24 == 0) leadingNum = 1; + }else if (reqBytesLength == 3) + { + if (pre_ivalue >> 8 == 0) leadingNum = 3; + else if (pre_ivalue >> 16 == 0) leadingNum = 2; + else if 
(pre_ivalue >> 24 == 0) leadingNum = 1; + }else if (reqBytesLength == 1) + { + if (pre_ivalue >> 24 == 0) leadingNum = 1; + + }else if (reqBytesLength == 4) + { + if (pre_ivalue == 0) leadingNum = 4; + else if (pre_ivalue >> 8 == 0) leadingNum = 3; + else if (pre_ivalue >> 16 == 0) leadingNum = 2; + else if (pre_ivalue >> 24 == 0) leadingNum = 1; + } + //midBytes[bbase+threadIdx.y*blockDim.x+threadIdx.x] = leadingNum; + + int midByte_size = reqBytesLength - leadingNum; + int midByte_sum = _shfl_scan(midByte_size, sums); + uchar4 cur_cvalue = cvalue[threadIdx.y*blockDim.x+threadIdx.x]; + if (reqBytesLength == 2) + { + if (midByte_size == 1){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.z; + }else if (midByte_size == 2){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.w; + midBytes[bbase+midByte_sum-2] = cur_cvalue.z; + } + }else if (reqBytesLength == 3) + { + if (midByte_size == 1){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.y; + }else if (midByte_size == 2){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.z; + midBytes[bbase+midByte_sum-2] = cur_cvalue.y; + }else if (midByte_size == 3){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.w; + midBytes[bbase+midByte_sum-2] = cur_cvalue.z; + midBytes[bbase+midByte_sum-3] = cur_cvalue.y; + } + }else if (reqBytesLength == 1) + { + if (midByte_size == 1) + midBytes[bbase+midByte_sum-1] = cur_cvalue.w; + }else if (reqBytesLength == 4) + { + if (midByte_size == 1){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.x; + }else if (midByte_size == 2){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.y; + midBytes[bbase+midByte_sum-2] = cur_cvalue.x; + }else if (midByte_size == 3){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.z; + midBytes[bbase+midByte_sum-2] = cur_cvalue.y; + midBytes[bbase+midByte_sum-3] = cur_cvalue.x; + }else if (midByte_size == 4){ + midBytes[bbase+midByte_sum-1] = cur_cvalue.w; + midBytes[bbase+midByte_sum-2] = cur_cvalue.z; + midBytes[bbase+midByte_sum-3] = cur_cvalue.y; + midBytes[bbase+midByte_sum-4] = cur_cvalue.x; + } + } + + if (threadIdx.x==0 && threadIdx.y==0) meta[mbase] = (unsigned char)reqLength; + if (threadIdx.x==blockDim.x-1 && threadIdx.y==blockDim.y-1) offsets[obase] = (short)midByte_sum; + _IntArray2ByteArray(leadingNum, mbase+1, meta); + +} + +__global__ void apply_threshold(float *data, float threshold, size_t length){ + + if(threadIdx.x == 0 && blockIdx.x == 0){ + printf("tid threshold: %f\n", threshold); + } + + for (unsigned long tid = threadIdx.x+blockDim.x*blockIdx.x; tid < length; tid+=blockDim.x*gridDim.x) + { + if (fabs(data[tid]) <= threshold) + { + data[tid] = 0.0; + } + } +} + +__global__ void compress_float(float *oriData, unsigned char *meta, short *offsets, unsigned char *midBytes, float absErrBound, int bs, size_t nb, size_t mSize, float sparsity_level, uint32_t *blk_idx, uint8_t *blk_subidx,float *blk_vals, float threshold, uint8_t *blk_sig) +{ + int tidx = threadIdx.x; + int tidy = threadIdx.y; + int bid = blockIdx.x; + + float data, radius, medianValue; + unsigned mask; + unsigned char state; + extern __shared__ float shared[]; + + __shared__ float block_vals[MAX_BLK_SIZE]; + __shared__ uint8_t block_idxs[MAX_BLK_SIZE]; + __shared__ int num_sig; + __shared__ int index; + float* value = shared; + int* ivalue = (int*)shared; + uchar4* cvalue = (uchar4*)shared; + int* sums = &ivalue[bs]; + + if(threadIdx.x == 0 && blockIdx.x == 0){ + num_state2=0; + total_sig=0; + } + + for (unsigned long b=bid; b threshold) + { + int idx = atomicAdd(&num_sig, 1); + block_vals[idx] = oriData[i]; + block_idxs[idx] = (uint8_t) 
(0xff & (i - (b*bs))); + }else{ + oriData[i] = 0.0; + } + //if(fabs(old) > threshold && oriData[i] ==0.0){ + //printf("something wrong\n"); + //} + } + __syncthreads(); + + data = oriData[b*bs+tidy*warpSize+tidx]; + float Min = data; + float Max = data; + + for (int offset = warpSize/2; offset > 0; offset /= 2) + { + Min = min(Min, __shfl_xor_sync(FULL_MASK, Min, offset)); + Max = max(Max, __shfl_xor_sync(FULL_MASK, Max, offset)); + } + if (tidx==0){ + value[tidy] = Min; + value[blockDim.y+tidy] = Max; + } + __syncthreads(); + + if (tidy==0){ + if (tidx < blockDim.y){ + Min = value[tidx]; + Max = value[blockDim.y+tidx]; + } + + mask = __ballot_sync(FULL_MASK, tidx < blockDim.y); + for (int offset = blockDim.y/2; offset > 0; offset /= 2) + { + Min = min(Min, __shfl_xor_sync(mask, Min, offset)); + Max = max(Max, __shfl_xor_sync(mask, Max, offset)); + } + + if (tidx==0){ + radius = (Max - Min)/2; + value[0] = radius; + value[1] = Min + radius; + value[2] = absErrBound; + } + } + __syncthreads(); + + radius = value[0]; + medianValue = value[1]; + + if (num_sig==0) + { + state = 1; // All zeros + }else if( num_sig > 0 && radius <= absErrBound){ + state = 0; // Constant block with non zeros + } else if( ((float) num_sig/(float)bs) <= sparsity_level && num_sig > 0){ + state = 2; // Do grouping, store as-is with bitmap/index + } else{ + state = 3; // Do normal non-constant block + } + + + // state = radius <= absErrBound ? 0 : 1; + if (tidx==0){ + + meta[b] = state; + meta[nb+b*mSize] = cvalue[1].x; + meta[nb+b*mSize+1] = cvalue[1].y; + meta[nb+b*mSize+2] = cvalue[1].z; + meta[nb+b*mSize+3] = cvalue[1].w; + } + __syncthreads(); + int tid = tidx + tidy*blockDim.x; + //if(tid == 0) printf("s %d %d\n", b, (int)state); + if (state==2) + { + int idx = 0; + if (tidx ==0 && tidy == 0) + { + //printf("level: %f\n", ((float)num_sig/(float)bs)); + idx = atomicAdd(&num_state2, (uint32_t)num_sig); + blk_idx[b] = idx; // Store the index of where this block has values and indices within block + blk_sig[b] = (uint8_t) 0xff & num_sig; + index = idx; + } + __syncthreads(); + idx = index; + for (int i = tid; i < num_sig; i+=blockDim.x*blockDim.y) + { + blk_vals[idx+i] = block_vals[i]; // Store the value of the significant data point in the block + blk_subidx[idx+i] = block_idxs[i]; // Store the byte value of index within block of significant data point + //printf("blk %f %f , ind %d\n", block_vals[i], block_idxs[i], idx); + } + + } + + + if (state==3){ + int reqLength = _compute_reqLength(ivalue[0], ivalue[2]); + __syncthreads(); + value[tidy*blockDim.x+tidx] = data - medianValue; + __syncthreads(); + _compute_oneBlock(b*bs*sizeof(float), nb+b*mSize+4, b, reqLength, value, ivalue, cvalue, sums, meta, offsets, midBytes); + } + + } + +} + +__global__ void get_numsig(uint64_t *num_sig){ + *num_sig = (uint64_t)num_state2; +} diff --git a/qtensor/compression/szx/src/cuszx_wrapper.cu b/qtensor/compression/szx/src/cuszx_wrapper.cu new file mode 100644 index 00000000..b68ac8c9 --- /dev/null +++ b/qtensor/compression/szx/src/cuszx_wrapper.cu @@ -0,0 +1,41 @@ +#include "cuszx_entry.h" +#include "szx_defines.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include "timingGPU.h" + +extern "C"{ + unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize){ + float max,min; + unsigned char* bytes; + max = data[0]; + min = data[0]; + for (size_t i = 0; i < nbEle; i++) + { + if(data[i] > max) max = data[i]; + if(data[i] < min) min = 
data[i]; + } + + float threshold = r2r_threshold*(max-min); + float errBound = r2r_err*(max-min); + bytes = cuSZx_fast_compress_args_unpredictable_blocked_float(data, outSize, errBound, nbEle, blockSize, threshold); + // printf("outSize %p\n", bytes); + return bytes; + } + + float* cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle){ + // printf("test\n"); + float**data; + cuSZx_fast_decompress_args_unpredictable_blocked_float(data, nbEle, bytes); + return *data; + } + + unsigned char* cuSZx_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold){ + return device_ptr_cuSZx_compress_float(oriData, outSize, absErrBound, nbEle, blockSize, threshold); + } + + float* cuSZx_device_decompress(size_t nbEle, unsigned char* cmpBytes){ + return device_ptr_cuSZx_decompress_float(nbEle, cmpBytes); + } + +} diff --git a/qtensor/compression/szx/src/cuszx_wrapper.py b/qtensor/compression/szx/src/cuszx_wrapper.py new file mode 100644 index 00000000..11e81223 --- /dev/null +++ b/qtensor/compression/szx/src/cuszx_wrapper.py @@ -0,0 +1,220 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libcuszx_wrapper.so') +# LIB_PATH='/home/mkshah5/QTensor/qtensor/compression/szx/src/libcuszx_wrapper.so' +# unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) + +def get_host_compress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZx_integrated_compress + # Returns: unsigned char *bytes + # Needs: float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize + func.argtypes = [POINTER(c_float), c_float, c_float, c_size_t, c_int, POINTER(c_size_t)] + func.restype = POINTER(c_ubyte) + return func + +# float* cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle) + +def get_host_decompress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZx_integrated_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [POINTER(c_ubyte), c_size_t] + func.restype = POINTER(c_float) + return func + +def get_device_compress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZx_device_compress + # Returns: unsigned char *bytes + # Needs: float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_float, c_size_t, c_int, c_float] + func.restype = POINTER(c_ubyte) + return func + +def get_device_decompress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZx_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [c_size_t, POINTER(c_ubyte)] + func.restype = POINTER(c_float) + return func + + +def cuszx_host_compress(oriData, absErrBound, nbEle, blockSize,threshold): + __cuszx_host_compress = get_host_compress() + + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData_p = ctypes.cast(oriD.data.ptr, ctypes.POINTER(c_float)) + + o_bytes = __cuszx_host_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle), np.int32(blockSize),np.float32(threshold)) + + return o_bytes, outSize + +def cuszx_host_decompress(nbEle, cmpBytes): + 
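+    # Thin ctypes binding over the extern "C" cuSZx_integrated_decompress in
+    # cuszx_wrapper.cu; the loader above declares argtypes [POINTER(c_ubyte), c_size_t]
+    # and restype POINTER(c_float), i.e. it returns a pointer to the reconstructed floats.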
__cuszx_host_decompress=get_host_decompress() + + nbEle_p = ctypes.c_size_t(nbEle) + newData = __cuszx_host_decompress(nbEle_p,cmpBytes) + return newData + + +def cuszx_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + #absErrBound = absErrBound*(cp.amax(oriData.get())-cp.amin(oriData.get())) + #threshold = threshold*(cp.amax(oriData.get())-cp.amin(oriData.get())) + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + # print(oriData.dtype) + sample = oriData[::2] + + #torch_tensor = torch.as_tensor(sample, device='cuda') + #d = torch.max(torch_tensor).item() - torch.min(torch_tensor).item() + #s_sample = cp.sort(sample) + #d = s_sample[-1] - s_sample[0] + #v_time = time.time() + #print(type(oriData)) + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) + d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real + absErrBound = absErrBound*(d) + threshold = threshold*(d) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + oriData = oriData[truth_values] + bitmap = truth_values + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") + o_bytes = __cuszx_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle), np.int32(blockSize),np.float32(threshold)) + + #print("tg and max time (s): "+str(time.time()-s_1)) + #print("bitmap shape: "+str(bitmap.shape[0])) + #print("percent nonzero bytes: "+str(bitmap[cp.nonzero(bitmap)].shape[0]/bitmap.shape[0])) + #print("CR") + print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) + return (o_bytes,bitmap), outSize + + +def cuszx_device_decompress(nbEle, cmpBytes, owner, dtype): + __cuszx_device_decompress=get_device_decompress() + (cmpBytes, bitmap) = cmpBytes + #print("bitmap len:" +str(len(bitmap))) + #print(nbEle) + tmp_nbEle = cp.count_nonzero(bitmap).item() + #print(tmp_nbEle) + nbEle_p = ctypes.c_size_t(tmp_nbEle) + newData = __cuszx_device_decompress(nbEle_p,cmpBytes) + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, tmp_nbEle, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(tmp_nbEle,), dtype=np.float32, memptr=mem_ptr) + + res = cp.zeros((nbEle,)) + ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) 
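+    # Same workaround as in cuszx_device_decompress above: `ptr` is a ctypes pointer
+    # object, so reading the 8 bytes at addressof(ptr) as a uint64 recovers the raw
+    # CUDA device address, which cp.cuda.runtime.free() can then release.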
+ p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = cuszx_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cuszx_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + free_compressed(o_bytes[0]) + cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/szx/src/cuszxd_float.cu b/qtensor/compression/szx/src/cuszxd_float.cu new file mode 100644 index 00000000..3edd1ee3 --- /dev/null +++ b/qtensor/compression/szx/src/cuszxd_float.cu @@ -0,0 +1,341 @@ +#include +#include +#include "cuszxd_float.h" + +#include + +namespace cg = cooperative_groups; + +__device__ int _deshfl_scan(int lznum, int *sums) +{ + // Below is the basic structure of using a shfl instruction + // for a scan. + // Record "value" as a variable - we accumulate it along the way + int value = lznum; + + // Now accumulate in log steps up the chain + // compute sums, with another thread's value who is + // distance delta away (i). Note + // those threads where the thread 'i' away would have + // been out of bounds of the warp are unaffected. This + // creates the scan sum. 
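+    /*
+     * Compact form of the warp-level inclusive scan implemented below (illustrative;
+     * `lane` stands for threadIdx.x within the warp):
+     *
+     *   int v = lznum;
+     *   for (int d = 1; d < warpSize; d <<= 1) {
+     *       int n = __shfl_up_sync(0xffffffff, v, d);
+     *       if (lane >= d) v += n;
+     *   }
+     *   // v now holds the inclusive prefix sum over the warp; the per-warp totals
+     *   // written to `sums` are then scanned the same way and added back per warp.
+     */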
+ +#pragma unroll + for (int i = 1; i <= warpSize; i *= 2) { + unsigned int mask = 0xffffffff; + int n = __shfl_up_sync(mask, value, i); + + if (threadIdx.x >= i) value += n; + + } + + // value now holds the scan value for the individual thread + // next sum the largest values for each warp + + // write the sum of the warp to smem + if (threadIdx.x == warpSize - 1) { + sums[threadIdx.y] = value; + } + __syncthreads(); + + // + // scan sum the warp sums + // the same shfl scan operation, but performed on warp sums + // + if (threadIdx.y == 0 && threadIdx.x < blockDim.y) { + int warp_sum = sums[threadIdx.x]; + + int mask = (1 << blockDim.y) - 1; + for (int i = 1; i <= blockDim.y; i *= 2) { + //int n = __shfl_up_sync(mask, warp_sum, i, blockDim.y); + int n = __shfl_up_sync(mask, warp_sum, i); + if (threadIdx.x >= i) warp_sum += n; + } + + sums[threadIdx.x] = warp_sum; + } + __syncthreads(); + + // perform a uniform add across warps in the block + // read neighbouring warp's sum and add it to threads value + int blockSum = 0; + if (threadIdx.y > 0) { + blockSum = sums[threadIdx.y - 1]; + } + value += blockSum; + + return value; +} + +__device__ int _compareByte(int pre, int cur, int reqBytesLength) +{ + if (reqBytesLength == 2) + { + if ((pre&0x0000ff00) > (cur&0x0000ff00)){ + cur &= 0x000000ff; + cur |= (pre & 0x0000ff00); + } + if ((pre&0x000000ff) > (cur&0x000000ff)){ + cur &= 0x0000ff00; + cur |= (pre & 0x000000ff); + } + }else if (reqBytesLength == 3) + { + if ((pre&0x00ff0000) > (cur&0x00ff0000)){ + cur &= 0x0000ffff; + cur |= (pre & 0x00ff0000); + } + if ((pre&0x0000ff00) > (cur&0x0000ff00)){ + cur &= 0x00ff00ff; + cur |= (pre & 0x0000ff00); + } + if ((pre&0x000000ff) > (cur&0x000000ff)){ + cur &= 0x00ffff00; + cur |= (pre & 0x000000ff); + } + }else if (reqBytesLength == 1) + { + if (pre > cur) + cur = pre; + }else if (reqBytesLength == 4) + { + if ((pre&0xff000000) > (cur&0xff000000)){ + cur &= 0x00ffffff; + cur |= (pre & 0xff000000); + } + if ((pre&0x00ff0000) > (cur&0x00ff0000)){ + cur &= 0xff00ffff; + cur |= (pre & 0x00ff0000); + } + if ((pre&0x0000ff00) > (cur&0x0000ff00)){ + cur &= 0xffff00ff; + cur |= (pre & 0x0000ff00); + } + if ((pre&0x000000ff) > (cur&0x000000ff)){ + cur &= 0xffffff00; + cur |= (pre & 0x000000ff); + } + } + return cur; +} + +__device__ int _retrieve_leading(int pos, int reqBytesLength, int* sums) +{ +#pragma unroll + for (int i = 1; i <= warpSize; i *= 2) { + unsigned int mask = 0xffffffff; + int n = __shfl_up_sync(mask, pos, i); + if (threadIdx.x >= i) + pos = _compareByte(n, pos, reqBytesLength); + } + + if (threadIdx.x == warpSize - 1) + sums[threadIdx.y] = pos; + __syncthreads(); + + if (threadIdx.y == 0 && threadIdx.x < blockDim.y) { + int warp_pos = sums[threadIdx.x]; + + int mask = (1 << blockDim.y) - 1; + for (int i = 1; i <= blockDim.y; i *= 2) { + int n = __shfl_up_sync(mask, warp_pos, i); + if (threadIdx.x >= i) + warp_pos = _compareByte(n, warp_pos, reqBytesLength); + } + + sums[threadIdx.x] = warp_pos; + } + __syncthreads(); + + if (threadIdx.y > 0) { + int block_pos = sums[threadIdx.y - 1]; + pos = _compareByte(block_pos, pos, reqBytesLength); + } + + return pos; +} + +#define MAX_BLK_SIZE 256 + +__global__ void decompress_state2(float *out, unsigned char* stateArray, uint32_t *blk_idx, float *blk_vals, uint8_t *blk_subidx,uint32_t blockSize, uint8_t *blk_sig){ + int bid = blockIdx.x; + uint8_t state = stateArray[bid]; + + __shared__ float block_vals[MAX_BLK_SIZE]; + __shared__ uint8_t block_subidx[MAX_BLK_SIZE]; + // __shared__ char 
idx_taken[MAX_BLK_SIZE]; + __shared__ float s_out[MAX_BLK_SIZE]; + __shared__ int sig_count; + if (state != 2) + { + return; + } + + int local_sig = blk_sig[bid]; + int idx = blk_idx[bid]; + + for (size_t i = threadIdx.x; i < local_sig; i+=blockDim.x) + { + block_vals[i] = blk_vals[idx+i]; + block_subidx[i]=blk_subidx[idx+i]; + // idx_taken[block_subidx[i]] = 1; + atomicAdd(&sig_count, 1); + + } + + __syncthreads(); + + for (size_t i = threadIdx.x; i < blockSize; i+=blockDim.x) + { + s_out[i] = 0.0; + } + + __syncthreads(); + for (size_t i = threadIdx.x; i < local_sig; i+=blockDim.x) + { + s_out[block_subidx[i]] = block_vals[i]; + } + __syncthreads(); + for (size_t i = threadIdx.x; i < blockSize; i+=blockDim.x) + { + out[bid*blockSize+i] = s_out[i]; + } +} + +__global__ void decompress_float(unsigned char *data, int bs, size_t nc, size_t mSize) +{ + int tidx = threadIdx.x; + int tidy = threadIdx.y; + int tid = tidy*warpSize+tidx; + int bid = blockIdx.x; + + float medianValue; + unsigned char leadingNum; + extern __shared__ float shared[]; + float* value = shared; + int* ivalue = (int*)shared; + uchar4* c4value = (uchar4*)shared; + unsigned char* cvalue = (unsigned char*)shared; + int* sums = &ivalue[bs]; + int reqLength; + float* fbytes = (float*)data; + int reqBytesLength; + int rightShiftBits; + + + bool bi = false; + for (int b=bid; b>2)]; + leadingNum = (leadingNum >> (6-((tid&0x03)<<1))) & 0x03; + int midByte_size = reqBytesLength - leadingNum; + int midByte_sum = _deshfl_scan(midByte_size, sums); + + uchar4 tmp; + tmp.x = 0; + tmp.y = 0; + tmp.z = 0; + tmp.w = 0; + int pos = 0; + if (reqBytesLength == 2) + { + if (midByte_size == 1){ + tmp.z = cvalue[mSize+midByte_sum-1]; + pos |= tid<<8; + }else if (midByte_size == 2){ + tmp.w = cvalue[mSize+midByte_sum-1]; + tmp.z = cvalue[mSize+midByte_sum-2]; + pos |= tid; + pos |= tid<<8; + } + }else if (reqBytesLength == 3) + { + if (midByte_size == 1){ + tmp.y = cvalue[mSize+midByte_sum-1]; + pos |= tid<<16; + }else if (midByte_size == 2){ + tmp.z = cvalue[mSize+midByte_sum-1]; + tmp.y = cvalue[mSize+midByte_sum-2]; + pos |= tid<<8; + pos |= tid<<16; + }else if (midByte_size == 3){ + tmp.w = cvalue[mSize+midByte_sum-1]; + tmp.z = cvalue[mSize+midByte_sum-2]; + tmp.y = cvalue[mSize+midByte_sum-3]; + pos |= tid; + pos |= tid<<8; + pos |= tid<<16; + } + }else if (reqBytesLength == 1) + { + if (midByte_size == 1) + tmp.w = cvalue[mSize+midByte_sum-1]; + pos |= tid; + }else if (reqBytesLength == 4) + { + if (midByte_size == 1){ + tmp.x = cvalue[mSize+midByte_sum-1]; + pos |= tid<<24; + }else if (midByte_size == 2){ + tmp.y = cvalue[mSize+midByte_sum-1]; + tmp.x = cvalue[mSize+midByte_sum-2]; + pos |= tid<<16; + pos |= tid<<24; + }else if (midByte_size == 3){ + tmp.z = cvalue[mSize+midByte_sum-1]; + tmp.y = cvalue[mSize+midByte_sum-2]; + tmp.x = cvalue[mSize+midByte_sum-3]; + pos |= tid<<8; + pos |= tid<<16; + pos |= tid<<24; + }else if (midByte_size == 4){ + tmp.w = cvalue[mSize+midByte_sum-1]; + tmp.z = cvalue[mSize+midByte_sum-2]; + tmp.y = cvalue[mSize+midByte_sum-3]; + tmp.x = cvalue[mSize+midByte_sum-4]; + pos |= tid; + pos |= tid<<8; + pos |= tid<<16; + pos |= tid<<24; + } + } + __syncthreads(); + c4value[tid] = tmp; + + pos = _retrieve_leading(pos, reqBytesLength, sums); + + if (leadingNum == 2){ + tmp.w = c4value[pos&0xff].w; + tmp.z = c4value[(pos>>8)&0xff].z; + }else if (leadingNum == 3){ + tmp.w = c4value[pos&0xff].w; + tmp.z = c4value[(pos>>8)&0xff].z; + tmp.y = c4value[(pos>>16)&0xff].y; + }else if (leadingNum == 1){ + tmp.w = 
c4value[pos&0xff].w; + }else if (leadingNum == 4){ + tmp.w = c4value[pos&0xff].w; + tmp.z = c4value[(pos>>8)&0xff].z; + tmp.y = c4value[(pos>>16)&0xff].y; + tmp.x = c4value[pos>>24].x; + } + c4value[tid] = tmp; + __syncthreads(); + ivalue[tid] = ivalue[tid] << rightShiftBits; + + fbytes[b*bs+tid] = value[tid] + medianValue; + } +} diff --git a/qtensor/compression/szx/src/pred_quant.c b/qtensor/compression/szx/src/pred_quant.c new file mode 100644 index 00000000..e69de29b diff --git a/qtensor/compression/szx/src/szx.c b/qtensor/compression/szx/src/szx.c new file mode 100644 index 00000000..ed4d1bef --- /dev/null +++ b/qtensor/compression/szx/src/szx.c @@ -0,0 +1,439 @@ +/** + * @file sz.c + * @author Sheng Di + * @date Jan, 2022 + * @brief + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + + +#include +#include +#include +#include +#include "szx.h" +#include "szx_rw.h" + +int versionNumber[4] = {SZx_VER_MAJOR,SZx_VER_MINOR,SZx_VER_BUILD,SZx_VER_REVISION}; + +int dataEndianType = LITTLE_ENDIAN_DATA; //*endian type of the data read from disk +int sysEndianType = LITTLE_ENDIAN_SYSTEM; //*sysEndianType is actually set automatically. + +int computeDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1) +{ + int dimension; + if(r1==0) + { + dimension = 0; + } + else if(r2==0) + { + dimension = 1; + } + else if(r3==0) + { + dimension = 2; + } + else if(r4==0) + { + dimension = 3; + } + else if(r5==0) + { + dimension = 4; + } + else + { + dimension = 5; + } + return dimension; +} + +size_t computeDataLength(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1) +{ + size_t dataLength; + if(r1==0) + { + dataLength = 0; + } + else if(r2==0) + { + dataLength = r1; + } + else if(r3==0) + { + dataLength = r1*r2; + } + else if(r4==0) + { + dataLength = r1*r2*r3; + } + else if(r5==0) + { + dataLength = r1*r2*r3*r4; + } + else + { + dataLength = r1*r2*r3*r4*r5; + } + return dataLength; +} + +/** + * @brief check dimension and correct it if needed + * @return 0 (didn't change dimension) + * 1 (dimension is changed) + * 2 (dimension is problematic) + **/ +int filterDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t* correctedDimension) +{ + int dimensionCorrected = 0; + int dim = computeDimension(r5, r4, r3, r2, r1); + correctedDimension[0] = r1; + correctedDimension[1] = r2; + correctedDimension[2] = r3; + correctedDimension[3] = r4; + correctedDimension[4] = r5; + size_t* c = correctedDimension; + if(dim==1) + { + if(r1<1) + return 2; + } + else if(dim==2) + { + if(r2==1) + { + c[1]= 0; + dimensionCorrected = 1; + } + if(r1==1) //remove this dimension + { + c[0] = c[1]; + c[1] = c[2]; + dimensionCorrected = 1; + } + } + else if(dim==3) + { + if(r3==1) + { + c[2] = 0; + dimensionCorrected = 1; + } + if(r2==1) + { + c[1] = c[2]; + c[2] = c[3]; + dimensionCorrected = 1; + } + if(r1==1) + { + c[0] = c[1]; + c[1] = c[2]; + c[2] = c[3]; + dimensionCorrected = 1; + } + } + else if(dim==4) + { + if(r4==1) + { + c[3] = 0; + dimensionCorrected = 1; + } + if(r3==1) + { + c[2] = c[3]; + c[3] = c[4]; + dimensionCorrected = 1; + } + if(r2==1) + { + c[1] = c[2]; + c[2] = c[3]; + c[3] = c[4]; + dimensionCorrected = 1; + } + if(r1==1) + { + c[0] = c[1]; + c[1] = c[2]; + c[2] = c[3]; + c[3] = c[4]; + dimensionCorrected = 1; + } + } + else if(dim==5) + { + if(r5==1) + { + c[4] = 0; + dimensionCorrected = 1; + } + if(r4==1) + { + c[3] = c[4]; + c[4] = 0; + dimensionCorrected = 1; + } + if(r3==1) + { + c[2] = c[3]; 
+ c[3] = c[4]; + c[4] = 0; + dimensionCorrected = 1; + } + if(r2==1) + { + c[1] = c[2]; + c[2] = c[3]; + c[3] = c[4]; + c[4] = 0; + dimensionCorrected = 1; + } + if(r1==1) + { + c[0] = c[1]; + c[1] = c[2]; + c[2] = c[3]; + c[3] = c[4]; + c[4] = 0; + dimensionCorrected = 1; + } + } + + return dimensionCorrected; + +} + +unsigned char* SZ_fast_compress_args(int fastMode, int dataType, void *data, size_t *outSize, int errBoundMode, float absErrBound, +float relBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1) +{ + unsigned char* bytes = NULL; + size_t length = computeDataLength(r5, r4, r3, r2, r1); + size_t i = 0; + + if(dataType == SZ_FLOAT) + { + if(fastMode == SZx_WITH_BLOCK_FAST_CMPR || fastMode == SZx_RANDOMACCESS_FAST_CMPR || fastMode == SZx_OPENMP_FAST_CMPR) + { + float realPrecision = absErrBound; + if(errBoundMode==REL) + { + float* oriData = (float*)data; + float min = oriData[0]; + float max = oriData[0]; + for(i=0;iv) + min = v; + else if(maxv) + min = v; + else if(maxv) + min = v; + else if(maxv) + min = v; + else if(max +#include "szx.h" +#include "szx_BytesToolkit.h" +#include "szx_dataCompression.h" + +inline void sz_writeBits_Fast_int8(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, unsigned char data) +{ + unsigned char mask = (1 << numBits)-1; + *(buffer + ((*bitPosPtr)>>3)) |= (data & mask) << ((*bitPosPtr) & (uint64_t)0x0000000000000007); + (*bitPosPtr) += numBits; +} + +inline void sz_writeBits_Fast_int32(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, int32_t data) +{ + uint32_t mask = (1 << numBits)-1; + *(uint32_t*)(buffer + ((*bitPosPtr)>>3)) |= ((*(uint32_t*)&data)&mask) << ((*bitPosPtr) & (uint64_t)0x0000000000000007); + (*bitPosPtr) += numBits; +} + +inline void sz_writeBits_Fast_int64(unsigned char* buffer,uint64_t *bitPosPtr, int numBits, int64_t data) +{ + uint64_t mask = ((uint64_t)0x0000000000000001<>3)) |= ((*(uint64_t*)&data)&mask) << ((*bitPosPtr) & (uint64_t)0x0000000000000007); + (*bitPosPtr) += numBits; +} + + +inline unsigned short bytesToUInt16_bigEndian(unsigned char* bytes) +{ + int temp = 0; + unsigned short res = 0; + + temp = bytes[0] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[1] & 0xff; + res |= temp; + + return res; +} + +inline unsigned int bytesToUInt32_bigEndian(unsigned char* bytes) +{ + unsigned int temp = 0; + unsigned int res = 0; + + res <<= 8; + temp = bytes[0] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[1] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[2] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[3] & 0xff; + res |= temp; + + return res; +} + +inline unsigned long bytesToUInt64_bigEndian(unsigned char* b) { + unsigned long temp = 0; + unsigned long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +inline short bytesToInt16_bigEndian(unsigned char* bytes) +{ + int temp = 0; + short res = 0; + + temp = bytes[0] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[1] & 0xff; + res |= temp; + + return res; +} + +inline int bytesToInt32_bigEndian(unsigned char* bytes) +{ + int temp = 0; + int res = 0; + + res <<= 8; + temp = bytes[0] & 0xff; + res |= temp; + + res <<= 8; + temp = 
bytes[1] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[2] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[3] & 0xff; + res |= temp; + + return res; +} + +inline long bytesToInt64_bigEndian(unsigned char* b) { + long temp = 0; + long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +inline int bytesToInt_bigEndian(unsigned char* bytes) +{ + int temp = 0; + int res = 0; + + res <<= 8; + temp = bytes[0] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[1] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[2] & 0xff; + res |= temp; + + res <<= 8; + temp = bytes[3] & 0xff; + res |= temp; + + return res; +} + +/** + * @unsigned char *b the variable to store the converted bytes (length=4) + * @unsigned int num + * */ +inline void intToBytes_bigEndian(unsigned char *b, unsigned int num) +{ + b[0] = (unsigned char)(num >> 24); + b[1] = (unsigned char)(num >> 16); + b[2] = (unsigned char)(num >> 8); + b[3] = (unsigned char)(num); + + //note: num >> xxx already considered endian_type... +//if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_4bytes(*b); //change to BIG_ENDIAN_DATA +} + +inline void int64ToBytes_bigEndian(unsigned char *b, uint64_t num) +{ + b[0] = (unsigned char)(num>>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +} + +inline void int32ToBytes_bigEndian(unsigned char *b, uint32_t num) +{ + b[0] = (unsigned char)(num >> 24); + b[1] = (unsigned char)(num >> 16); + b[2] = (unsigned char)(num >> 8); + b[3] = (unsigned char)(num); +} + +inline void int16ToBytes_bigEndian(unsigned char *b, uint16_t num) +{ + b[0] = (unsigned char)(num >> 8); + b[1] = (unsigned char)(num); +} + +/** + * @endianType: refers to the endian_type of unsigned char* b. + * */ +inline long bytesToLong_bigEndian(unsigned char* b) { + long temp = 0; + long res = 0; + + res <<= 8; + temp = b[0] & 0xff; + res |= temp; + + res <<= 8; + temp = b[1] & 0xff; + res |= temp; + + res <<= 8; + temp = b[2] & 0xff; + res |= temp; + + res <<= 8; + temp = b[3] & 0xff; + res |= temp; + + res <<= 8; + temp = b[4] & 0xff; + res |= temp; + + res <<= 8; + temp = b[5] & 0xff; + res |= temp; + + res <<= 8; + temp = b[6] & 0xff; + res |= temp; + + res <<= 8; + temp = b[7] & 0xff; + res |= temp; + + return res; +} + +inline void longToBytes_bigEndian(unsigned char *b, unsigned long num) +{ + b[0] = (unsigned char)(num>>56); + b[1] = (unsigned char)(num>>48); + b[2] = (unsigned char)(num>>40); + b[3] = (unsigned char)(num>>32); + b[4] = (unsigned char)(num>>24); + b[5] = (unsigned char)(num>>16); + b[6] = (unsigned char)(num>>8); + b[7] = (unsigned char)(num); +// if(dataEndianType==LITTLE_ENDIAN_DATA) +// symTransform_8bytes(*b); +} + + +inline long doubleToOSEndianLong(double value) +{ + ldouble buf; + buf.value = value; + return buf.lvalue; +} + +inline int floatToOSEndianInt(float value) +{ + lfloat buf; + buf.value = value; + return buf.ivalue; +} + +//TODO: debug: lfBuf.lvalue could be actually little_endian.... 
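+/*
+ * The exponent helpers below mirror the device-side _compute_reqLength() in
+ * cuszx_float.cu: for a non-constant block, the number of bits kept per value is
+ * derived from the exponents of the block radius and of the absolute error bound.
+ * Illustrative sketch (hypothetical variable names):
+ *
+ *   short radExpo = getExponent_float(radius);       // radius = (max - min) / 2 of the block
+ *   short reqExpo = getExponent_float(absErrBound);  // exponent of the error bound
+ *   int   reqLength = 9 + radExpo - reqExpo + 1;     // 1 sign bit + 8 exponent bits + kept mantissa bits
+ */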
+inline short getExponent_float(float value) +{ + //int ivalue = floatToBigEndianInt(value); + + lfloat lbuf; + lbuf.value = value; + int ivalue = lbuf.ivalue; + + int expValue = (ivalue & 0x7F800000) >> 23; + expValue -= 127; + return (short)expValue; +} + +inline short getPrecisionReqLength_float(float precision) +{ + lfloat lbuf; + lbuf.value = precision; + int ivalue = lbuf.ivalue; + + int expValue = (ivalue & 0x7F800000) >> 23; + expValue -= 127; +// unsigned char the1stManBit = (unsigned char)((ivalue & 0x00400000) >> 22); +// if(the1stManBit==1) +// expValue--; + return (short)expValue; +} + +inline short getExponent_double(double value) +{ + //long lvalue = doubleToBigEndianLong(value); + + ldouble lbuf; + lbuf.value = value; + long lvalue = lbuf.lvalue; + + int expValue = (int)((lvalue & 0x7FF0000000000000) >> 52); + expValue -= 1023; + return (short)expValue; +} + +inline short getPrecisionReqLength_double(double precision) +{ + ldouble lbuf; + lbuf.value = precision; + long lvalue = lbuf.lvalue; + + int expValue = (int)((lvalue & 0x7FF0000000000000) >> 52); + expValue -= 1023; +// unsigned char the1stManBit = (unsigned char)((lvalue & 0x0008000000000000) >> 51); +// if(the1stManBit==1) +// expValue--; + return (short)expValue; +} + +inline unsigned char numberOfLeadingZeros_Int(int i) { + if (i == 0) + return 32; + unsigned char n = 1; + if (((unsigned int)i) >> 16 == 0) { n += 16; i <<= 16; } + if (((unsigned int)i) >> 24 == 0) { n += 8; i <<= 8; } + if (((unsigned int)i) >> 28 == 0) { n += 4; i <<= 4; } + if (((unsigned int)i) >> 30 == 0) { n += 2; i <<= 2; } + n -= ((unsigned int)i) >> 31; + return n; +} + +inline unsigned char numberOfLeadingZeros_Long(long i) { + if (i == 0) + return 64; + unsigned char n = 1; + int x = (int)(((unsigned long)i) >> 32); + if (x == 0) { n += 32; x = (int)i; } + if (((unsigned int)x) >> 16 == 0) { n += 16; x <<= 16; } + if (((unsigned int)x) >> 24 == 0) { n += 8; x <<= 8; } + if (((unsigned int)x) >> 28 == 0) { n += 4; x <<= 4; } + if (((unsigned int)x) >> 30 == 0) { n += 2; x <<= 2; } + n -= ((unsigned int)x) >> 31; + return n; +} + +inline unsigned char getLeadingNumbers_Int(int v1, int v2) +{ + int v = v1 ^ v2; + return (unsigned char)numberOfLeadingZeros_Int(v); +} + +inline unsigned char getLeadingNumbers_Long(long v1, long v2) +{ + long v = v1 ^ v2; + return (unsigned char)numberOfLeadingZeros_Long(v); +} + +/** + * By default, the endian type is OS endian type. 
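+ * (bytesToShort/shortToBytes and bytesToInt/bytesToLong below copy raw bytes through
+ * the lint16/lfloat/ldouble unions in native byte order, whereas the *_bigEndian
+ * helpers above always assemble the value most-significant byte first; bytesToFloat
+ * and bytesToDouble additionally byte-swap on little-endian systems because their
+ * input is expected in big-endian format.)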
+ * */ +inline short bytesToShort(unsigned char* bytes) +{ + lint16 buf; + memcpy(buf.byte, bytes, 2); + + return buf.svalue; +} + +inline void shortToBytes(unsigned char* b, short value) +{ + lint16 buf; + buf.svalue = value; + memcpy(b, buf.byte, 2); +} + +inline int bytesToInt(unsigned char* bytes) +{ + lfloat buf; + memcpy(buf.byte, bytes, 4); + return buf.ivalue; +} + +inline long bytesToLong(unsigned char* bytes) +{ + ldouble buf; + memcpy(buf.byte, bytes, 8); + return buf.lvalue; +} + +//the byte to input is in the big-endian format +inline float bytesToFloat(unsigned char* bytes) +{ + lfloat buf; + memcpy(buf.byte, bytes, 4); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + symTransform_4bytes(buf.byte); + return buf.value; +} + +inline void floatToBytes(unsigned char *b, float num) +{ + lfloat buf; + buf.value = num; + memcpy(b, buf.byte, 4); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + symTransform_4bytes(b); +} + +//the byte to input is in the big-endian format +inline double bytesToDouble(unsigned char* bytes) +{ + ldouble buf; + memcpy(buf.byte, bytes, 8); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + symTransform_8bytes(buf.byte); + return buf.value; +} + +inline void doubleToBytes(unsigned char *b, double num) +{ + ldouble buf; + buf.value = num; + memcpy(b, buf.byte, 8); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + symTransform_8bytes(b); +} + + +inline int getMaskRightCode(int m) { + switch (m) { + case 1: + return 0x01; + case 2: + return 0x03; + case 3: + return 0x07; + case 4: + return 0x0F; + case 5: + return 0x1F; + case 6: + return 0x3F; + case 7: + return 0X7F; + case 8: + return 0XFF; + default: + return 0; + } +} + +inline int getLeftMovingCode(int kMod8) +{ + return getMaskRightCode(8 - kMod8); +} + +inline int getRightMovingSteps(int kMod8, int resiBitLength) { + return 8 - kMod8 - resiBitLength; +} + +inline int getRightMovingCode(int kMod8, int resiBitLength) +{ + int rightMovingSteps = 8 - kMod8 - resiBitLength; + if(rightMovingSteps < 0) + { + switch(-rightMovingSteps) + { + case 1: + return 0x80; + case 2: + return 0xC0; + case 3: + return 0xE0; + case 4: + return 0xF0; + case 5: + return 0xF8; + case 6: + return 0xFC; + case 7: + return 0XFE; + default: + return 0; + } + } + else //if(rightMovingSteps >= 0) + { + int a = getMaskRightCode(8 - kMod8); + int b = getMaskRightCode(8 - kMod8 - resiBitLength); + int c = a - b; + return c; + } +} + +short* convertByteDataToShortArray(unsigned char* bytes, size_t byteLength) +{ + lint16 ls; + size_t i, stateLength = byteLength/2; + short* states = (short*)malloc(stateLength*sizeof(short)); + if(sysEndianType==dataEndianType) + { + for(i=0;i +#include +#include "szx.h" + +size_t convertIntArray2ByteArray_fast_1b_args(unsigned char* intArray, size_t intArrayLength, unsigned char *result) +{ + size_t byteLength = 0; + size_t i, j; + if(intArrayLength%8==0) + byteLength = intArrayLength/8; + else + byteLength = intArrayLength/8+1; + + size_t n = 0; + int tmp, type; + for(i = 0;i0) + *result = (unsigned char*)malloc(byteLength*sizeof(unsigned char)); + else + *result = NULL; + size_t n = 0; + int tmp, type; + for(i = 0;i> 7; + intArray[n++] = (tmp & 0x40) >> 6; + intArray[n++] = (tmp & 0x20) >> 5; + intArray[n++] = (tmp & 0x10) >> 4; + intArray[n++] = (tmp & 0x08) >> 3; + intArray[n++] = (tmp & 0x04) >> 2; + intArray[n++] = (tmp & 0x02) >> 1; + intArray[n++] = (tmp & 0x01) >> 0; + } + + tmp = byteArray[i]; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x80) >> 7; + if(n == intArrayLength) + return; + intArray[n++] 
= (tmp & 0x40) >> 6; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x20) >> 5; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x10) >> 4; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x08) >> 3; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x04) >> 2; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x02) >> 1; + if(n == intArrayLength) + return; + intArray[n++] = (tmp & 0x01) >> 0; +} + +void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray) +{ + if(intArrayLength > byteArrayLength*8) + { + printf("Error: intArrayLength > byteArrayLength*8\n"); + printf("intArrayLength=%zu, byteArrayLength = %zu", intArrayLength, byteArrayLength); + exit(0); + } + if(intArrayLength>0) + *intArray = (unsigned char*)malloc(intArrayLength*sizeof(unsigned char)); + else + *intArray = NULL; + + size_t n = 0, i; + int tmp; + for (i = 0; i < byteArrayLength-1; i++) + { + tmp = byteArray[i]; + (*intArray)[n++] = (tmp & 0x80) >> 7; + (*intArray)[n++] = (tmp & 0x40) >> 6; + (*intArray)[n++] = (tmp & 0x20) >> 5; + (*intArray)[n++] = (tmp & 0x10) >> 4; + (*intArray)[n++] = (tmp & 0x08) >> 3; + (*intArray)[n++] = (tmp & 0x04) >> 2; + (*intArray)[n++] = (tmp & 0x02) >> 1; + (*intArray)[n++] = (tmp & 0x01) >> 0; + } + + tmp = byteArray[i]; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x80) >> 7; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x40) >> 6; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x20) >> 5; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x10) >> 4; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x08) >> 3; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x04) >> 2; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x02) >> 1; + if(n == intArrayLength) + return; + (*intArray)[n++] = (tmp & 0x01) >> 0; +} + + +inline size_t convertIntArray2ByteArray_fast_2b_args(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result) +{ + register unsigned char tmp = 0; + size_t i, j = 0, byteLength = 0; + if(timeStepTypeLength%4==0) + byteLength = timeStepTypeLength*2/8; + else + byteLength = timeStepTypeLength*2/8+1; + size_t n = 0; + if(timeStepTypeLength%4==0) + { + for(i = 0;i[01|10|11|00][....] 
+ * @param timeStepType + * @return + */ +size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result) +{ + size_t i, j, byteLength = 0; + if(timeStepTypeLength%4==0) + byteLength = timeStepTypeLength*2/8; + else + byteLength = timeStepTypeLength*2/8+1; + if(byteLength>0) + *result = (unsigned char*)malloc(byteLength*sizeof(unsigned char)); + else + *result = NULL; + size_t n = 0; + for(i = 0;i byteArrayLength*4) + { + printf("Error: stepLength > byteArray.length*4\n"); + printf("stepLength=%zu, byteArray.length=%zu\n", stepLength, byteArrayLength); + exit(0); + } + if(stepLength>0) + *intArray = (unsigned char*)malloc(stepLength*sizeof(unsigned char)); + else + *intArray = NULL; + size_t i, n = 0; + + int mod4 = stepLength%4; + if(mod4==0) + { + for (i = 0; i < byteArrayLength; i++) { + unsigned char tmp = byteArray[i]; + (*intArray)[n++] = (tmp & 0xC0) >> 6; + (*intArray)[n++] = (tmp & 0x30) >> 4; + (*intArray)[n++] = (tmp & 0x0C) >> 2; + (*intArray)[n++] = tmp & 0x03; + } + } + else + { + size_t t = byteArrayLength - mod4; + for (i = 0; i < t; i++) { + unsigned char tmp = byteArray[i]; + (*intArray)[n++] = (tmp & 0xC0) >> 6; + (*intArray)[n++] = (tmp & 0x30) >> 4; + (*intArray)[n++] = (tmp & 0x0C) >> 2; + (*intArray)[n++] = tmp & 0x03; + } + unsigned char tmp = byteArray[i]; + switch(mod4) + { + case 1: + (*intArray)[n++] = (tmp & 0xC0) >> 6; + break; + case 2: + (*intArray)[n++] = (tmp & 0xC0) >> 6; + (*intArray)[n++] = (tmp & 0x30) >> 4; + break; + case 3: + (*intArray)[n++] = (tmp & 0xC0) >> 6; + (*intArray)[n++] = (tmp & 0x30) >> 4; + (*intArray)[n++] = (tmp & 0x0C) >> 2; + break; + } + } +} + + +inline int getLeftMovingSteps(size_t k, unsigned char resiBitLength) +{ + return 8 - k%8 - resiBitLength; +} + + diff --git a/qtensor/compression/szx/src/szx_dataCompression.c b/qtensor/compression/szx/src/szx_dataCompression.c new file mode 100644 index 00000000..d5130a93 --- /dev/null +++ b/qtensor/compression/szx/src/szx_dataCompression.c @@ -0,0 +1,355 @@ +/** + * @file double_compression.c + * @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang + * @date April, 2016 + * @brief Compression Technique for double array + * (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#include +#include +#include +#include +#include "szx.h" +#include "szx_dataCompression.h" +#include "szx_BytesToolkit.h" + +int computeByteSizePerIntValue(long valueRangeSize) +{ + if(valueRangeSize<=256) + return 1; + else if(valueRangeSize<=65536) + return 2; + else if(valueRangeSize<=4294967296) //2^32 + return 4; + else + return 8; +} + +long computeRangeSize_int(void* oriData, int dataType, size_t size, int64_t* valueRangeSize) +{ + size_t i = 0; + long max = 0, min = 0; + + if(dataType==SZ_UINT8) + { + unsigned char* data = (unsigned char*)oriData; + unsigned char data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_INT8) + { + char* data = (char*)oriData; + char data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_UINT16) + { + unsigned short* data = (unsigned short*)oriData; + unsigned short data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_INT16) + { + short* data = (short*)oriData; + short data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_UINT32) + { + unsigned int* data = (unsigned int*)oriData; + unsigned int data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_INT32) + { + int* data = (int*)oriData; + int data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_UINT64) + { + unsigned long* data = (unsigned long*)oriData; + unsigned long data_; + min = data[0], max = min; + computeMinMax(data); + } + else if(dataType == SZ_INT64) + { + long* data = (long *)oriData; + long data_; + min = data[0], max = min; + computeMinMax(data); + } + + *valueRangeSize = max - min; + return min; +} + +float computeRangeSize_float(float* oriData, size_t size, float* valueRangeSize, float* medianValue) +{ + size_t i = 0; + float min = oriData[0]; + float max = min; + for(i=1;idata) + min = data; + else if(maxdata) + min = data; + else if(maxb) + return a; + else + return b; +} + +float min_f(float a, float b) +{ + if(ab) + return a; + else + return b; +} + +double getRealPrecision_double(double valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status) +{ + int state = SZ_SCES; + double precision = 0; + if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL) + precision = absErrBound; + else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL) + precision = relBoundRatio*valueRangeSize; + else if(errBoundMode==ABS_AND_REL) + precision = min_d(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==ABS_OR_REL) + precision = max_d(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==PW_REL) + precision = 0; + else + { + printf("Error: error-bound-mode is incorrect!\n"); + state = SZ_BERR; + } + *status = state; + return precision; +} + +double getRealPrecision_float(float valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status) +{ + int state = SZ_SCES; + double precision = 0; + if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL) + precision = absErrBound; + else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL) + precision = relBoundRatio*valueRangeSize; + else if(errBoundMode==ABS_AND_REL) + precision = min_f(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==ABS_OR_REL) + precision = max_f(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==PW_REL) + 
precision = 0; + else + { + printf("Error: error-bound-mode is incorrect!\n"); + state = SZ_BERR; + } + *status = state; + return precision; +} + +double getRealPrecision_int(long valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status) +{ + int state = SZ_SCES; + double precision = 0; + if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL) + precision = absErrBound; + else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL) + precision = relBoundRatio*valueRangeSize; + else if(errBoundMode==ABS_AND_REL) + precision = min_f(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==ABS_OR_REL) + precision = max_f(absErrBound, relBoundRatio*valueRangeSize); + else if(errBoundMode==PW_REL) + precision = -1; + else + { + printf("Error: error-bound-mode is incorrect!\n"); + state = SZ_BERR; + } + *status = state; + return precision; +} + +inline void symTransform_8bytes(unsigned char data[8]) +{ + unsigned char tmp = data[0]; + data[0] = data[7]; + data[7] = tmp; + + tmp = data[1]; + data[1] = data[6]; + data[6] = tmp; + + tmp = data[2]; + data[2] = data[5]; + data[5] = tmp; + + tmp = data[3]; + data[3] = data[4]; + data[4] = tmp; +} + +inline void symTransform_2bytes(unsigned char data[2]) +{ + unsigned char tmp = data[0]; + data[0] = data[1]; + data[1] = tmp; +} + +inline void symTransform_4bytes(unsigned char data[4]) +{ + unsigned char tmp = data[0]; + data[0] = data[3]; + data[3] = tmp; + + tmp = data[1]; + data[1] = data[2]; + data[2] = tmp; +} + +inline void compressInt8Value(int8_t tgtValue, int8_t minValue, int byteSize, unsigned char* bytes) +{ + uint8_t data = tgtValue - minValue; + memcpy(bytes, &data, byteSize); //byteSize==1 +} + +inline void compressInt16Value(int16_t tgtValue, int16_t minValue, int byteSize, unsigned char* bytes) +{ + uint16_t data = tgtValue - minValue; + unsigned char tmpBytes[2]; + int16ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 2 - byteSize, byteSize); +} + +inline void compressInt32Value(int32_t tgtValue, int32_t minValue, int byteSize, unsigned char* bytes) +{ + uint32_t data = tgtValue - minValue; + unsigned char tmpBytes[4]; + int32ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 4 - byteSize, byteSize); +} + +inline void compressInt64Value(int64_t tgtValue, int64_t minValue, int byteSize, unsigned char* bytes) +{ + uint64_t data = tgtValue - minValue; + unsigned char tmpBytes[8]; + int64ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 8 - byteSize, byteSize); +} + +inline void compressUInt8Value(uint8_t tgtValue, uint8_t minValue, int byteSize, unsigned char* bytes) +{ + uint8_t data = tgtValue - minValue; + memcpy(bytes, &data, byteSize); //byteSize==1 +} + +inline void compressUInt16Value(uint16_t tgtValue, uint16_t minValue, int byteSize, unsigned char* bytes) +{ + uint16_t data = tgtValue - minValue; + unsigned char tmpBytes[2]; + int16ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 2 - byteSize, byteSize); +} + +inline void compressUInt32Value(uint32_t tgtValue, uint32_t minValue, int byteSize, unsigned char* bytes) +{ + uint32_t data = tgtValue - minValue; + unsigned char tmpBytes[4]; + int32ToBytes_bigEndian(tmpBytes, data); + memcpy(bytes, tmpBytes + 4 - byteSize, byteSize); +} + +inline void compressUInt64Value(uint64_t tgtValue, uint64_t minValue, int byteSize, unsigned char* bytes) +{ + uint64_t data = tgtValue - minValue; + unsigned char tmpBytes[8]; + int64ToBytes_bigEndian(tmpBytes, 
data); + memcpy(bytes, tmpBytes + 8 - byteSize, byteSize); +} + +int compIdenticalLeadingBytesCount_double(unsigned char* preBytes, unsigned char* curBytes) +{ + int i, n = 0; + for(i=0;i<8;i++) + if(preBytes[i]==curBytes[i]) + n++; + else + break; + if(n>3) n = 3; + return n; +} + + +inline int compIdenticalLeadingBytesCount_float(unsigned char* preBytes, unsigned char* curBytes) +{ + int i, n = 0; + for(i=0;i<4;i++) + if(preBytes[i]==curBytes[i]) + n++; + else + break; + if(n>3) n = 3; + return n; +} diff --git a/qtensor/compression/szx/src/szx_double.c b/qtensor/compression/szx/src/szx_double.c new file mode 100644 index 00000000..34bd2b4d --- /dev/null +++ b/qtensor/compression/szx/src/szx_double.c @@ -0,0 +1,1388 @@ +/** + * @file szx_double.c + * @author Sheng Di, Kai Zhao + * @date Aug, 2022 + * @brief SZ_Init, Compression and Decompression functions + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + + +#include +#include +#include +#include +#include +#include +#include "szx.h" +#include "szx_double.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include + +#ifdef _OPENMP +#include "omp.h" +#endif + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +#include +#endif + +inline void SZ_fast_compress_args_unpredictable_one_block_double(double *oriData, size_t nbEle, float absErrBound, + unsigned char *outputBytes, int *outSize, + unsigned char *leadNumberArray_int, float mValue, + float radius) { + double medianValue = mValue; + size_t totalSize = 0, i = 0; + + int reqLength; + + //compute median, value range, and radius + + short radExpo = getExponent_float(radius); + computeReqLength_double(absErrBound, radExpo, &reqLength, &mValue); + + int reqBytesLength = reqLength / 8; + int resiBitsLength = reqLength % 8; + int rightShiftBits = 0; + + size_t leadNumberArray_size = nbEle % 4 == 0 ? 
nbEle / 4 : nbEle / 4 + 1; + + register ldouble lfBuf_pre; + register ldouble lfBuf_cur; + lfBuf_pre.lvalue = 0; + + unsigned char *leadNumberArray = outputBytes + 1 + sizeof(float); + + unsigned char *exactMidbyteArray = leadNumberArray + leadNumberArray_size; + + if (resiBitsLength != 0) { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength++; + } + + register unsigned char leadingNum = 0; + size_t residualMidBytes_size = 0; + if (sysEndianType == LITTLE_ENDIAN_SYSTEM) { + + if (reqBytesLength == 3) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[7]; + residualMidBytes_size += 3; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[6]; + residualMidBytes_size += 2; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 2) { + for (i = 0; i < nbEle; i++) { + + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[7]; + residualMidBytes_size += 2; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[6]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 1) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[7]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + }else if(reqBytesLength == 4) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + 
exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[7]; + residualMidBytes_size += 4; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[6]; + residualMidBytes_size += 3; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + residualMidBytes_size += 2; + } else //leadingNum == 3 + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if (reqBytesLength == 5) + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[7]; + residualMidBytes_size += 5; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[6]; + residualMidBytes_size += 4; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + residualMidBytes_size += 3; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + residualMidBytes_size += 2; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 6) + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[7]; + residualMidBytes_size += 6; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + 
exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[6]; + residualMidBytes_size += 5; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + residualMidBytes_size += 4; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + residualMidBytes_size += 3; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 7) + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[7]; + residualMidBytes_size += 7; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[6]; + residualMidBytes_size += 6; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + residualMidBytes_size += 5; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + residualMidBytes_size += 4; + } + + lfBuf_pre = lfBuf_cur; + } + } + else //reqLength == 8 + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) 
+ leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 7] = lfBuf_cur.byte[7]; + residualMidBytes_size += 8; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[6]; + residualMidBytes_size += 7; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + residualMidBytes_size += 6; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + residualMidBytes_size += 5; + } + + lfBuf_pre = lfBuf_cur; + } + } + + convertIntArray2ByteArray_fast_2b_args(leadNumberArray_int, nbEle, leadNumberArray); + int k = 0; + + unsigned char reqLengthB = (unsigned char) reqLength; + outputBytes[k] = reqLengthB; + k++; + floatToBytes(&(outputBytes[k]), mValue); + k += sizeof(float); + //sizeToBytes(&(outputBytes[k]), leadNumberArray_size); + //outputBytes[k] = leadNumberArray_size; //leadNumberArray_size can be calculated based on block size (=blockSize/4) + + totalSize = 1 + sizeof(float) + leadNumberArray_size + residualMidBytes_size; + } else { + + } + + *outSize = totalSize; + +} + +size_t computeStateMedianRadius_double(double *oriData, size_t nbEle, float absErrBound, int blockSize, + unsigned char *stateArray, float *medianArray, float *radiusArray) { + size_t nbConstantBlocks = 0; + size_t i = 0, j = 0; + size_t nbBlocks = nbEle / blockSize; + size_t offset = 0; + + for (i = 0; i < nbBlocks; i++) { + double min = oriData[offset]; + double max = oriData[offset]; + for (j = 1; j < blockSize; j++) { + double v = oriData[offset + j]; + if (min > v) + min = v; + else if (max < v) + max = v; + } + double valueRange = max - min; + double radius = valueRange / 2; + double medianValue = min + radius; + + if (radius <= absErrBound) { + stateArray[i] = 0; + nbConstantBlocks++; + } else + stateArray[i] = 1; + + stateArray[i] = radius <= absErrBound ? 
0 : 1; + medianArray[i] = (float)medianValue; + radiusArray[i] = (float)radius; + offset += blockSize; + } + + int remainCount = nbEle % blockSize; + if (remainCount != 0) { + double min = oriData[offset]; + double max = oriData[offset]; + for (j = 1; j < remainCount; j++) { + double v = oriData[offset + j]; + if (min > v) + min = v; + else if (max < v) + max = v; + } + double valueRange = max - min; + double radius = valueRange / 2; + double medianValue = min + radius; + if (radius <= absErrBound) { + stateArray[i] = 0; + nbConstantBlocks++; + } else + stateArray[i] = 1; + medianArray[i] = (float)medianValue; + radiusArray[i] = (float)radius; + } + return nbConstantBlocks; +} + + +void max_min_double(double *x, int n, double *tmp_max, double *tmp_min) { + for (size_t i = 0; i < n; i++) { + if (x[i] > *tmp_max) { + *tmp_max = x[i]; + } + if (x[i] < *tmp_min) { + *tmp_min = x[i]; + } + } +} + +void simd_max_min_double(double *x, int n, double *tmp_max, double *tmp_min) { + *tmp_max = x[0]; + *tmp_min = x[0]; +#ifdef __AVX512F__ + // printf("use avx512, n=%d \n", n); + int n16 = n & -16, i = 0, j=0; + if (n > 16) { + double *ptr_x = x; + __m512 max1 = _mm512_loadu_ps(ptr_x); +// __m512 max2 = _mm512_loadu_ps(ptr_x + 16); + __m512 min1 = max1; +// __m512 min2 = max2; + __m512 tmp1; +// __m512 tmp2; + for (; i < n16; i += 16) { + tmp1 = _mm512_loadu_ps(ptr_x); + max1 = _mm512_max_ps(tmp1, max1); + min1 = _mm512_min_ps(tmp1, min1); +// tmp2 = _mm512_loadu_ps(ptr_x+16); +// max2 = _mm512_max_ps(tmp2, max2); +// min2 = _mm512_min_ps(tmp2, min2); + ptr_x += 16; + } +// max1 = _mm512_max_ps(max1, max2); +// min1 = _mm512_min_ps(min1, min2); + __m256 max256 = _mm256_max_ps(_mm512_extractf32x8_ps(max1,0), _mm512_extractf32x8_ps(max1,1)); + __m128 max128 = _mm_max_ps(_mm256_extractf128_ps(max256,0), _mm256_extractf128_ps(max256,1)); + __m256 min256 = _mm256_min_ps(_mm512_extractf32x8_ps(min1,0), _mm512_extractf32x8_ps(min1,1)); + __m128 min128 = _mm_min_ps(_mm256_extractf128_ps(min256,0), _mm256_extractf128_ps(min256,1)); + for (j=0;j<4;j++){ + *tmp_max = *tmp_max < max128[j] ? max128[j] : *tmp_max; + *tmp_min = *tmp_min > min128[j] ? min128[j] : *tmp_min; + } + + if ( i < n ) { + max_min_double(ptr_x, n - i, tmp_max, tmp_min); + } + } else { + max_min_double(x, n, tmp_max, tmp_min); + } +#elif __AVX2__ +// printf("use avx2, n=%d \n", n); + // fflush(stdout); + int n16 = n & -16, i = 0; + if (n > 16) { + double *ptr_x = x; + __m256 max1 = _mm256_loadu_ps(ptr_x); + __m256 max2 = _mm256_loadu_ps(ptr_x + 8); + __m256 min1 = max1; + __m256 min2 = max2; + for (; i < n16; i += 16) { + max1 = _mm256_max_ps(_mm256_loadu_ps(ptr_x), max1); + min1 = _mm256_min_ps(_mm256_loadu_ps(ptr_x), min1); + max2 = _mm256_max_ps(_mm256_loadu_ps(ptr_x + 8), max2); + min2 = _mm256_min_ps(_mm256_loadu_ps(ptr_x + 8), min2); + ptr_x += 16; + } +// printf("%d %d %d\n", n, n16, i); +// exit(0); + max1 = _mm256_max_ps(max1, max2); + min1 = _mm256_min_ps(min1, min2); + for (int j = 0; j < 8; j++) { + *tmp_max = *tmp_max < max1[j] ? max1[j] : *tmp_max; + *tmp_min = *tmp_min > min1[j] ? 
min1[j] : *tmp_min; + } + if ( i < n ) { + max_min_double(ptr_x, n - i, tmp_max, tmp_min); + } + } else { + max_min_double(x, n, tmp_max, tmp_min); + } +#else + max_min_double(x, n, tmp_max, tmp_min); +#endif +} + +void computeStateMedianRadius_double2(double *oriData, size_t nbEle, float absErrBound, + unsigned char *state, float *median, float *radius) { + double min = oriData[0]; + double max = oriData[0]; + simd_max_min_double(oriData, nbEle, &max, &min); + + double valueRange = max - min; + *radius = valueRange / 2; + *median = min + *radius; + + if (*radius <= absErrBound) { + *state = 0; + } else { + *state = 1; + } +} + + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_double(double *oriData, size_t *outSize, float absErrBound, size_t nbEle, + int blockSize) { + double *op = oriData; + + *outSize = 0; + size_t maxPreservedBufferSize = + sizeof(double) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int)); + + size_t i = 0; + int oSize = 0; + + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t stateNBBytes = + remainCount == 0 ? (nbBlocks % 8 == 0 ? nbBlocks / 8 : nbBlocks / 8 + 1) : ((nbBlocks + 1) % 8 == 0 ? + (nbBlocks + 1) / 8 : + (nbBlocks + 1) / 8 + 1); + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + float *radiusArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbConstantBlocks = computeStateMedianRadius_double(oriData, nbEle, absErrBound, blockSize, stateArray, + medianArray, radiusArray); + + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 0; // indicates this is not a random access version + r[4] = (unsigned char) blockSize; + r = r + 5; //1 byte + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); //r is the starting address of 'stateNBBytes' + + unsigned char *p = r + stateNBBytes; //p is the starting address of constant median values. + unsigned char *q = + p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + //3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... 
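+    // Added layout sketch (commentary only, not in the upstream SZx source).
+    // For this non-random-access blocked format the buffer prepared above is:
+    //   bytes 0-2 : version info (SZx_VER_MAJOR, SZx_VER_MINOR, mode flag = 1)
+    //   byte  3   : 0, i.e. no random-access support
+    //   byte  4   : block size, stored in a single byte
+    //   next sizeof(size_t) bytes : nbConstantBlocks
+    //   stateNBBytes              : packed 1-bit constant/non-constant flags
+    //   nbConstantBlocks floats   : median value of each constant block
+    //   remainder                 : variable-length payloads of the non-constant
+    //                               blocks, appended by the loop below via q += oSize.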
+ *outSize += (3 + 1 + 1 + sizeof(size_t) + stateNBBytes + sizeof(float) * nbConstantBlocks); + + //printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f*(nbConstantBlocks*blockSize)/nbEle); + for (i = 0; i < nbBlocks; i++, op += blockSize) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op, blockSize, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + q += oSize; + *outSize += oSize; + } else { + floatToBytes(p, medianArray[i]); + p += sizeof(float); + } + } + + if (remainCount != 0) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op, remainCount, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + *outSize += oSize; + } else { + floatToBytes(p, medianArray[i]); + } + + } + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, r); + + free(stateArray); + free(medianArray); + free(radiusArray); + free(leadNumberArray_int); + + return outputBytes; +} + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_double_openmp(double *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) { +#ifdef _OPENMP + printf("use openmp\n"); + +#ifdef __AVX512F__ + printf("use avx512\n"); +#elif __AVX2__ + printf("use avx2\n"); +#else +#endif + printf("blockSize = %d\n",blockSize); + sz_cost_start(); + double *op = oriData; + + size_t i = 0; + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + size_t stateNBBytes = (actualNBBlocks % 8 == 0 ? actualNBBlocks / 8 : actualNBBlocks / 8 + 1); + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbNonConstantBlocks = 0; + + unsigned char *tmp_q = (unsigned char *) malloc(blockSize * sizeof(double) * actualNBBlocks); + int *outSizes = (int *) malloc(actualNBBlocks * sizeof(int)); + size_t *outSizesAccumlate = (size_t *) malloc(actualNBBlocks * sizeof(size_t)); + int *nbNonConstantBlockAccumlate = (int *) malloc(actualNBBlocks * sizeof(int)); + + (*outSize) = 0; + size_t maxPreservedBufferSize = + sizeof(double) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 1; //support random access decompression + r = r + 4; //4 byte + + int nbThreads = 1; + unsigned char *leadNumberArray_int; + size_t z0[200],z1[200]; + + size_t nbConstantBlocks; + unsigned char *R, *p, *q; + float *pf; + uint16_t *O; + +#pragma omp parallel +{ +#pragma omp single +{ + nbThreads = omp_get_num_threads(); + //printf("nbThreads = %d\n", nbThreads); + assert(nbThreads<200); + leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int) * nbThreads); + + //sz_cost_end_msg("sequential-1 malloc"); + //sz_cost_start(); +} +#pragma omp for reduction(+:nbNonConstantBlocks) schedule(static) + for (i = 0; i < nbBlocks; i++) { + float radius; + computeStateMedianRadius_double2(op + i * blockSize, blockSize, absErrBound, stateArray + i, medianArray + i, + &radius); + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op + i * blockSize, blockSize, absErrBound, + tmp_q + i * 
blockSize * sizeof(float), outSizes + i, + leadNumberArray_int + + omp_get_thread_num() * blockSize * sizeof(int), + medianArray[i], radius); + outSizesAccumlate[i]=outSizes[i]; + nbNonConstantBlocks += 1; + }else{ + outSizes[i]=0; + outSizesAccumlate[i]=0; + } + } +#pragma omp single +{ +// sz_cost_end_msg("parallel-1 compress"); +// exit(0); + if (remainCount != 0) { + i = nbBlocks; + float radius; + computeStateMedianRadius_double2(op + i * blockSize, remainCount, absErrBound, stateArray + i, medianArray + i, + &radius); + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op + i * blockSize, remainCount, absErrBound, + tmp_q + i * blockSize * sizeof(float), outSizes + i, + leadNumberArray_int, medianArray[i], radius); + outSizesAccumlate[i] = outSizes[i]; + nbNonConstantBlocks += 1; + }else{ + outSizesAccumlate[i] = 0; + outSizes[i]=0; + } + } + + nbConstantBlocks = actualNBBlocks - nbNonConstantBlocks; + + sizeToBytes(r, blockSize); + r += sizeof(size_t); + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); + O = (uint16_t*) r; //o is the starting address of 'block-size array' + R = r + nbNonConstantBlocks * sizeof(uint16_t); //R is the starting address of the state array + p = R + stateNBBytes; //p is the starting address of constant median values. + pf = (float *) p; + q = p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + // unsigned char *q0 = q; + // printf("%lu %lu %lu %lu\n",r-outputBytes, R-outputBytes, p-outputBytes, q-outputBytes); + // 3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... + *outSize = q - outputBytes; + +// sz_cost_start(); + +} + int tid = omp_get_thread_num(); + int lo = tid * actualNBBlocks / nbThreads; + int hi = (tid + 1) * actualNBBlocks / nbThreads; + int b; + nbNonConstantBlockAccumlate[lo]=stateArray[lo]; + for (b = lo+1; b < hi; b++){ + outSizesAccumlate[b] = outSizesAccumlate[b] + outSizesAccumlate[b-1]; + } + for (b = lo+1; b < hi; b++){ + nbNonConstantBlockAccumlate[b]=stateArray[b]+nbNonConstantBlockAccumlate[b-1]; + } + z0[tid] = outSizesAccumlate[hi-1]; + z1[tid] = nbNonConstantBlockAccumlate[hi-1]; + size_t offset0=0, offset1=0; +#pragma omp barrier + for (int j = 0; j < tid; j++) { + offset0+=z0[j]; + offset1+=z1[j]; + } + for (b = lo; b < hi; b++){ + outSizesAccumlate[b] = outSizesAccumlate[b] + offset0; + nbNonConstantBlockAccumlate[b] = nbNonConstantBlockAccumlate[b] + offset1; + } +#pragma omp single +{ +// sz_cost_end_msg("parallel-2 prefix sum"); +// sz_cost_start(); +}; +#pragma omp for schedule(static) + for (i = 0; i < actualNBBlocks; i++) { + if (stateArray[i]) { + memcpy(q+outSizesAccumlate[i]-outSizes[i], tmp_q + i * blockSize * sizeof(float), outSizes[i]); + O[nbNonConstantBlockAccumlate[i]-1]=outSizes[i]; + } else { + pf[i-nbNonConstantBlockAccumlate[i]]=medianArray[i]; + } + } +#pragma omp single +{ +// sz_cost_end_msg("parallel-3 memcpy"); +// sz_cost_start(); + + *outSize += outSizesAccumlate[actualNBBlocks-1]; + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, R); +// sz_cost_end_msg("sequential-2 int2byte"); +// sz_cost_start(); + free(nbNonConstantBlockAccumlate); + free(outSizesAccumlate); + free(leadNumberArray_int); + free(tmp_q); + free(medianArray); + free(stateArray); + free(outSizes); +// sz_cost_end_msg("sequential-3 free"); +// printf("blocksize = %d, actualNBBlocks = %lu\n", blockSize, actualNBBlocks); +// printf("nbConstantBlocks = %zu, percent = %f\n", 
nbConstantBlocks, 1.0f * (nbConstantBlocks * blockSize) / nbEle); +// printf("CR = %.3f, nbEle = %lu \n", nbEle*4.0/(*outSize), nbEle); +} +} + return outputBytes; +#else + return NULL; +#endif +} + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_double(double *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) { + double *op = oriData; + + *outSize = 0; + size_t maxPreservedBufferSize = + sizeof(double) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int)); + + size_t i = 0; + int oSize = 0; + + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + + size_t stateNBBytes = (actualNBBlocks % 8 == 0 ? actualNBBlocks / 8 : actualNBBlocks / 8 + 1); + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + float *radiusArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbConstantBlocks = computeStateMedianRadius_double(oriData, nbEle, absErrBound, blockSize, stateArray, + medianArray, radiusArray); + + size_t nbNonConstantBlocks = actualNBBlocks - nbConstantBlocks; + + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 1; //support random access decompression + r = r + 4; //1 byte + + sizeToBytes(r, blockSize); + r += sizeof(size_t); + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); //r is the starting address of 'block-size array' + uint16_t *O=(uint16_t*)r; + unsigned char *R = r + nbNonConstantBlocks*sizeof(uint16_t); //R is the starting address of the state array + unsigned char *p = R + stateNBBytes; //p is the starting address of constant median values. + unsigned char *q = + p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + //3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... 
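+    // Added layout sketch (commentary only, not in the upstream SZx source).
+    // The random-access variant differs from the blocked format above in that
+    // blockSize and nbConstantBlocks are each stored as a full size_t, and a
+    // uint16_t table O[] records the compressed size of every non-constant block,
+    // so a block's payload can be located by summing entries of O without
+    // decoding its predecessors:
+    //   4 bytes                 : version info + random-access flag (r[3] = 1)
+    //   2 * sizeof(size_t)      : blockSize, nbConstantBlocks
+    //   O[]                     : nbNonConstantBlocks * sizeof(uint16_t) block sizes
+    //   stateNBBytes            : packed 1-bit constant/non-constant flags
+    //   nbConstantBlocks floats : constant-block medians
+    //   remainder               : non-constant block payloads.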
+ *outSize = q-outputBytes; + + size_t nonConstantBlockID = 0; + //printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f*(nbConstantBlocks*blockSize)/nbEle); + for (i = 0; i < nbBlocks; i++, op += blockSize) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op, blockSize, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + q += oSize; + *outSize += oSize; + O[nonConstantBlockID++] = oSize; + } else { + floatToBytes(p, medianArray[i]); + p += sizeof(float); + } + } + + if (remainCount != 0) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_double(op, remainCount, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + *outSize += oSize; + O[nonConstantBlockID] = oSize; + } else { + floatToBytes(p, medianArray[i]); + } + + } + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, R); + + free(leadNumberArray_int); + + return outputBytes; +} + + +unsigned char * +SZ_fast_compress_args_unpredictable_double(double *data, size_t *outSize, float absErrBound, size_t r5, size_t r4, + size_t r3, size_t r2, size_t r1, float mValue, float radius) { + size_t totalSize = 0; + double medianValue = mValue; + + size_t dataLength = computeDataLength(r5, r4, r3, r2, r1); + + size_t maxPreservedBufferSize = + sizeof(double) * dataLength; //assume that the compressed data size would not exceed the original size + + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; //SZx_VER_SUPERFAST + r[3] = 0; //support random access decompression + +// sz_cost_start(); + size_t i; + int reqLength; + short radExpo = getExponent_float(radius); + + computeReqLength_double(absErrBound, radExpo, &reqLength, &mValue); + + int reqBytesLength = reqLength / 8; + int resiBitsLength = reqLength % 8; + int rightShiftBits = 0; + + size_t leadNumberArray_size = dataLength % 4 == 0 ? 
dataLength / 4 : dataLength / 4 + 1; + + register ldouble lfBuf_pre; + register ldouble lfBuf_cur; + lfBuf_pre.lvalue = 0; + + unsigned char *leadNumberArray = outputBytes + 4 + 1 + sizeof(float) + sizeof(size_t); + + unsigned char *exactMidbyteArray = leadNumberArray + leadNumberArray_size; + + if (resiBitsLength != 0) { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength++; + } + + register unsigned char leadingNum = 0; + + unsigned char *leadNumberArray_int = (unsigned char *) malloc(dataLength); + + size_t residualMidBytes_size = 0; + if (sysEndianType == LITTLE_ENDIAN_SYSTEM) { + if (reqBytesLength == 3) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[7]; + residualMidBytes_size += 3; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[6]; + residualMidBytes_size += 2; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[5]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 2) { + for (i = 0; i < dataLength; i++) { + + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[7]; + residualMidBytes_size += 2; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[6]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 1) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[7]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + }else if(reqBytesLength == 4) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = 
leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[7]; + residualMidBytes_size += 4; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[6]; + residualMidBytes_size += 3; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[5]; + residualMidBytes_size += 2; + } else //leadingNum == 3 + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[4]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if (reqBytesLength == 5) + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[7]; + residualMidBytes_size += 5; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[6]; + residualMidBytes_size += 4; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[5]; + residualMidBytes_size += 3; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[4]; + residualMidBytes_size += 2; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 6) + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[7]; + 
residualMidBytes_size += 6; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[6]; + residualMidBytes_size += 5; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[5]; + residualMidBytes_size += 4; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[4]; + residualMidBytes_size += 3; + } + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 7) + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if (lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[7]; + residualMidBytes_size += 7; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[6]; + residualMidBytes_size += 6; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[5]; + residualMidBytes_size += 5; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[4]; + residualMidBytes_size += 4; + } + + lfBuf_pre = lfBuf_cur; + } + } + else //reqLength == 8 + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre.lvalue = lfBuf_cur.lvalue ^ lfBuf_pre.lvalue; + + if 
(lfBuf_pre.lvalue >> 40 == 0) + leadingNum = 3; + else if (lfBuf_pre.lvalue >> 48 == 0) + leadingNum = 2; + else if (lfBuf_pre.lvalue >> 56 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[6]; + exactMidbyteArray[residualMidBytes_size + 7] = lfBuf_cur.byte[7]; + residualMidBytes_size += 8; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + exactMidbyteArray[residualMidBytes_size + 6] = lfBuf_cur.byte[6]; + residualMidBytes_size += 7; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + exactMidbyteArray[residualMidBytes_size + 5] = lfBuf_cur.byte[5]; + residualMidBytes_size += 6; + } else if (leadingNum == 3) + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + exactMidbyteArray[residualMidBytes_size + 4] = lfBuf_cur.byte[4]; + residualMidBytes_size += 5; + } + + lfBuf_pre = lfBuf_cur; + } + } + + convertIntArray2ByteArray_fast_2b_args(leadNumberArray_int, dataLength, leadNumberArray); + + int k = 4; + + unsigned char reqLengthB = (unsigned char) reqLength; + outputBytes[k] = reqLengthB; + k++; + floatToBytes(&(outputBytes[k]), mValue); + k += sizeof(float); + sizeToBytes(&(outputBytes[k]), leadNumberArray_size); + + totalSize = 4 + 1 + sizeof(float) + sizeof(size_t) + leadNumberArray_size + residualMidBytes_size; + } else { + + } + + *outSize = totalSize; + + free(leadNumberArray_int); +// sz_cost_end(); +// printf("compression time = %f\n", sz_totalCost); + + return outputBytes; +} + +unsigned char *SZ_skip_compress_double(double *data, size_t dataLength, size_t *outSize) { + *outSize = dataLength * sizeof(double); + unsigned char *out = (unsigned char *) malloc(dataLength * sizeof(double)); + memcpy(out, data, dataLength * sizeof(double)); + return out; +} + +inline void computeReqLength_double(float realPrecision, short radExpo, int* reqLength, float* medianValue) +{ + short reqExpo = getPrecisionReqLength_double(realPrecision); + *reqLength = 12+radExpo - reqExpo; //radExpo-reqExpo == reqMantiLength + if(*reqLength<12) + *reqLength = 12; + if(*reqLength>64) + { + *reqLength = 64; + *medianValue = 0; + } +} + diff --git a/qtensor/compression/szx/src/szx_float.c 
b/qtensor/compression/szx/src/szx_float.c new file mode 100644 index 00000000..010c2e4d --- /dev/null +++ b/qtensor/compression/szx/src/szx_float.c @@ -0,0 +1,975 @@ +/** + * @file szx_float.c + * @author Sheng Di, Kai Zhao + * @date Aug, 2022 + * @brief SZ_Init, Compression and Decompression functions + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + + +#include +#include +#include +#include +#include +#include +#include "szx.h" +#include "szx_float.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#include + +#ifdef _OPENMP +#include "omp.h" +#endif + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +#include +#endif + +unsigned char * +SZ_fast_compress_args_with_prediction_float(float *pred, float *data, size_t *outSize, float absErrBound, size_t r5, + size_t r4, size_t r3, size_t r2, size_t r1, float medianValue, + float radius) { + size_t dataLength = computeDataLength(r5, r4, r3, r2, r1); + float *delta = (float *) malloc(sizeof(float) * dataLength); + size_t i = 0; + for (i = 0; i < dataLength; i++) + delta[i] = data[i] - pred[i]; + unsigned char *output = SZ_fast_compress_args_unpredictable_float(delta, outSize, absErrBound, r5, r4, r3, r2, r1, + medianValue, radius); + return output; +} + +inline void SZ_fast_compress_args_unpredictable_one_block_float(float *oriData, size_t nbEle, float absErrBound, + unsigned char *outputBytes, int *outSize, + unsigned char *leadNumberArray_int, float medianValue, + float radius) { + size_t totalSize = 0, i = 0; + + int reqLength; + + //compute median, value range, and radius + + short radExpo = getExponent_float(radius); + computeReqLength_float(absErrBound, radExpo, &reqLength, &medianValue); + + int reqBytesLength = reqLength / 8; + int resiBitsLength = reqLength % 8; + int rightShiftBits = 0; + + size_t leadNumberArray_size = nbEle % 4 == 0 ? 
nbEle / 4 : nbEle / 4 + 1; + + register lfloat lfBuf_pre; + register lfloat lfBuf_cur; + lfBuf_pre.ivalue = 0; + + unsigned char *leadNumberArray = outputBytes + 1 + sizeof(float); + + unsigned char *exactMidbyteArray = leadNumberArray + leadNumberArray_size; + + if (resiBitsLength != 0) { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength++; + } + + register unsigned char leadingNum = 0; + size_t residualMidBytes_size = 0; + if (sysEndianType == LITTLE_ENDIAN_SYSTEM) { + if (reqBytesLength == 2) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + residualMidBytes_size += 2; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 3) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + residualMidBytes_size += 3; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + residualMidBytes_size += 2; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 1) { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else //reqBytesLength == 4 + { + for (i = 0; i < nbEle; i++) { + leadingNum = 0; + lfBuf_cur.value = oriData[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + 
exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + residualMidBytes_size += 4; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + residualMidBytes_size += 3; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + residualMidBytes_size += 2; + } else //leadingNum == 3 + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } + + convertIntArray2ByteArray_fast_2b_args(leadNumberArray_int, nbEle, leadNumberArray); + int k = 0; + + unsigned char reqLengthB = (unsigned char) reqLength; + outputBytes[k] = reqLengthB; + k++; + floatToBytes(&(outputBytes[k]), medianValue); + k += sizeof(float); + //sizeToBytes(&(outputBytes[k]), leadNumberArray_size); + //outputBytes[k] = leadNumberArray_size; //leadNumberArray_size can be calculated based on block size (=blockSize/4) + + totalSize = 1 + sizeof(float) + leadNumberArray_size + residualMidBytes_size; + } else { + + } + + *outSize = totalSize; + +} + +size_t computeStateMedianRadius_float(float *oriData, size_t nbEle, float absErrBound, int blockSize, + unsigned char *stateArray, float *medianArray, float *radiusArray) { + size_t nbConstantBlocks = 0; + size_t i = 0, j = 0; + size_t nbBlocks = nbEle / blockSize; + size_t offset = 0; + + for (i = 0; i < nbBlocks; i++) { + float min = oriData[offset]; + float max = oriData[offset]; + for (j = 1; j < blockSize; j++) { + float v = oriData[offset + j]; + if (min > v) + min = v; + else if (max < v) + max = v; + } + float valueRange = max - min; + float radius = valueRange / 2; + float medianValue = min + radius; + + if (radius <= absErrBound) { + stateArray[i] = 0; + nbConstantBlocks++; + } else + stateArray[i] = 1; + + stateArray[i] = radius <= absErrBound ? 
0 : 1; + medianArray[i] = medianValue; + radiusArray[i] = radius; + offset += blockSize; + } + + int remainCount = nbEle % blockSize; + if (remainCount != 0) { + float min = oriData[offset]; + float max = oriData[offset]; + for (j = 1; j < remainCount; j++) { + float v = oriData[offset + j]; + if (min > v) + min = v; + else if (max < v) + max = v; + } + float valueRange = max - min; + float radius = valueRange / 2; + float medianValue = min + radius; + if (radius <= absErrBound) { + stateArray[i] = 0; + nbConstantBlocks++; + } else + stateArray[i] = 1; + medianArray[i] = medianValue; + radiusArray[i] = radius; + } + return nbConstantBlocks; +} + + +void max_min_float(float *x, int n, float *tmp_max, float *tmp_min) { + for (size_t i = 0; i < n; i++) { + if (x[i] > *tmp_max) { + *tmp_max = x[i]; + } + if (x[i] < *tmp_min) { + *tmp_min = x[i]; + } + } +} + +void simd_max_min_float(float *x, int n, float *tmp_max, float *tmp_min) { + *tmp_max = x[0]; + *tmp_min = x[0]; +#ifdef __AVX512F__ + // printf("use avx512, n=%d \n", n); + int n16 = n & -16, i = 0, j=0; + if (n > 16) { + float *ptr_x = x; + __m512 max1 = _mm512_loadu_ps(ptr_x); +// __m512 max2 = _mm512_loadu_ps(ptr_x + 16); + __m512 min1 = max1; +// __m512 min2 = max2; + __m512 tmp1; +// __m512 tmp2; + for (; i < n16; i += 16) { + tmp1 = _mm512_loadu_ps(ptr_x); + max1 = _mm512_max_ps(tmp1, max1); + min1 = _mm512_min_ps(tmp1, min1); +// tmp2 = _mm512_loadu_ps(ptr_x+16); +// max2 = _mm512_max_ps(tmp2, max2); +// min2 = _mm512_min_ps(tmp2, min2); + ptr_x += 16; + } +// max1 = _mm512_max_ps(max1, max2); +// min1 = _mm512_min_ps(min1, min2); + __m256 max256 = _mm256_max_ps(_mm512_extractf32x8_ps(max1,0), _mm512_extractf32x8_ps(max1,1)); + __m128 max128 = _mm_max_ps(_mm256_extractf128_ps(max256,0), _mm256_extractf128_ps(max256,1)); + __m256 min256 = _mm256_min_ps(_mm512_extractf32x8_ps(min1,0), _mm512_extractf32x8_ps(min1,1)); + __m128 min128 = _mm_min_ps(_mm256_extractf128_ps(min256,0), _mm256_extractf128_ps(min256,1)); + for (j=0;j<4;j++){ + *tmp_max = *tmp_max < max128[j] ? max128[j] : *tmp_max; + *tmp_min = *tmp_min > min128[j] ? min128[j] : *tmp_min; + } + + if ( i < n ) { + max_min_float(ptr_x, n - i, tmp_max, tmp_min); + } + } else { + max_min_float(x, n, tmp_max, tmp_min); + } +#elif __AVX2__ +// printf("use avx2, n=%d \n", n); + // fflush(stdout); + int n16 = n & -16, i = 0; + if (n > 16) { + float *ptr_x = x; + __m256 max1 = _mm256_loadu_ps(ptr_x); + __m256 max2 = _mm256_loadu_ps(ptr_x + 8); + __m256 min1 = max1; + __m256 min2 = max2; + for (; i < n16; i += 16) { + max1 = _mm256_max_ps(_mm256_loadu_ps(ptr_x), max1); + min1 = _mm256_min_ps(_mm256_loadu_ps(ptr_x), min1); + max2 = _mm256_max_ps(_mm256_loadu_ps(ptr_x + 8), max2); + min2 = _mm256_min_ps(_mm256_loadu_ps(ptr_x + 8), min2); + ptr_x += 16; + } +// printf("%d %d %d\n", n, n16, i); +// exit(0); + max1 = _mm256_max_ps(max1, max2); + min1 = _mm256_min_ps(min1, min2); + for (int j = 0; j < 8; j++) { + *tmp_max = *tmp_max < max1[j] ? max1[j] : *tmp_max; + *tmp_min = *tmp_min > min1[j] ? 
min1[j] : *tmp_min; + } + if ( i < n ) { + max_min_float(ptr_x, n - i, tmp_max, tmp_min); + } + } else { + max_min_float(x, n, tmp_max, tmp_min); + } +#else + max_min_float(x, n, tmp_max, tmp_min); +#endif +} + +void computeStateMedianRadius_float2(float *oriData, size_t nbEle, float absErrBound, + unsigned char *state, float *median, float *radius) { + float min = oriData[0]; + float max = oriData[0]; + simd_max_min_float(oriData, nbEle, &max, &min); + + float valueRange = max - min; + *radius = valueRange / 2; + *median = min + *radius; + + if (*radius <= absErrBound) { + *state = 0; + } else { + *state = 1; + } +} + + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_float(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, + int blockSize) { + float *op = oriData; + + *outSize = 0; + size_t maxPreservedBufferSize = + sizeof(float) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int)); + + size_t i = 0; + int oSize = 0; + + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t stateNBBytes = + remainCount == 0 ? (nbBlocks % 8 == 0 ? nbBlocks / 8 : nbBlocks / 8 + 1) : ((nbBlocks + 1) % 8 == 0 ? + (nbBlocks + 1) / 8 : + (nbBlocks + 1) / 8 + 1); + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + float *radiusArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbConstantBlocks = computeStateMedianRadius_float(oriData, nbEle, absErrBound, blockSize, stateArray, + medianArray, radiusArray); + + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 0; // indicates this is not a random access version + r[4] = (unsigned char) blockSize; + r = r + 5; //1 byte + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); //r is the starting address of 'stateNBBytes' + + unsigned char *p = r + stateNBBytes; //p is the starting address of constant median values. + unsigned char *q = + p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + //3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... 
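+ // Layout of the blocked (non-random-access) stream written above:
+ //   [2B version][1B fast-mode flag][1B random-access flag = 0][1B blockSize]
+ //   [size_t nbConstantBlocks][state bitmap, 1 bit per block]
+ //   [float median per constant block][compressed non-constant block payloads]
+ // The fixed-size part is counted into *outSize just below; each non-constant
+ // block then adds its own oSize inside the loop that follows.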
+ *outSize += (3 + 1 + 1 + sizeof(size_t) + stateNBBytes + sizeof(float) * nbConstantBlocks); + + //printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f*(nbConstantBlocks*blockSize)/nbEle); + for (i = 0; i < nbBlocks; i++, op += blockSize) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op, blockSize, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + q += oSize; + *outSize += oSize; + } else { + floatToBytes(p, medianArray[i]); + p += sizeof(float); + } + } + + if (remainCount != 0) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op, remainCount, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + *outSize += oSize; + } else { + floatToBytes(p, medianArray[i]); + } + + } + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, r); + + free(stateArray); + free(medianArray); + free(radiusArray); + free(leadNumberArray_int); + + return outputBytes; +} + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_float_openmp(float *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) { +#ifdef _OPENMP + printf("use openmp\n"); + +#ifdef __AVX512F__ + printf("use avx512\n"); +#elif __AVX2__ + printf("use avx2\n"); +#else +#endif + printf("blockSize = %d\n",blockSize); + sz_cost_start(); + float *op = oriData; + + size_t i = 0; + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + size_t stateNBBytes = (actualNBBlocks % 8 == 0 ? actualNBBlocks / 8 : actualNBBlocks / 8 + 1); + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbNonConstantBlocks = 0; + + unsigned char *tmp_q = (unsigned char *) malloc(blockSize * sizeof(float) * actualNBBlocks); + int *outSizes = (int *) malloc(actualNBBlocks * sizeof(int)); + size_t *outSizesAccumlate = (size_t *) malloc(actualNBBlocks * sizeof(size_t)); + int *nbNonConstantBlockAccumlate = (int *) malloc(actualNBBlocks * sizeof(int)); + + (*outSize) = 0; + size_t maxPreservedBufferSize = + sizeof(float) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 1; //support random access decompression + r = r + 4; //4 byte + + int nbThreads = 1; + unsigned char *leadNumberArray_int; + size_t z0[200],z1[200]; + + size_t nbConstantBlocks; + unsigned char *R, *p, *q; + float *pf; + uint16_t *O; + +#pragma omp parallel +{ +#pragma omp single +{ + nbThreads = omp_get_num_threads(); + printf("nbThreads = %d\n", nbThreads); + assert(nbThreads<200); + leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int) * nbThreads); + + sz_cost_end_msg("sequential-1 malloc"); + sz_cost_start(); +} +#pragma omp for reduction(+:nbNonConstantBlocks) schedule(static) + for (i = 0; i < nbBlocks; i++) { + float radius; + computeStateMedianRadius_float2(op + i * blockSize, blockSize, absErrBound, stateArray + i, medianArray + i, + &radius); + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op + i * blockSize, blockSize, absErrBound, + tmp_q + i * blockSize * 
sizeof(float), outSizes + i, + leadNumberArray_int + + omp_get_thread_num() * blockSize * sizeof(int), + medianArray[i], radius); + outSizesAccumlate[i]=outSizes[i]; + nbNonConstantBlocks += 1; + }else{ + outSizes[i]=0; + outSizesAccumlate[i]=0; + } + } +#pragma omp single +{ + sz_cost_end_msg("parallel-1 compress"); +// exit(0); + if (remainCount != 0) { + i = nbBlocks; + float radius; + computeStateMedianRadius_float2(op + i * blockSize, remainCount, absErrBound, stateArray + i, medianArray + i, + &radius); + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op + i * blockSize, remainCount, absErrBound, + tmp_q + i * blockSize * sizeof(float), outSizes + i, + leadNumberArray_int, medianArray[i], radius); + outSizesAccumlate[i] = outSizes[i]; + nbNonConstantBlocks += 1; + }else{ + outSizesAccumlate[i] = 0; + outSizes[i]=0; + } + } + + nbConstantBlocks = actualNBBlocks - nbNonConstantBlocks; + + sizeToBytes(r, blockSize); + r += sizeof(size_t); + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); + O = (uint16_t*) r; //o is the starting address of 'block-size array' + R = r + nbNonConstantBlocks * sizeof(uint16_t); //R is the starting address of the state array + p = R + stateNBBytes; //p is the starting address of constant median values. + pf = (float *) p; + q = p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + // unsigned char *q0 = q; + // printf("%lu %lu %lu %lu\n",r-outputBytes, R-outputBytes, p-outputBytes, q-outputBytes); + // 3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... + *outSize = q - outputBytes; + + sz_cost_start(); + +} + int tid = omp_get_thread_num(); + int lo = tid * actualNBBlocks / nbThreads; + int hi = (tid + 1) * actualNBBlocks / nbThreads; + int b; + nbNonConstantBlockAccumlate[lo]=stateArray[lo]; + for (b = lo+1; b < hi; b++){ + outSizesAccumlate[b] = outSizesAccumlate[b] + outSizesAccumlate[b-1]; + } + for (b = lo+1; b < hi; b++){ + nbNonConstantBlockAccumlate[b]=stateArray[b]+nbNonConstantBlockAccumlate[b-1]; + } + z0[tid] = outSizesAccumlate[hi-1]; + z1[tid] = nbNonConstantBlockAccumlate[hi-1]; + size_t offset0=0, offset1=0; +#pragma omp barrier + for (int j = 0; j < tid; j++) { + offset0+=z0[j]; + offset1+=z1[j]; + } + for (b = lo; b < hi; b++){ + outSizesAccumlate[b] = outSizesAccumlate[b] + offset0; + nbNonConstantBlockAccumlate[b] = nbNonConstantBlockAccumlate[b] + offset1; + } +#pragma omp single +{ + sz_cost_end_msg("parallel-2 prefix sum"); + sz_cost_start(); +}; +#pragma omp for schedule(static) + for (i = 0; i < actualNBBlocks; i++) { + if (stateArray[i]) { + memcpy(q+outSizesAccumlate[i]-outSizes[i], tmp_q + i * blockSize * sizeof(float), outSizes[i]); + O[nbNonConstantBlockAccumlate[i]-1]=outSizes[i]; + } else { + pf[i-nbNonConstantBlockAccumlate[i]]=medianArray[i]; + } + } +#pragma omp single +{ + sz_cost_end_msg("parallel-3 memcpy"); + sz_cost_start(); + + *outSize += outSizesAccumlate[actualNBBlocks-1]; + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, R); + sz_cost_end_msg("sequential-2 int2byte"); + sz_cost_start(); + free(nbNonConstantBlockAccumlate); + free(outSizesAccumlate); + free(leadNumberArray_int); + free(tmp_q); + free(medianArray); + free(stateArray); + free(outSizes); + sz_cost_end_msg("sequential-3 free"); + printf("blocksize = %d, actualNBBlocks = %lu\n", blockSize, actualNBBlocks); + printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f * (nbConstantBlocks * 
blockSize) / nbEle); + printf("CR = %.3f, nbEle = %lu \n", nbEle*4.0/(*outSize), nbEle); +} +} + return outputBytes; +#else + return NULL; +#endif +} + +unsigned char * +SZ_fast_compress_args_unpredictable_blocked_randomaccess_float(float *oriData, size_t *outSize, float absErrBound, + size_t nbEle, int blockSize) { + float *op = oriData; + + *outSize = 0; + size_t maxPreservedBufferSize = + sizeof(float) * nbEle; //assume that the compressed data size would not exceed the original size + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *leadNumberArray_int = (unsigned char *) malloc(blockSize * sizeof(int)); + + size_t i = 0; + int oSize = 0; + + size_t nbBlocks = nbEle / blockSize; + size_t remainCount = nbEle % blockSize; + size_t actualNBBlocks = remainCount == 0 ? nbBlocks : nbBlocks + 1; + + size_t stateNBBytes = (actualNBBlocks % 8 == 0 ? actualNBBlocks / 8 : actualNBBlocks / 8 + 1); + + unsigned char *stateArray = (unsigned char *) malloc(actualNBBlocks); + float *medianArray = (float *) malloc(actualNBBlocks * sizeof(float)); + float *radiusArray = (float *) malloc(actualNBBlocks * sizeof(float)); + + size_t nbConstantBlocks = computeStateMedianRadius_float(oriData, nbEle, absErrBound, blockSize, stateArray, + medianArray, radiusArray); + + size_t nbNonConstantBlocks = actualNBBlocks - nbConstantBlocks; + + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; + r[3] = 1; //support random access decompression + r = r + 4; //1 byte + + sizeToBytes(r, blockSize); + r += sizeof(size_t); + sizeToBytes(r, nbConstantBlocks); + r += sizeof(size_t); //r is the starting address of 'block-size array' + uint16_t *O=(uint16_t*)r; + unsigned char *R = r + nbNonConstantBlocks*sizeof(uint16_t); //R is the starting address of the state array + unsigned char *p = R + stateNBBytes; //p is the starting address of constant median values. + unsigned char *q = + p + sizeof(float) * nbConstantBlocks; //q is the starting address of the non-constant data sblocks + //3: versions, 1: metadata: state, 1: metadata: blockSize, sizeof(size_t): nbConstantBlocks, .... 
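+ // Random-access layout: [2B version][1B fast-mode flag][1B random-access flag = 1]
+ //   [size_t blockSize][size_t nbConstantBlocks]
+ //   [uint16_t compressed size of each non-constant block (array O)]
+ //   [state bitmap][float median per constant block][non-constant block payloads]
+ // The per-block size array O is what enables random-access decompression:
+ // a block's payload offset can be computed without decoding earlier blocks.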
+ *outSize = q-outputBytes; + + size_t nonConstantBlockID = 0; + //printf("nbConstantBlocks = %zu, percent = %f\n", nbConstantBlocks, 1.0f*(nbConstantBlocks*blockSize)/nbEle); + for (i = 0; i < nbBlocks; i++, op += blockSize) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op, blockSize, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + q += oSize; + *outSize += oSize; + O[nonConstantBlockID++] = oSize; + } else { + floatToBytes(p, medianArray[i]); + p += sizeof(float); + } + } + + if (remainCount != 0) { + if (stateArray[i]) { + SZ_fast_compress_args_unpredictable_one_block_float(op, remainCount, absErrBound, q, &oSize, + leadNumberArray_int, medianArray[i], radiusArray[i]); + *outSize += oSize; + O[nonConstantBlockID] = oSize; + } else { + floatToBytes(p, medianArray[i]); + } + + } + + convertIntArray2ByteArray_fast_1b_args(stateArray, actualNBBlocks, R); + + free(leadNumberArray_int); + + return outputBytes; + +} + + +unsigned char * +SZ_fast_compress_args_unpredictable_float(float *data, size_t *outSize, float absErrBound, size_t r5, size_t r4, + size_t r3, size_t r2, size_t r1, float mValue, float radius) { + size_t totalSize = 0; + float medianValue = mValue; + + size_t dataLength = computeDataLength(r5, r4, r3, r2, r1); + + size_t maxPreservedBufferSize = + sizeof(float) * dataLength; //assume that the compressed data size would not exceed the original size + + unsigned char *outputBytes = (unsigned char *) malloc(maxPreservedBufferSize); + memset(outputBytes, 0, maxPreservedBufferSize); + unsigned char *r = outputBytes; // + sizeof(size_t) + stateNBBytes; + r[0] = SZx_VER_MAJOR; + r[1] = SZx_VER_MINOR; + r[2] = 1; //SZx_VER_SUPERFAST + r[3] = 0; //support random access decompression + +// sz_cost_start(); + size_t i; + int reqLength; + short radExpo = getExponent_float(radius); + + computeReqLength_float(absErrBound, radExpo, &reqLength, &medianValue); + + int reqBytesLength = reqLength / 8; + int resiBitsLength = reqLength % 8; + int rightShiftBits = 0; + + size_t leadNumberArray_size = dataLength % 4 == 0 ? 
dataLength / 4 : dataLength / 4 + 1; + + register lfloat lfBuf_pre; + register lfloat lfBuf_cur; + lfBuf_pre.ivalue = 0; + + unsigned char *leadNumberArray = outputBytes + 4 + 1 + sizeof(float) + sizeof(size_t); + + unsigned char *exactMidbyteArray = leadNumberArray + leadNumberArray_size; + + if (resiBitsLength != 0) { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength++; + } + + register unsigned char leadingNum = 0; + + unsigned char *leadNumberArray_int = (unsigned char *) malloc(dataLength); + + size_t residualMidBytes_size = 0; + if (sysEndianType == LITTLE_ENDIAN_SYSTEM) { + if (reqBytesLength == 3) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[3]; + residualMidBytes_size += 3; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[2]; + residualMidBytes_size += 2; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[1]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 2) { + for (i = 0; i < dataLength; i++) { + + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[3]; + residualMidBytes_size += 2; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[2]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 1) { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[3]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + }else //reqBytesLength == 4 + { + for (i = 0; i < dataLength; i++) { + leadingNum = 0; + lfBuf_cur.value = data[i] - medianValue; + + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre.ivalue = lfBuf_cur.ivalue ^ lfBuf_pre.ivalue; + + if (lfBuf_pre.ivalue >> 8 == 0) + leadingNum = 3; + else if (lfBuf_pre.ivalue >> 16 == 0) + leadingNum = 2; + else if (lfBuf_pre.ivalue >> 24 == 0) + leadingNum = 1; + + leadNumberArray_int[i] = leadingNum; + + 
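+ // The (4 - leadingNum) low-order bytes are stored verbatim below; the
+ // leadingNum high-order bytes are identical to the previous (XOR-compared)
+ // value and are reconstructed from the running predecessor at decompression.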
if (leadingNum == 0) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + exactMidbyteArray[residualMidBytes_size + 3] = lfBuf_cur.byte[3]; + residualMidBytes_size += 4; + } else if (leadingNum == 1) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + exactMidbyteArray[residualMidBytes_size + 2] = lfBuf_cur.byte[2]; + residualMidBytes_size += 3; + } else if (leadingNum == 2) { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + exactMidbyteArray[residualMidBytes_size + 1] = lfBuf_cur.byte[1]; + residualMidBytes_size += 2; + } else //leadingNum == 3 + { + exactMidbyteArray[residualMidBytes_size] = lfBuf_cur.byte[0]; + residualMidBytes_size++; + } + + lfBuf_pre = lfBuf_cur; + } + } + + convertIntArray2ByteArray_fast_2b_args(leadNumberArray_int, dataLength, leadNumberArray); + + int k = 4; + + unsigned char reqLengthB = (unsigned char) reqLength; + outputBytes[k] = reqLengthB; + k++; + floatToBytes(&(outputBytes[k]), medianValue); + k += sizeof(float); + sizeToBytes(&(outputBytes[k]), leadNumberArray_size); + + totalSize = 4 + 1 + sizeof(float) + sizeof(size_t) + leadNumberArray_size + residualMidBytes_size; + } else { + + } + + *outSize = totalSize; + + free(leadNumberArray_int); +// sz_cost_end(); +// printf("compression time = %f\n", sz_totalCost); + + return outputBytes; +} + +unsigned char *SZ_skip_compress_float(float *data, size_t dataLength, size_t *outSize) { + *outSize = dataLength * sizeof(float); + unsigned char *out = (unsigned char *) malloc(dataLength * sizeof(float)); + memcpy(out, data, dataLength * sizeof(float)); + return out; +} + +inline void computeReqLength_float(double realPrecision, short radExpo, int *reqLength, float *medianValue) { + short reqExpo = getPrecisionReqLength_double(realPrecision); + *reqLength = 9 + radExpo - reqExpo + 1; //radExpo-reqExpo == reqMantiLength + if (*reqLength < 9) + *reqLength = 9; + if (*reqLength > 32) { + *reqLength = 32; + *medianValue = 0; + } +} diff --git a/qtensor/compression/szx/src/szx_rw.c b/qtensor/compression/szx/src/szx_rw.c new file mode 100644 index 00000000..8e3e92a3 --- /dev/null +++ b/qtensor/compression/szx/src/szx_rw.c @@ -0,0 +1,1009 @@ +/** + * @file szx_rw.c + * @author Sheng Di + * @date April, 2022 + * @brief io interface for fortrance + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ + +#include +#include +#include +#include +#include + +#include "szx_rw.h" +#include "szx.h" +#include "szx_BytesToolkit.h" +#include "szx_dataCompression.h" + +int checkFileExistance(char* filePath) +{ + if( access( filePath, F_OK ) != -1 ) { + // file exists + return 1; + } else { + // file doesn't exist + return 0; + } +} + +float** create2DArray_float(size_t m, size_t n) +{ + size_t i=0; + float **data = (float**)malloc(sizeof(float*)*m); + for(i=0;i +#include +#include +#include +#include "szx_utility.h" + +struct timeval sz_costStart; /*only used for recording the cost*/ +double sz_totalCost = 0; + +void sz_cost_start() +{ + sz_totalCost = 0; + gettimeofday(&sz_costStart, NULL); +} + +void sz_cost_end() +{ + double elapsed; + struct timeval costEnd; + gettimeofday(&costEnd, NULL); + elapsed = ((costEnd.tv_sec*1000000+costEnd.tv_usec)-(sz_costStart.tv_sec*1000000+sz_costStart.tv_usec))/1000000.0; + sz_totalCost += elapsed; +} + +void sz_cost_end_msg(char *msg) +{ + double elapsed; + struct timeval costEnd; + gettimeofday(&costEnd, NULL); + elapsed = ((costEnd.tv_sec*1000000+costEnd.tv_usec)-(sz_costStart.tv_sec*1000000+sz_costStart.tv_usec))/1000000.0; + sz_totalCost += elapsed; + printf("timecost=%f, %s\n", elapsed, msg); +} diff --git a/qtensor/compression/szx/src/szxd_double.c b/qtensor/compression/szx/src/szxd_double.c new file mode 100644 index 00000000..b04bca20 --- /dev/null +++ b/qtensor/compression/szx/src/szxd_double.c @@ -0,0 +1,1104 @@ +/** + * @file szxd_double.c + * @author Sheng Di, Kai Zhao + * @date Feb, 2022 + * @brief + * (C) 2022 by Mathematics and Computer Science (MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#include +#include +#include +#include "szxd_double.h" +#include "szx.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#ifdef _OPENMP +#include "omp.h" +#endif + +int SZ_fast_decompress_args_unpredictable_one_block_double(double* newData, size_t blockSize, unsigned char* cmpBytes) +{ + int cmpSize = 0; + size_t nbEle = blockSize; + + register double medianValue; + size_t leadNumArray_size = nbEle%4==0?nbEle/4:nbEle/4+1; + + size_t k = 0; + int reqLength = (int)cmpBytes[k]; + k++; + medianValue = (double)bytesToFloat(&(cmpBytes[k])); + k+=sizeof(float); + + unsigned char* leadNumArray = &(cmpBytes[k]); + k += leadNumArray_size; + unsigned char* residualMidBytes = &(cmpBytes[k]); + unsigned char* q = residualMidBytes; + + cmpSize = k; + + size_t i = 0, j = 0; + k = 0; + + register ldouble lfBuf_pre; + register ldouble lfBuf_cur; + + lfBuf_pre.lvalue = 0; + + int reqBytesLength, resiBitsLength; + register unsigned char leadingNum; + + reqBytesLength = reqLength/8; + resiBitsLength = reqLength%8; + int rightShiftBits = 0; + + if(resiBitsLength!=0) + { + rightShiftBits = 8 - resiBitsLength; + reqBytesLength ++; + } + + //sz_cost_start(); + if(sysEndianType==LITTLE_ENDIAN_SYSTEM) + { + //reqBytesLength must not be equal to 1 for double data + if(reqBytesLength == 3) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[5] = q[0]; + lfBuf_cur.byte[6] = q[1]; + q += 2; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[5] = q[0]; + q += 1; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + 
lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + } + else //==0 + { + lfBuf_cur.byte[5] = q[0]; + lfBuf_cur.byte[6] = q[1]; + lfBuf_cur.byte[7] = q[2]; + q += 3; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 2) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[6] = q[0]; + q += 1; + } + else if(leadingNum >= 2) + { + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + } + else //==0 + { + lfBuf_cur.byte[6] = q[0]; + lfBuf_cur.byte[7] = q[1]; + q += 2; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 4) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = q[2]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 2; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 1; + } + else //==0 + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = q[2]; + lfBuf_cur.byte[7] = q[3]; + q += 4; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 5) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = q[3]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 2; + } + else //==0 + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = q[3]; + lfBuf_cur.byte[7] = q[3]; + q += 5; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 6) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = 
(leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = q[4]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else //==0 + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = q[4]; + lfBuf_cur.byte[7] = q[5]; + q += 6; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 7) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = q[5]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 6; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else //==0 + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = q[5]; + lfBuf_cur.byte[7] = q[6]; + q += 7; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else //reqBytesLength == 8 + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = q[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 7; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 6; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + 
lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else //==0 + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = q[6]; + lfBuf_cur.byte[7] = q[7]; + q += 8; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + } + else + { + + } + + cmpSize += (q - residualMidBytes); //add the number of residualMidBytes + return cmpSize; +} + + +void SZ_fast_decompress_args_unpredictable_blocked_double(double** newData, size_t nbEle, unsigned char* cmpBytes) +{ + *newData = (double*)malloc(sizeof(double)*nbEle); + + unsigned char* r = cmpBytes; + r += 4; + int blockSize = r[0]; //get block size + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + + size_t nbBlocks = nbEle/blockSize; + size_t remainCount = nbEle%blockSize; + size_t stateNBBytes = remainCount == 0 ? (nbBlocks%8==0?nbBlocks/8:nbBlocks/8+1) : ((nbBlocks+1)%8==0? (nbBlocks+1)/8:(nbBlocks+1)/8+1); + size_t actualNBBlocks = remainCount==0 ? nbBlocks : nbBlocks+1; + unsigned char* stateArray = (unsigned char*)malloc(actualNBBlocks); + float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + convertByteArray2IntArray_fast_1b_args(actualNBBlocks, r, stateNBBytes, stateArray); //get the stateArray + + unsigned char* p = r + stateNBBytes; //p is the starting address of constant median values. + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = bytesToFloat(p+j); + + unsigned char* q = p + sizeof(float)*nbConstantBlocks; //q is the starting address of the non-constant data blocks + double* op = *newData; + + for(i=0;i> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[5] = q[0]; + lfBuf_cur.byte[6] = q[1]; + q += 2; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[5] = q[0]; + q += 1; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + } + else //==0 + { + lfBuf_cur.byte[5] = q[0]; + lfBuf_cur.byte[6] = q[1]; + lfBuf_cur.byte[7] = q[2]; + q += 3; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 2) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + lfBuf_cur.byte[6] = q[0]; + q += 1; + } + else if(leadingNum >= 2) + { + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + } + else //==0 + { + lfBuf_cur.byte[6] = q[0]; + lfBuf_cur.byte[7] = q[1]; + q += 2; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + 
lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 4) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = q[2]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 2; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 1; + } + else //==0 + { + lfBuf_cur.byte[4] = q[0]; + lfBuf_cur.byte[5] = q[1]; + lfBuf_cur.byte[6] = q[2]; + lfBuf_cur.byte[7] = q[3]; + q += 4; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 5) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = q[3]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 2; + } + else //==0 + { + lfBuf_cur.byte[3] = q[0]; + lfBuf_cur.byte[4] = q[1]; + lfBuf_cur.byte[5] = q[2]; + lfBuf_cur.byte[6] = q[3]; + lfBuf_cur.byte[7] = q[3]; + q += 5; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 6) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = q[4]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 3; + } + else //==0 + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + lfBuf_cur.byte[4] = q[2]; + lfBuf_cur.byte[5] = q[3]; + lfBuf_cur.byte[6] = q[4]; + lfBuf_cur.byte[7] = q[5]; + q += 6; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + 
(*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 7) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = q[5]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 6; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 4; + } + else //==0 + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + lfBuf_cur.byte[4] = q[3]; + lfBuf_cur.byte[5] = q[4]; + lfBuf_cur.byte[6] = q[5]; + lfBuf_cur.byte[7] = q[6]; + q += 7; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else //reqBytesLength == 8 + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = q[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 7; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 6; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = lfBuf_pre.byte[5]; + lfBuf_cur.byte[6] = lfBuf_pre.byte[6]; + lfBuf_cur.byte[7] = lfBuf_pre.byte[7]; + q += 5; + } + else //==0 + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + lfBuf_cur.byte[4] = q[4]; + lfBuf_cur.byte[5] = q[5]; + lfBuf_cur.byte[6] = q[6]; + lfBuf_cur.byte[7] = q[7]; + q += 8; + } + + lfBuf_cur.lvalue = lfBuf_cur.lvalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.lvalue = lfBuf_cur.lvalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + + + + } + + //sz_cost_end(); + //printf("totalCost = %f\n", sz_totalCost); + //free(leadNum); + +} diff --git a/qtensor/compression/szx/src/szxd_float.c b/qtensor/compression/szx/src/szxd_float.c new file mode 100644 index 00000000..63d6ad6e --- /dev/null +++ b/qtensor/compression/szx/src/szxd_float.c @@ -0,0 +1,654 @@ +/** + * @file szxd_float.c + * @author Sheng Di, Kai Zhao + * @date Feb, 2022 + * @brief + * (C) 2022 by Mathematics and Computer Science 
(MCS), Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ + +#include +#include +#include +#include "szxd_float.h" +#include "szx.h" +#include "szx_BytesToolkit.h" +#include "szx_TypeManager.h" +#ifdef _OPENMP +#include "omp.h" +#endif + +void SZ_fast_decompress_args_with_prediction_float(float** newData, float* pred, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize) +{ + size_t nbEle = computeDataLength(r5, r4, r3, r2, r1); + SZ_fast_decompress_args_unpredictable_float(newData, r5, r4, r3, r2, r1, cmpBytes, cmpSize); + size_t i = 0; + for(i=0;i> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + q += 2; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[1] = q[0]; + q += 1; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[1] = lfBuf_pre.byte[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } + else //==0 + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + q += 3; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 2) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[2] = q[0]; + q += 1; + } + else if(leadingNum >= 2) + { + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } + else //==0 + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + q += 2; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else if(reqBytesLength == 1) + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum != 0) //>=1 + { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } + else //==0 + { + lfBuf_cur.byte[3] = q[0]; + q += 1; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + else //reqBytesLength == 4 + { + for(i=0;i < nbEle;i++) + { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if(leadingNum == 1) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 3; + } + else if(leadingNum == 2) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 2; + } + else if(leadingNum == 3) + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = lfBuf_pre.byte[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 1; + } + else //==0 + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] 
= q[2]; + lfBuf_cur.byte[3] = q[3]; + q += 4; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + newData[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + } + else + { + + } + + cmpSize += (q - residualMidBytes); //add the number of residualMidBytes + return cmpSize; +} + + +void SZ_fast_decompress_args_unpredictable_blocked_float(float** newData, size_t nbEle, unsigned char* cmpBytes) +{ + *newData = (float*)malloc(sizeof(float)*nbEle); + + unsigned char* r = cmpBytes; + r += 4; + int blockSize = r[0]; //get block size + r++; + size_t nbConstantBlocks = bytesToLong_bigEndian(r); //get number of constant blocks + r += sizeof(size_t); + + size_t nbBlocks = nbEle/blockSize; + size_t remainCount = nbEle%blockSize; + size_t stateNBBytes = remainCount == 0 ? (nbBlocks%8==0?nbBlocks/8:nbBlocks/8+1) : ((nbBlocks+1)%8==0? (nbBlocks+1)/8:(nbBlocks+1)/8+1); + size_t actualNBBlocks = remainCount==0 ? nbBlocks : nbBlocks+1; + unsigned char* stateArray = (unsigned char*)malloc(actualNBBlocks); + float* constantMedianArray = (float*)malloc(nbConstantBlocks*sizeof(float)); + + convertByteArray2IntArray_fast_1b_args(actualNBBlocks, r, stateNBBytes, stateArray); //get the stateArray + + unsigned char* p = r + stateNBBytes; //p is the starting address of constant median values. + + size_t i = 0, j = 0, k = 0; //k is used to keep track of constant block index + for(i = 0;i < nbConstantBlocks;i++, j+=4) //get the median values for constant-value blocks + constantMedianArray[i] = bytesToFloat(p+j); + + unsigned char* q = p + sizeof(float)*nbConstantBlocks; //q is the starting address of the non-constant data blocks + float* op = *newData; + + for(i=0;i> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if (leadingNum == 1) { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + q += 2; + } else if (leadingNum == 2) { + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[1] = q[0]; + q += 1; + } else if (leadingNum == 3) { + lfBuf_cur.byte[1] = lfBuf_pre.byte[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } else //==0 + { + lfBuf_cur.byte[1] = q[0]; + lfBuf_cur.byte[2] = q[1]; + lfBuf_cur.byte[3] = q[2]; + q += 3; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } else if (reqBytesLength == 2) { + for (i = 0; i < nbEle; i++) { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if (leadingNum == 1) { + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + lfBuf_cur.byte[2] = q[0]; + q += 1; + } else if (leadingNum >= 2) { + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } else //==0 + { + lfBuf_cur.byte[2] = q[0]; + lfBuf_cur.byte[3] = q[1]; + q += 2; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + + } + } else if (reqBytesLength == 1) { + for (i = 0; i < nbEle; i++) { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if (leadingNum != 0) //>=1 + { + 
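+                /* reqBytesLength == 1: only the most-significant byte of each value is
+                   stored. leadNumArray packs four 2-bit codes per byte; a nonzero code
+                   means this byte equals the previous value's, so it is reused from
+                   lfBuf_pre instead of being read from the residual stream q. */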
lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + } else //==0 + { + lfBuf_cur.byte[3] = q[0]; + q += 1; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } else { + for (i = 0; i < nbEle; i++) { + lfBuf_cur.value = 0; + + j = (i >> 2); //i/4 + k = (i & 0x03) << 1; //(i%4)*2 + leadingNum = (leadNumArray[j] >> (6 - k)) & 0x03; + + if (leadingNum == 1) { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 3; + } else if (leadingNum == 2) { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 2; + } else if (leadingNum == 3) { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = lfBuf_pre.byte[1]; + lfBuf_cur.byte[2] = lfBuf_pre.byte[2]; + lfBuf_cur.byte[3] = lfBuf_pre.byte[3]; + q += 1; + } else //==0 + { + lfBuf_cur.byte[0] = q[0]; + lfBuf_cur.byte[1] = q[1]; + lfBuf_cur.byte[2] = q[2]; + lfBuf_cur.byte[3] = q[3]; + q += 4; + } + + lfBuf_cur.ivalue = lfBuf_cur.ivalue << rightShiftBits; + (*newData)[i] = lfBuf_cur.value + medianValue; + lfBuf_cur.ivalue = lfBuf_cur.ivalue >> rightShiftBits; + + lfBuf_pre = lfBuf_cur; + } + } + } + + //sz_cost_end(); + //printf("totalCost = %f\n", sz_totalCost); + //free(leadNum); + +} diff --git a/qtensor/compression/szx/src/timingGPU.cu b/qtensor/compression/szx/src/timingGPU.cu new file mode 100644 index 00000000..dc390510 --- /dev/null +++ b/qtensor/compression/szx/src/timingGPU.cu @@ -0,0 +1,45 @@ +/**************/ +/* TIMING GPU */ +/**************/ + +#include "timingGPU.h" + +#include +#include + +struct PrivateTimingGPU { + cudaEvent_t start; + cudaEvent_t stop; +}; + +// default constructor +TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } + +// default destructor +TimingGPU::~TimingGPU() { } + +void TimingGPU::StartCounter() +{ + cudaEventCreate(&((*privateTimingGPU).start)); + cudaEventCreate(&((*privateTimingGPU).stop)); + cudaEventRecord((*privateTimingGPU).start,0); +} + +void TimingGPU::StartCounterFlags() +{ + int eventflags = cudaEventBlockingSync; + + cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); + cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); + cudaEventRecord((*privateTimingGPU).start,0); +} + +// Gets the counter in ms +float TimingGPU::GetCounter() +{ + float time; + cudaEventRecord((*privateTimingGPU).stop, 0); + cudaEventSynchronize((*privateTimingGPU).stop); + cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); + return time; +} diff --git a/qtensor/compression/tests/test_compressed_contract.py b/qtensor/compression/tests/test_compressed_contract.py new file mode 100644 index 00000000..05af6de4 --- /dev/null +++ b/qtensor/compression/tests/test_compressed_contract.py @@ -0,0 +1,76 @@ +from qtensor.compression import compressed_contract, compressed_sum, CompressedTensor, Tensor +from qtensor.compression import NumpyCompressor +from qtree.optimizer import Var +import numpy as np + + +def test_compressed_contract(): + compressor = NumpyCompressor() + A_ixs = [Var(x) for x in [8,7,6,5,4,3, 2]] + A_comp = [Var(x) for x in [8, 7, 6]] + B_ixs = [Var(x) for x in [10, 9, 3, 4, 2]] + contract_ixs = [Var(x) for x in [3,2]] + + A_data = np.ones(2**len(A_ixs)) + #A_data = np.random.randn(2**len(A_ixs)) + A_data = A_data.reshape(*(v.size for v in A_ixs)) 
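+    # Perturb a few entries/slices so A is not a constant tensor; with all-ones data
+    # an axis-ordering mistake in compressed_contract would leave the numerical result
+    # unchanged, so the einsum comparison at the end of the test would not catch it.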
+ A_data[1, 1] *= 2 + A_data[0, 1] *= 2 + A_data[:, :, :, 1] *= 1.2 + B_data = np.ones(2**len(B_ixs))*1.2 + #B_data = np.random.randn(2**len(B_ixs))*1.2 + B_data = B_data.reshape(*(v.size for v in B_ixs)) + + A = CompressedTensor('A', A_ixs, data=A_data) + A.compress_indices(A_comp) + B = Tensor('B', B_ixs, data=B_data) + print(f"Tensor A: {A}") + print(f"Tensor B: {B}") + + res_ixs = list(set(A_ixs).union(B_ixs) - set(contract_ixs)) + res_ixs.sort(key=int, reverse=True) + res = compressed_contract(A, B, contract_ixs, + mem_limit=3, compressor=compressor) + print(f"Resulting Tensor: {res}") + + res = compressed_contract(A, B, contract_ixs, + mem_limit=10, compressor=compressor) + + print(f"Resulting Tensor: {res}") + print(res.data.flatten()) + + + A_str = ''.join(chr(97+int(v)) for v in A_ixs) + B_str = ''.join(chr(97+int(v)) for v in B_ixs) + C_str = ''.join(chr(97+int(v)) for v in res_ixs) + expr = f"{A_str},{B_str}->{C_str}" + C = np.einsum(expr, A_data, B_data) + print(f"Ground truth:") + print( C.flatten()) + + assert np.allclose(C, res.data) + print("Success!") + +def test_compressed_sum(): + A_ixs = [Var(x) for x in [8,7,6,5,4,3, 2]] + A_comp = [Var(x) for x in [8, 7, 6]] + A_data = np.random.rand(2**len(A_ixs)) + #A_data = np.random.randn(2**len(A_ixs)) + A_data = A_data.reshape(*(v.size for v in A_ixs)) + A = CompressedTensor('A', A_ixs, data=A_data) + A.compress_indices(A_comp) + sum_indices = [Var(i) for i in [2, 4]] + + res = compressed_sum(A, sum_indices, NumpyCompressor(), mem_limit=4) + print(f"Resulting Tensor: {res}") + res_ref = np.sum(A_data, axis=tuple(A_ixs.index(i) for i in sum_indices)) + assert np.allclose(res.get_chunk((0, )), res_ref[0]) + assert not np.allclose(res.get_chunk((1, )), res_ref[0]) + + res = compressed_sum(res, [Var(5)], NumpyCompressor(), mem_limit=4) + assert isinstance(res, Tensor) + assert np.allclose(res.data, res_ref.sum(axis=3)) + + +if __name__=="__main__": + test_compressed_contract() diff --git a/qtensor/compression/tests/test_compressed_energy_expectation.py b/qtensor/compression/tests/test_compressed_energy_expectation.py new file mode 100644 index 00000000..895999f9 --- /dev/null +++ b/qtensor/compression/tests/test_compressed_energy_expectation.py @@ -0,0 +1,24 @@ +import qtensor +import numpy as np +from qtensor.compression import CUSZPCompressor +import qtensor.tests + +def test_compress_energy_expect(): + G, gamma, beta = qtensor.tests.get_test_problem(n=10, p=2, type='random') + edge = list(G.edges())[0] + composer = qtensor.QtreeQAOAComposer(G, gamma=gamma, beta=beta) + composer.energy_expectation_lightcone(edge) + circuit = composer.circuit + base_backend = qtensor.contraction_backends.get_backend('cupy') + compressor = CUSZPCompressor(r2r_error=1e-4, r2r_threshold=1e-4) + backend = qtensor.contraction_backends.CompressionBackend(base_backend, compressor, max_tw=6) + sim = qtensor.QtreeSimulator(backend=backend) + res = sim.simulate(circuit) + sim_exact = qtensor.QtreeSimulator(backend=base_backend) + ref = sim_exact.simulate(circuit) + print(f'exact: {ref}, compressed: {res}') + assert np.allclose(res, ref, atol=1e-4, rtol=0.05) + +if __name__ == '__main__': + test_energy_expect() + print('test passed!') diff --git a/qtensor/compression/tests/test_compressed_tensor.py b/qtensor/compression/tests/test_compressed_tensor.py new file mode 100644 index 00000000..dd71f97d --- /dev/null +++ b/qtensor/compression/tests/test_compressed_tensor.py @@ -0,0 +1,96 @@ +from qtensor.compression import CompressedTensor +from qtensor.compression 
import ( + NumpyCompressor, + CUSZPCompressor, + CUSZXCompressor, + TorchCompressor, +) +from qtree.optimizer import Var +from qtree.system_defs import NP_ARRAY_TYPE +import pytest +import numpy as np + + +def test_empty_tensor(): + shape = (2, 3, 4) + indices = [Var(i, size=s) for i, s in enumerate(shape)] + t = CompressedTensor.empty("myT", indices) + assert t.name == "myT" + assert t.indices == tuple(indices) + assert t.shape == shape + assert t.data is not None + assert t.data.shape == shape + assert t.data.dtype == NP_ARRAY_TYPE + + t.compress_indices([indices[0]]) + assert t.dtype == NP_ARRAY_TYPE + + +def test_slice_tensor(): + shape = (2, 3, 4) + indices = [Var(i, size=s) for i, s in enumerate(shape)] + t = CompressedTensor.empty("myT", indices, dtype=np.uint32) + t.compress_indices([indices[0]]) + S = t[{indices[0]: 1, indices[1]: slice(0, 1)}] + assert S.data is not None + assert S.data.shape == (1, 4) + assert indices[0] not in S.indices + assert int(indices[1]) == int(S.indices[0]) + assert indices[1] != S.indices[0] + assert indices[2] in S.indices + assert S.indices[1].size == 4 + assert np.allclose(t.get_chunk([1])[0:1], S.data) + + t = CompressedTensor.empty("myT", indices, dtype=np.uint32) + t.compress_indices([indices[0], indices[1]]) + S = t[1, 2] + assert indices[1] not in S.indices + assert S.data is not None + assert np.allclose(t.get_chunk([1, 2]), S.data) + + +@pytest.mark.parametrize( + argnames=["shape", "compressor_cls", "dtype"], + argvalues=[ + ((2, 3, 4), NumpyCompressor, np.float32), + ((2, 3, 4), NumpyCompressor, np.float64), + #((2,) * 20, TorchCompressor, np.complex64), + ((2,) * 20, CUSZXCompressor, np.complex64), + ((2,) * 20, CUSZPCompressor, np.complex64), + + # Not supported: + # ((2, 3, 4), CUSZXCompressor, np.float32), + # ((2, 3, 4), CUSZXCompressor, np.float64), + # ((2, 3, 4), CUSZXCompressor, np.complex128), + # ((2,)*20, CUSZXCompressor, np.float32), + # ((2,)*20, CUSZCompressor(), np.float64) + ], +) +def test_compressors(shape, compressor_cls, dtype): + print(shape, compressor_cls, dtype) + compressor = compressor_cls() + import cupy + + indices = [Var(i, size=s) for i, s in enumerate(shape)] + if dtype is np.complex128: + data = cupy.random.random(shape, dtype=np.float64) + 1j * cupy.random.random( + shape, dtype=np.float64 + ) + elif dtype is np.complex64: + data = cupy.random.random(shape, dtype=np.float32) + 1j * cupy.random.random( + shape, dtype=np.float32 + ) + else: + data = cupy.random.random(shape, dtype=dtype) + t = CompressedTensor("myT", indices, data=data, compressor=compressor) + t.compress_indices([indices[0]]) + print("<--Compressed") + + s = t[1] + print("-->Decompressed") + assert s.data is not None + ch = cupy.asnumpy(t.get_chunk([1])) + ref = cupy.asnumpy(s.data) + + assert np.allclose(ch, ref) + assert np.allclose(ch, data[1], rtol=0.15, atol=0.05) diff --git a/qtensor/compression/tests/test_cost_estimation.py b/qtensor/compression/tests/test_cost_estimation.py new file mode 100644 index 00000000..0a9aa24e --- /dev/null +++ b/qtensor/compression/tests/test_cost_estimation.py @@ -0,0 +1,75 @@ +import qtensor +import numpy as np +from qtensor.compression import compressed_contraction_cost +from qtensor.tests import get_test_problem +from qtensor.optimisation import QtreeTensorNet +from qtensor import QtreeQAOAComposer +from qtensor.optimisation.Optimizer import TreeTrimSplitter + + +def costs_to_csv(costs): + first_line = "flops, memory, width, compressions, decompressions, time" + lines = [first_line] + for i, c in 
enumerate(costs): + time = c.time(1e11/16, 200e9/16, 200e9/15, 13) + lines.append(f"[{i}]\t{c.flops},\t{round(c.memory)},\t{c.width},\t {c.compressions},\t{c.decompressions},\t{time}") + return "\n".join(lines) + +def test_compressed_contraction_cost(): + G, gamma, beta = get_test_problem(n=12, p=5, d=4) + opt = qtensor.toolbox.get_ordering_algo('naive') + + composer = QtreeQAOAComposer( + graph=G, gamma=gamma, beta=beta) + composer.ansatz_state() + + tn = QtreeTensorNet.from_qtree_gates(composer.circuit) + #max_time = 15 + peo, t = opt.optimize(tn) + print(f"Contraction width: {opt.treewidth}") + M_limit = opt.treewidth-6 + # -- Estimate compressed contraction + costs = compressed_contraction_cost(tn, peo, mem_limit=M_limit, compression_ratio=64) + cost = sum(costs[2:], costs[0]) + print(costs_to_csv(costs)) + # -- Estimate regular contraction + mems_lg, flops_lg = tn.simulation_cost(peo) + ignored_vars = tn.bra_vars + tn.ket_vars + peo = [x for x in peo if x not in ignored_vars] + peo = list(map(int, peo)) + nodes, path = qtensor.utils.get_neighbors_path(tn.get_line_graph(), peo) + print("Path\n", path) + # -- Estimate sliced contraction + opt_par = qtensor.optimisation.SlicesOptimizer(base_ordering=opt, max_tw=M_limit+1, max_slice=2+opt.treewidth-M_limit) + #opt_par = TreeTrimSplitter(base_ordering=opt, max_tw=M_limit+1, max_slice=5+opt.treewidth-M_limit) + peo, par_vars, tn = opt_par.optimize(tn) + print("Par vars", par_vars) + tn.slice({i: slice(0, 1) for i in par_vars}) + peo_sl= peo[:-len(par_vars)] + costs_sliced = compressed_contraction_cost(tn, peo_sl) + cost_sliced = sum(costs_sliced[1:], costs_sliced[0]) + runs_count = 2**len(par_vars) + # print flops and memory from sliced simulation cost + flops_run = cost_sliced.flops + mem_run = cost_sliced.memory + print("M limit", M_limit) + print("Cost", cost) + print("Cost sliced", cost_sliced) + FLOP_perS = 1e12 + Throughput = 200e9/16 + print(f'Contraction cost (sliced): {np.log2(flops_run*runs_count*1.)} flops, {np.log2(mem_run*1.)} memory, {cost_sliced.width} width') + print(f'Contraction cost (old): {np.log2(sum(flops_lg)*1.)} flops, {np.log2(max(mems_lg))} memory') + mems_lg, flops_lg = tn.simulation_cost(peo) + print(f'Sliced contraction cost (old): {np.log2(sum(flops_lg)*1.0*runs_count)} flops, {np.log2(max(mems_lg)*1.0)} memory') + + print(f'-- Compressed Contraction time estimate: {cost.time(FLOP_perS, Throughput, Throughput, M_limit)} seconds') + print(f'-- Sliced contraction time estimate: {runs_count*cost_sliced.time(FLOP_perS, Throughput, Throughput, M_limit)} seconds') + print(f'Contraction time (old): {sum(flops_lg)/FLOP_perS} seconds') + + + print("Path list comp\n", [c.width for c in costs]) + print("Maxw", max(path)) + assert opt.treewidth == cost.width+1 + +if __name__ == '__main__': + test_compressed_contraction_cost() diff --git a/qtensor/compression/tests/test_memory_leak.py b/qtensor/compression/tests/test_memory_leak.py new file mode 100644 index 00000000..34f327a2 --- /dev/null +++ b/qtensor/compression/tests/test_memory_leak.py @@ -0,0 +1,111 @@ +""" +Run `watch -n 0.1 nvidia-smi` and then run this test +""" +from qtensor.compression import CUSZXCompressor +import cupy +import numpy as np + + +def _init_nvsmi(): + import nvidia_smi + + nvidia_smi.nvmlInit() + nvsmi_handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0) + return nvsmi_handle + + +def _get_nvsmi_mem(handle): + import nvidia_smi + + info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle) + mem = info.used + return mem + + +def test_leak_compress(): + dtype 
= cupy.complex64 + dtype_size = dtype(0).nbytes + MB_elems = int(1024**2 / dtype_size) + MB_target = 128 + N = MB_target * MB_elems + print(f"== Testing memory leak with {N} elements and {MB_target} MB array ==") + c = CUSZXCompressor(r2r_error=1e-2, r2r_threshold=1e-2) + import qtensor + + c = qtensor.compression.ProfileCompressor(c) + _nvsmi_handle = _init_nvsmi() + + a = cupy.zeros(N, dtype=dtype) + a[::1024] = 0.01 + a[::8] = cupy.random.rand(N // 8) + for i in range(1000): + a[32 * i + 1] = 0.005 * (i % 5 + 1) + print(f"Original, [0]={a[0]}, [1024]={a[1024]}") + + for j in range(100): + out = c.compress(a) + print(i, "Compression ratio", 4 * N / c.compress_size(out)) + b = c.decompress(out) + # a[:] = b + print(j, f"Decompressed, [0]={b[0]}, [1024]={b[1024]}") + c.free_decompressed() + c.free_compressed(out) + print( + f"== [{j}] Memory usage: {_get_nvsmi_mem(_nvsmi_handle) / 1024 ** 3} GB ==" + ) + + +def test_leak_contract(): + from qtensor.compression.CompressedTensor import Tensor + import qtensor + from qtree.optimizer import Var + from qtensor.compression.compressed_contraction import compressed_contract + + dtype = cupy.complex64 + dtype_size = dtype(0).nbytes + MB_elems = int(1024**2 / dtype_size) + MB_target = 64 # target for largest tensor + N = MB_target * MB_elems + W_target = int(np.log2(N)) + print(f"== Testing memory leak with {N} elements and {MB_target} MB array ==") + c = CUSZXCompressor(r2r_error=1e-2, r2r_threshold=1e-2) + c = qtensor.compression.ProfileCompressor(c) + _nvsmi_handle = _init_nvsmi() + + As, Bs = W_target - 4, W_target - 2 + common_num = int((As + Bs - W_target) / 2) + print(f"Common indices: {common_num}, W_target: {W_target}") + avars = [Var(i) for i in range(As)] + bvars = [Var(i) for i in range(common_num)] + [ + Var(i) for i in range(As, As + Bs - common_num) + ] + print("A vars", avars) + print("B vars", bvars) + TA = Tensor.empty("A", avars) + TA.data = np.random.rand(*TA.shape).astype(dtype) + TB = Tensor.empty("B", bvars) + TB.data = np.random.rand(*TB.shape).astype(dtype) + + _mem_histories = [] + for j in range(100): + res = compressed_contract( + TA, + TB, + avars[:common_num], + W_target - 1, + c, + einsum=cupy.einsum, + move_data=cupy.array, + ) + [c.free_compressed(x) for x in res.data] + print(f"Result indices: {res.indices}") + print(f"Result: {res}") + _mem = _get_nvsmi_mem(_nvsmi_handle) / 1024**3 + print(f"== [{j}] Memory usage: {_mem} GB ==") + _mem_histories.append(_mem) + print( + f"== [{j}] Memory history: {[np.round(x, 2) for x in _mem_histories]} GB ==" + ) + +if __name__ == "__main__": + test_leak_contract() diff --git a/qtensor/compression/torch_quant/torch_quant.py b/qtensor/compression/torch_quant/torch_quant.py new file mode 100644 index 00000000..bbea4657 --- /dev/null +++ b/qtensor/compression/torch_quant/torch_quant.py @@ -0,0 +1,174 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + + + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = 
time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + + scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + + q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.dequantize(q_tensor) + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold 
= r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/torch_quant/torch_quant_perchannel.py b/qtensor/compression/torch_quant/torch_quant_perchannel.py new file mode 100644 index 00000000..24cf703e --- /dev/null +++ b/qtensor/compression/torch_quant/torch_quant_perchannel.py @@ -0,0 +1,203 @@ +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + +BS = 32 + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + if isGrouped: + pad_rows = int(nbEle/BS) + if nbEle%BS != 0: + pad_rows +=1 + + padded = torch.zeros(pad_rows*BS, device='cuda') + padded[:nbEle] = tensor + tensor = padded + tensor = torch.reshape(tensor, (-1, BS)) + maxs = torch.flatten(torch.max(tensor, dim=1)[0]) + mins = torch.flatten(torch.min(tensor, dim=1)[0]) + + #scales = torch.ones(tensor.shape[0], device='cuda') + #scales = torch.mul(scales, d/255.0) + #print(d) + #print(torch.max(torch.sub(maxs,mins))) + scales = torch.abs(torch.sub(maxs,mins))/127.0 + zero_points = torch.zeros(tensor.shape[0], device='cuda') + #zero_points = torch.round(torch.div(torch.add(maxs,mins)/2,scales)) + #zero_points = 
torch.neg(torch.round(torch.div(mins,scales)))+64 + + #print(zero_points) + + #scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + #zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + #tensor = torch.flatten(tensor) + #tensor = torch.split(tensor, BS) + #print(maxs) + #print(mins) + #print(scales) + + q_tensor = torch.quantize_per_channel(tensor, scales, zero_points,0, dtype=torch.qint8) + #q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/2)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.flatten(torch.dequantize(q_tensor)) + + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i 
in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/contraction_algos/__init__.py b/qtensor/contraction_algos/__init__.py new file mode 100644 index 00000000..724cbef2 --- /dev/null +++ b/qtensor/contraction_algos/__init__.py @@ -0,0 +1,18 @@ +import qtree +from qtensor.contraction_backends import ContractionBackend + +from .merged_bucket_elimination import bucket_elimination as merged_bucket_elimination +from .transposed_bucket_elimination import bucket_elimination as transposed_bucket_elimination +from .transposed_bucket_elimination import is_reverse_order_backend + +def bucket_elimination(buckets, backend:ContractionBackend, + n_var_nosum=0): + """ + Algorithm to evaluate a contraction of a large number of tensors. + """ + if is_reverse_order_backend(backend): + return transposed_bucket_elimination(buckets, backend.process_bucket, n_var_nosum) + else: + return qtree.optimizer.bucket_elimination(buckets, backend.process_bucket, n_var_nosum) + + diff --git a/qtensor/merged_indices/bucket_elimination.py b/qtensor/contraction_algos/merged_bucket_elimination.py similarity index 56% rename from qtensor/merged_indices/bucket_elimination.py rename to qtensor/contraction_algos/merged_bucket_elimination.py index 933dbd65..8b92d54c 100644 --- a/qtensor/merged_indices/bucket_elimination.py +++ b/qtensor/contraction_algos/merged_bucket_elimination.py @@ -1,6 +1,19 @@ import itertools +import numpy as np +from qtree.optimizer import Tensor, Var -def bucket_elimination(buckets, ibunch, process_bucket_fn, +def is_reverse_order_backend(backend): + """ + Duck-test if the tensors are with reverse index order + using slice_buckets method + """ + a, b = Var(1), Var(2) + test_b = [[Tensor('T', [a, b], data_key='k')]] + data_dict={'k': np.random.rand(2, 2)} + sliced = backend.get_sliced_buckets(test_b, data_dict, {a: slice(None), b: slice(None)}) + return sliced[0][0].indices[0] == b + +def bucket_elimination(buckets, process_bucket_fn, n_var_nosum=0): """ Algorithm to evaluate a contraction of a large number of tensors. @@ -12,9 +25,8 @@ def bucket_elimination(buckets, ibunch, process_bucket_fn, Parameters ---------- buckets : list of lists - ibunch : list of lists of indices to contract. process_bucket_fn : function - function that will process buckets, takes list of indices to contract + buckets + function that will process this kind of buckets n_var_nosum : int, optional number of variables that have to be left in the result. 
Expected at the end of bucket list @@ -22,35 +34,37 @@ def bucket_elimination(buckets, ibunch, process_bucket_fn, ------- result : numpy.array """ + # import pdb + # pdb.set_trace() n_var_contract = len(buckets) - n_var_nosum - assert len(ibunch) == len(buckets), "Buckets length should be same as ibunch length" result = None - for ixs, bucket in zip(ibunch, buckets[:n_var_contract]): + for n in range(n_var_contract): + bucket = buckets[n] if len(bucket) > 0: - tensor = process_bucket_fn(ixs, bucket) + tensor = process_bucket_fn(bucket) + #-- Memory management + buckets[n] = [] + #-- + if len(tensor.indices) > 0: # tensor is not scalar. # Move it to appropriate bucket - smallest_ix = min([int(x) for x in tensor.indices]) - appended = False - for j, ixs in enumerate(ibunch): - if smallest_ix in map(int, ixs): - buckets[j].append(tensor) - appended = True - if not appended: - raise Exception('Algorithmic error, investigate.') + first_index = int(tensor.indices[-1]) + buckets[first_index].append(tensor) else: # tensor is scalar if result is not None: result *= tensor else: result = tensor + # free up space, the tensors are no longer needed + buckets[n] = [] # form a single list of the rest if any rest = list(itertools.chain.from_iterable(buckets[n_var_contract:])) if len(rest) > 0: # only multiply tensors - tensor = process_bucket_fn([], rest, no_sum=True) + tensor = process_bucket_fn(rest, no_sum=True) if result is not None: result *= tensor else: diff --git a/qtensor/contraction_algos/transposed_bucket_elimination.py b/qtensor/contraction_algos/transposed_bucket_elimination.py new file mode 100644 index 00000000..8b92d54c --- /dev/null +++ b/qtensor/contraction_algos/transposed_bucket_elimination.py @@ -0,0 +1,73 @@ +import itertools +import numpy as np +from qtree.optimizer import Tensor, Var + +def is_reverse_order_backend(backend): + """ + Duck-test if the tensors are with reverse index order + using slice_buckets method + """ + a, b = Var(1), Var(2) + test_b = [[Tensor('T', [a, b], data_key='k')]] + data_dict={'k': np.random.rand(2, 2)} + sliced = backend.get_sliced_buckets(test_b, data_dict, {a: slice(None), b: slice(None)}) + return sliced[0][0].indices[0] == b + +def bucket_elimination(buckets, process_bucket_fn, + n_var_nosum=0): + """ + Algorithm to evaluate a contraction of a large number of tensors. + The variables to contract over are assigned ``buckets`` which + hold tensors having respective variables. The algorithm + proceeds through contracting one variable at a time, thus we eliminate + buckets one by one. + + Parameters + ---------- + buckets : list of lists + process_bucket_fn : function + function that will process this kind of buckets + n_var_nosum : int, optional + number of variables that have to be left in the + result. Expected at the end of bucket list + Returns + ------- + result : numpy.array + """ + # import pdb + # pdb.set_trace() + n_var_contract = len(buckets) - n_var_nosum + + result = None + for n in range(n_var_contract): + bucket = buckets[n] + if len(bucket) > 0: + tensor = process_bucket_fn(bucket) + #-- Memory management + buckets[n] = [] + #-- + + if len(tensor.indices) > 0: + # tensor is not scalar. 
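+                # (indices are kept in reverse, i.e. descending, order here, so
+                #  indices[-1] is the smallest remaining variable and selects the
+                #  destination bucket)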
+ # Move it to appropriate bucket + first_index = int(tensor.indices[-1]) + buckets[first_index].append(tensor) + else: # tensor is scalar + if result is not None: + result *= tensor + else: + result = tensor + # free up space, the tensors are no longer needed + buckets[n] = [] + + # form a single list of the rest if any + rest = list(itertools.chain.from_iterable(buckets[n_var_contract:])) + if len(rest) > 0: + # only multiply tensors + tensor = process_bucket_fn(rest, no_sum=True) + if result is not None: + result *= tensor + else: + result = tensor + return result + diff --git a/qtensor/contraction_backends/__init__.py b/qtensor/contraction_backends/__init__.py index 6d57b88c..adaa68d1 100644 --- a/qtensor/contraction_backends/__init__.py +++ b/qtensor/contraction_backends/__init__.py @@ -1,5 +1,6 @@ #from torch._C import device from .base_class import ContractionBackend +from .common import slice_numpy_tensor from .numpy import NumpyBackend from .torch import TorchBackend from .cupy import CuPyBackend @@ -10,11 +11,14 @@ from .opt_einsum import OptEinusmBackend from .transpose_backend import NumpyTranspoedBackend, TorchTransposedBackend, CupyTransposedBackend, CutensorTransposedBackend from .performance_measurement_decorator import PerfNumpyBackend, PerfBackend, GPUPerfBackend +from .compression import CompressionBackend +from qtensor.compression import NumpyCompressor def get_backend(name): backend_dict = { 'mkl': CMKLExtendedBackend, 'einsum':NumpyBackend, + 'numpy':NumpyBackend, 'opt_einsum': OptEinusmBackend, 'torch_cpu': TorchBackend, 'torch_gpu': TorchBackend, @@ -26,7 +30,14 @@ def get_backend(name): 'tr_cupy': CupyTransposedBackend, 'tr_cutensor': CutensorTransposedBackend } - if name in ["torch_gpu", "tr_torch"]: + # -- add compression backend + compression_suffix = '_compressed' + ix = name.find(compression_suffix) + if ix != -1: + backend = get_backend(name[:ix]) + return CompressionBackend(backend, NumpyCompressor(), 30) + + if name in ["torch_gpu", "torch_cpu"]: return backend_dict['torch'](device = name[-3:]) else: return backend_dict[name]() diff --git a/qtensor/contraction_backends/common.py b/qtensor/contraction_backends/common.py new file mode 100644 index 00000000..e3c0fbc5 --- /dev/null +++ b/qtensor/contraction_backends/common.py @@ -0,0 +1,84 @@ +import numpy as np +import qtree +from qtree.optimizer import Tensor + +def permute_np_tensor_data(data:np.ndarray, indices_in, indices_out): + """ + Permute the data of a numpy tensor to the given indices_out. 
+ + Returns: + permuted data + """ + # permute indices + out_locs = {idx: i for i, idx in enumerate(indices_out)} + perm = [out_locs[i] for i in indices_in] + # permute tensor + return np.transpose(data, perm) + +def get_slice_bounds(slice_dict, indices): + """Slice a numpy tensor data + + + Returns: + tuple of slice bounds + """ + slice_bounds = tuple([ + slice_dict.get(i, slice(None)) for i in indices + ]) + return slice_bounds + +def slice_numpy_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): + """ + Args: + data : np.ndarray + indices_in: list of `qtree.optimizer.Var` + indices_out: list of `qtree.optimizer.Var` + slice_dict: dict of `qtree.optimizer.Var` to `slice` + + Returns: + new data, new indices + """ + slice_bounds = get_slice_bounds(slice_dict, indices_in) + s_data = data[slice_bounds] + indices_sliced = [ + i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) + ] + indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] + #print("indices_sized", indices_sized) + #print("Slice bounds", slice_bounds) + #print("Slice dict", slice_dict) + #print("data shape, sliced data shape", data.shape, s_data.shape) + indices_out = [v for v in indices_out if not isinstance(slice_dict.get(v, None), int)] + assert len(indices_sliced) == len(s_data.shape) + st_data = permute_np_tensor_data(s_data, indices_sliced, indices_out) + return st_data, indices_out + +def get_einsum_expr(idx1, idx2, contract=0): + """ + Takes two tuples of indices and returns an einsum expression + to evaluate the sum over repeating indices + + Parameters + ---------- + idx1 : list-like + indices of the first argument + idx2 : list-like + indices of the second argument + + Returns + ------- + expr : str + Einsum command to sum over indices repeating in idx1 + and idx2. + """ + result_indices = sorted(list(set(idx1 + idx2)), reverse=True) + # remap indices to reduce their order, as einsum does not like + # large numbers + idx_to_least_idx = {old_idx: new_idx for new_idx, old_idx + in enumerate(result_indices)} + result_indices = result_indices[:len(result_indices)-contract] + + str1 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx1) + str2 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in idx2) + str3 = ''.join(qtree.utils.num_to_alpha(idx_to_least_idx[ii]) for ii in result_indices) + return str1 + ',' + str2 + '->' + str3 diff --git a/qtensor/contraction_backends/compression.py b/qtensor/contraction_backends/compression.py new file mode 100644 index 00000000..03059371 --- /dev/null +++ b/qtensor/contraction_backends/compression.py @@ -0,0 +1,101 @@ +from qtensor.contraction_backends import ContractionBackend +from qtensor.compression import Compressor, CompressedTensor, Tensor +from qtensor.compression.compressed_contraction import compressed_contract, compressed_sum +from qtensor.contraction_backends.common import slice_numpy_tensor +from qtree.optimizer import Tensor + +class CompressionBackend(ContractionBackend): + """ + Compression bucket contraction backend. + + This backend "decorates" another backend, by using compression in + pairwise contraction. If the result tensor has more than `max_tw` indices, + it is sliced and the contraction result is compressed before proceeding to + next slice. 
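+
+    A minimal usage sketch, mirroring test_compressed_energy_expectation.py
+    (the compressor choice and max_tw value are only examples; `circuit` is any
+    qtree circuit, e.g. built with QtreeQAOAComposer):
+
+        base = qtensor.contraction_backends.get_backend('cupy')
+        compressor = CUSZPCompressor(r2r_error=1e-4, r2r_threshold=1e-4)
+        backend = CompressionBackend(base, compressor, max_tw=6)
+        sim = qtensor.QtreeSimulator(backend=backend)
+        res = sim.simulate(circuit)
+
+    get_backend('<name>_compressed') builds the same wrapper around the named
+    backend with a NumpyCompressor and max_tw=30.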
+ """ + def __init__(self, backend, compressor:Compressor, max_tw:int): + """ + Arguments: + backend: the backend to use for contraction + compressor: the compressor to use for compression + max_tw: threshold for triggering compression. + + """ + self.backend = backend + self.compressor = compressor + self.max_tw = max_tw + + def _get_backend_specific_fns(self, backend): + ## Hacky way to extend backends + if 'cupy' in backend.__class__.__name__.lower(): + import cupy as cp + return cp.einsum, cp.array + elif 'torch' in backend.__class__.__name__.lower(): + import torch + return torch.einsum, torch.tensor + else: + import numpy as np + return np.einsum, lambda x: x + + def process_bucket(self, bucket, no_sum=False): + """ + Process a bucket. + + This uses `self.backend.process_bucket` in combination with + compression.compressed_contraction.compressed_contract + """ + ctr_kw = dict(zip(['einsum', 'move_data'], self._get_backend_specific_fns(self.backend))) + bucket.sort(key=lambda x: len(x.indices)) + #print("Processing bucket", bucket) + accum = bucket[0] + for t in bucket[1:-1]: + accum = compressed_contract( + accum, t, [], self.max_tw, self.compressor, + **ctr_kw + ) + if len(bucket)>1: + t = bucket[-1] + total_ixs = sorted( + set().union(*[t.indices, accum.indices]) + , key=int, reverse=True + ) + accum_new = compressed_contract( + accum, t, [total_ixs[-1]], self.max_tw, self.compressor + ,**ctr_kw + ) + # free data + import cupy + for t in [accum, t]: + if isinstance(t, CompressedTensor): + t.compressor.free_decompressed() + + accum = accum_new + + return accum + else: + if len(accum.indices) < 1: + return accum + indices = (accum.indices[-1], ) + res = compressed_sum(accum, indices, self.compressor, self.max_tw, **ctr_kw) + if isinstance(accum, CompressedTensor): + accum.compressor.free_decompressed() + return res + + def get_sliced_buckets(self, buckets, data_dict, slice_dict): + """ + Slice buckets accounding to `slice_dict` + + This delegates to `self.backend`, assuming that buckets don't have + tensors with more than `self.max_tw` indices. + """ + # Note: to support large tensors (more than `max_tw`), + # just iterate through sliced bucket tensors and compress if needed + return self.backend.get_sliced_buckets(buckets, data_dict, slice_dict) + + def get_result_data(self, result): + """ + Get result data from `result` tensor. + + This assumes that the result has at most `self.max_tw` indices. 
+ """ + return self.backend.get_result_data(result) diff --git a/qtensor/contraction_backends/cupy.py b/qtensor/contraction_backends/cupy.py index b5a897d9..ee4e4b6c 100644 --- a/qtensor/contraction_backends/cupy.py +++ b/qtensor/contraction_backends/cupy.py @@ -1,7 +1,8 @@ import qtree from qtensor.tools.lazy_import import cupy as cp from qtensor.contraction_backends import ContractionBackend -from qtensor.contraction_backends.numpy import get_einsum_expr +#from qtensor.contraction_backends.numpy import get_einsum_expr +from .common import slice_numpy_tensor, get_einsum_expr class CuPyBackend(ContractionBackend): @@ -9,11 +10,12 @@ class CuPyBackend(ContractionBackend): # Replace all torch methods with cupy's analog def process_bucket(self, bucket, no_sum=False): + bucket.sort(key = lambda x: len(x.indices)) result_indices = bucket[0].indices result_data = bucket[0].data for tensor in bucket[1:]: - expr = qtree.utils.get_einsum_expr( + expr = get_einsum_expr( list(map(int, result_indices)), list(map(int, tensor.indices)) ) @@ -25,15 +27,16 @@ def process_bucket(self, bucket, no_sum=False): # Merge and sort indices and shapes result_indices = tuple(sorted( set(result_indices + tensor.indices), - key=int) + key=int, reverse=True) ) if len(result_indices) > 0: if not no_sum: # trim first index - first_index, *result_indices = result_indices + contract_index = result_indices[-1] + result_indices = result_indices[:-1] else: - first_index, *_ = result_indices - tag = first_index.identity + contract_index = result_indices[-1] + tag = contract_index.identity else: tag = 'f' result_indices = [] @@ -44,7 +47,7 @@ def process_bucket(self, bucket, no_sum=False): data=result_data) else: result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=cp.sum(result_data, axis=0)) + data=cp.sum(result_data, axis=-1)) return result def process_bucket_merged(self, ixs, bucket, no_sum=False): @@ -95,45 +98,20 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): # transpose_order = np.argsort(list(map(int, tensor.indices))) # cp.argsort requires input to be cp array #print(tensor.indices) - transpose_order = cp.argsort(cp.asarray(list(map(int, tensor.indices)))).tolist() - - ''' - Change 2: - Original: Data is all converted into torch.tensor and use torch api, the results are in torch - New: Convert all data to CuPy.ndarray, will raise exceptional signal - ''' + out_indices = list(sorted(tensor.indices, key=int, reverse=True)) data = data_dict[tensor.data_key] + data, new_indices = slice_numpy_tensor(data, tensor.indices, out_indices, slice_dict) + # transpose indices try: - data = cp.asarray(data) - data = data.transpose(tuple(transpose_order)) + data = cp.asarray(data, dtype=cp.complex64) except: print("CuPy Backend doesn't support gradient.") - - # transpose indices - indices_sorted = [tensor.indices[pp] - for pp in transpose_order] - - # slice data - slice_bounds = [] - for idx in indices_sorted: - try: - slice_bounds.append(slice_dict[idx]) - except KeyError: - slice_bounds.append(slice(None)) - - data = data[tuple(slice_bounds)] - - # update indices - indices_sliced = [idx.copy(size=size) for idx, size in - zip(indices_sorted, data.shape)] - indices_sliced = [i for sl, i in zip(slice_bounds, indices_sliced) if not isinstance(sl, int)] - assert len(data.shape) == len(indices_sliced) sliced_bucket.append( - tensor.copy(indices=indices_sliced, data=data)) + tensor.copy(indices=new_indices, data=data)) sliced_buckets.append(sliced_bucket) return sliced_buckets def get_result_data(self, 
result): - return result.data + return cp.transpose(result.data) diff --git a/qtensor/contraction_backends/performance_measurement_decorator.py b/qtensor/contraction_backends/performance_measurement_decorator.py index eb676cb9..c2ae5bc3 100644 --- a/qtensor/contraction_backends/performance_measurement_decorator.py +++ b/qtensor/contraction_backends/performance_measurement_decorator.py @@ -1,10 +1,133 @@ import numpy as np from dataclasses import dataclass from qtensor.contraction_backends import ContractionBackend, NumpyBackend +from qtensor.contraction_backends.compression import CompressionBackend, CompressedTensor from pyrofiler import timing from qtensor.tools.lazy_import import torch, pandas import string +# -- memory profiling +from weakref import WeakValueDictionary + +class MemProfBackend(ContractionBackend): + def __init__(self, backend=NumpyBackend(), print=True): + self.backend = backend + self.object_store = WeakValueDictionary() + self.object_keys = [] + self.print = print + self.mem_history = [] + + import nvidia_smi + nvidia_smi.nvmlInit() + self.nvsmi_handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0) + + def _print(self, *args, **kwargs): + if self.print: + print(*args, **kwargs) + + def _get_nvsmi_mem(self): + import nvidia_smi + info = nvidia_smi.nvmlDeviceGetMemoryInfo(self.nvsmi_handle) + mem = info.used + return mem + + @property + def max_mem(self): + mems = [m['mem'] for m in self.mem_history] + return max(mems) + + @property + def nvsmi_max_mem(self): + mems = [m['nvmem'] for m in self.mem_history] + return max(mems) + @property + def cupy_buffer_max_mem(self): + mems = [m['cupy_bufsize'] for m in self.mem_history] + return max(mems) + @property + def object_max_mem(self): + mems = [m['objmem'] for m in self.mem_history] + return max(mems) + + def check_store(self): + import cupy + mempool = cupy.get_default_memory_pool() + total_mem = 0 + deleted_keys = [] + for key in self.object_keys: + tensor = self.object_store.get(key, None) + if tensor is None: + #self._print("Tensor", key, "was deleted") + deleted_keys.append(key) + continue + else: + size = self.tensor_size(tensor) + if isinstance(tensor, CompressedTensor): + print("Tensor", tensor, "size", size) + total_mem += size + for key in deleted_keys: + self.object_keys.remove(key) + + if total_mem>1024**2: + self._print("Total memory usage", total_mem/1024/1024, "MB") + mempool.free_all_blocks() + cupy_mem = mempool.used_bytes() + # get maximum memory usage + gpu_mem = cupy_mem + if isinstance(self.backend, CompressionBackend): + gpu_mem += 8*2**self.backend.max_tw + self.mem_history.append(dict( + mem=total_mem, + cupy_bufsize=mempool.total_bytes(), + nvmem = self._get_nvsmi_mem(), + cupybuf=mempool.total_bytes(), + objmem=total_mem, + tensors_sizes=[len(tensor.indices) for tensor in self.object_store.values()] + )) + # -- + #print('MH', self.mem_history[-1]) + if cupy_mem>1024**2: + self._print("CuPy memory usage", cupy_mem/1024/1024, "MB. 
Total MB:", mempool.total_bytes()/1024**2) + + def tensor_size(self, tensor)->int: + from qtensor.compression import Tensor, CompressedTensor + if tensor.data is None: + return 0 + if isinstance(tensor, CompressedTensor): + chunks = tensor._data + sizes = [tensor.compressor.compress_size(x) for x in chunks] + return sum(sizes) + elif isinstance(tensor, Tensor): + return tensor.data.nbytes + else: + raise ValueError("Unknown tensor type") + + def add_tensor(self, tensor): + label = str(tensor) + self.object_store[label] = tensor + self.object_keys.append(label) + tsize = self.tensor_size(tensor) + if tsize>1024: + self._print("Added tensor with data size", tsize/1024, "KB") + self.check_store() + + def process_bucket(self, bucket, no_sum=False): + res = self.backend.process_bucket(bucket, no_sum=no_sum) + self.add_tensor(res) + return res + + def get_sliced_buckets(self, buckets, data_dict, slice_dict): + buckets = self.backend.get_sliced_buckets(buckets, data_dict, slice_dict) + for bucket in buckets: + for tensor in bucket: + self.add_tensor(tensor) + return buckets + + def get_result_data(self, result): + return self.backend.get_result_data(result) + +# -- + @dataclass class BucketContnractionStats: """ @@ -39,7 +162,7 @@ def from_bucket_time(cls, bucket: list, time: float): def indices_info(self): """ String representation of bucket data""" info = "" - all_indices = sorted(sum(map(list, self.indices), []), key=int) + all_indices = sorted(list(set(sum(map(list, self.indices), []))), key=int) ix_to_char = {i:string.ascii_letters[j] for j, i in enumerate(all_indices)} for ix, strides in zip(self.indices, self.strides): tensor_info = "" diff --git a/qtensor/contraction_backends/tests/test_common.py b/qtensor/contraction_backends/tests/test_common.py new file mode 100644 index 00000000..c5905932 --- /dev/null +++ b/qtensor/contraction_backends/tests/test_common.py @@ -0,0 +1,48 @@ +from qtensor.contraction_backends.common import slice_numpy_tensor +import numpy as np +from qtree.optimizer import Var + +def test_slice_numpy_tensor(): + shape = (2, 3, 4, 5) + indices_in = [Var(i, size=s) for i, s in enumerate(shape)] + data = np.random.rand(*shape) + data_ref = data.copy() + slice_dict = { + indices_in[0]: slice(None), + indices_in[1]: slice(1, 3), + indices_in[2]: 1, + indices_in[3]: slice(3, 4), + } + indices_out = [indices_in[3], indices_in[1], indices_in[0]] + new_data, new_indices = slice_numpy_tensor( + data, indices_in, indices_out, slice_dict + ) + assert new_data.shape == (1, 2, 2) + assert new_indices == indices_out + assert np.allclose(data, data_ref) + assert not np.allclose(new_data , data_ref[:, 1:3, 1, 3:4]) + assert np.allclose(new_data , data_ref[:, 1:3, 1, 3:4].transpose(2, 1, 0)) + assert np.allclose(new_data , data_ref.transpose()[3:4, 1, 1:3, :]) + +def test_slice_torch_tensor(): + import torch + shape = (2, 3, 4, 5) + indices_in = [Var(i, size=s) for i, s in enumerate(shape)] + data = torch.randn(*shape) + data_ref = data.clone() + slice_dict = { + indices_in[0]: slice(None), + indices_in[1]: slice(1, 3), + indices_in[2]: 1, + indices_in[3]: slice(3, 4), + } + indices_out = [indices_in[3], indices_in[1], indices_in[0]] + new_data, new_indices = slice_numpy_tensor( + data, indices_in, indices_out, slice_dict + ) + assert isinstance(new_data, torch.Tensor) + assert new_data.shape == (1, 2, 2) + assert new_indices == indices_out + assert np.allclose(data, data_ref) + assert not np.allclose(new_data , data_ref[:, 1:3, 1, 3:4]) + assert np.allclose(new_data , data_ref[:, 1:3, 
1, 3:4].permute(2, 1, 0)) diff --git a/qtensor/contraction_backends/tests/test_cupy.py b/qtensor/contraction_backends/tests/test_cupy.py index 517fc074..a559ae94 100644 --- a/qtensor/contraction_backends/tests/test_cupy.py +++ b/qtensor/contraction_backends/tests/test_cupy.py @@ -64,7 +64,7 @@ def contract_tn(backend, search_len=1, test_problem_kwargs={}): print('selected_bucket', selected_bucket) result = backend.process_bucket(selected_bucket) - return result.data + return backend.get_result_data(result) restr = contract_tn(btr, 1) resnp = contract_tn(bnp, 1) diff --git a/qtensor/contraction_backends/tests/test_torch.py b/qtensor/contraction_backends/tests/test_torch.py index df47d9d0..32a4757d 100644 --- a/qtensor/contraction_backends/tests/test_torch.py +++ b/qtensor/contraction_backends/tests/test_torch.py @@ -3,19 +3,9 @@ import numpy as np from qtensor.contraction_backends import TorchBackend, NumpyBackend from qtensor import QtreeSimulator +from qtensor.tests import get_test_qaoa_ansatz_circ torch = pytest.importorskip('torch') -def get_test_qaoa_circ(n=10, p=2, d=3, type='random'): - G = qtensor.toolbox.random_graph(seed=10, degree=d, nodes=n, type=type) - print('Test problem: n, p, d', n, p, d) - gamma, beta = [np.pi/5]*p, [np.pi/2]*p - - composer = qtensor.DefaultQAOAComposer( - graph=G, gamma=gamma, beta=beta) - composer.ansatz_state() - return composer.circuit - - def get_test_qaoa_tn(n=10, p=2, d=3, type='random'): G = qtensor.toolbox.random_graph(seed=10, degree=d, nodes=n, type=type) print('Test problem: n, p, d', n, p, d) @@ -29,7 +19,7 @@ def get_test_qaoa_tn(n=10, p=2, d=3, type='random'): def test_simulation(): - circ = get_test_qaoa_circ(p=3) + circ = get_test_qaoa_ansatz_circ(p=3) btr = TorchBackend() bnp = NumpyBackend() simtr = QtreeSimulator(backend=btr) @@ -56,13 +46,12 @@ def contract_tn(backend, search_len=1, test_problem_kwargs={}): print('selected_bucket', selected_bucket) result = backend.process_bucket(selected_bucket) - return result.data + return backend.get_result_data(result) # First test only simple buckets restr = contract_tn(btr, 1) resnp = contract_tn(bnp, 1) assert type(restr) is torch.Tensor - assert restr.dtype is torch.cfloat assert np.allclose(restr, resnp) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index b3614fa3..3df1bf16 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -3,7 +3,10 @@ import numpy as np from qtree import np_framework from qtensor.contraction_backends import ContractionBackend -from qtensor.contraction_backends.numpy import get_einsum_expr +from .common import get_slice_bounds, get_einsum_expr, slice_numpy_tensor +import string +CHARS = string.ascii_lowercase + string.ascii_uppercase + def qtree2torch_tensor(tensor, data_dict): """ Converts qtree tensor to pytorch tensor using data dict""" if isinstance(tensor.data, torch.Tensor): @@ -16,6 +19,57 @@ def qtree2torch_tensor(tensor, data_dict): data_dict[tensor.data_key] = torch_t return tensor.copy(data=torch_t) +def get_einsum_expr_bucket(bucket, all_indices_list, result_indices): + # converting elements to int will make stuff faster, + # but will drop support for char indices + # all_indices_list = [int(x) for x in all_indices] + # to_small_int = lambda x: all_indices_list.index(int(x)) + to_small_int = lambda x: all_indices_list.index(x) + expr = ','.join( + ''.join(CHARS[to_small_int(i)] for i in t.indices) + for t in bucket) +\ + '->'+''.join(CHARS[to_small_int(i)] for i in 
result_indices) + return expr + + + + +def permute_torch_tensor_data(data: torch.Tensor, indices_in, indices_out): + """ + Permute the data of a torch tensor to the given indices_out. + + Returns: + permuted data + """ + # permute indices + out_locs = {idx: i for i, idx in enumerate(indices_out)} + perm = [out_locs[i] for i in indices_in] + # permute tensor + return torch.permute(data, perm) + +def slice_torch_tensor(data: torch.Tensor, indices_in, indices_out, slice_dict): + """ + Args: + data : torch.Tensor + indices_in: list of `qtree.optimizer.Var` + indices_out: list of `qtree.optimizer.Var` + slice_dict: dict of `qtree.optimizer.Var` to `slice` + + Returns: + new data, new indices + """ + slice_bounds = get_slice_bounds(slice_dict, indices_in) + s_data = data[slice_bounds] + indices_sliced = [ + i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) + ] + indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] + indices_out = [v for v in indices_out if not isinstance(slice_dict.get(v, None), int)] + assert len(indices_sized) == len(s_data.shape) + assert len(indices_sliced) == len(s_data.shape) + st_data = permute_torch_tensor_data(s_data, indices_sliced, indices_out) + return st_data, indices_out + class TorchBackend(ContractionBackend): def __init__(self, device='cpu'): @@ -23,32 +77,26 @@ def __init__(self, device='cpu'): self.dtype = ['float', 'double', 'complex64', 'complex128'] self.width_dict = [set() for i in range(30)] self.width_bc = [[0,0] for i in range(30)] #(#distinct_bc, #bc) - self.exprs = {} - def process_bucket(self, bucket, no_sum=False): + bucket.sort(key = lambda x: len(x.indices)) result_indices = bucket[0].indices result_data = bucket[0].data width = len(set(bucket[0].indices)) - #print("w:",width) - for tensor in bucket[1:]: + for tensor in bucket[1:-1]: - expr = qtree.utils.get_einsum_expr( + expr = get_einsum_expr( list(map(int, result_indices)), list(map(int, tensor.indices)) ) - if expr not in self.exprs.keys(): - self.exprs[expr] = 1 - else: - self.exprs[expr] += 1 - result_data = torch.einsum(expr, result_data, tensor.data) # Merge and sort indices and shapes result_indices = tuple(sorted( set(result_indices + tensor.indices), - key=int) + key=int, reverse=True + ) ) size = len(set(tensor.indices)) @@ -59,26 +107,33 @@ def process_bucket(self, bucket, no_sum=False): self.width_bc[width][0] = len(self.width_dict[width]) self.width_bc[width][1] += 1 + if len(bucket)>1: + tensor = bucket[-1] + expr = get_einsum_expr( + list(map(int, result_indices)), list(map(int, tensor.indices)) + , contract = 1 + ) + result_data = torch.einsum(expr, result_data, tensor.data) + result_indices = tuple(sorted( + set(result_indices + tensor.indices), + key=int, reverse=True + )) + else: + result_data = result_data.sum(axis=-1) + + + if len(result_indices) > 0: - if not no_sum: # trim first index - first_index, *result_indices = result_indices - else: - first_index, *_ = result_indices + first_index = result_indices[-1] + result_indices = result_indices[:-1] tag = first_index.identity else: tag = 'f' result_indices = [] # reduce - if no_sum: - result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=result_data) - else: - result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=torch.sum(result_data, axis=0)) - - #print("summary:",sorted(self.exprs.items(), key=lambda x: x[1], reverse=True)) - #print("stats:",self.width_bc) + result = qtree.optimizer.Tensor(f'E{tag}', result_indices, + data=result_data) return result def 
process_bucket_merged(self, ixs, bucket, no_sum=False): @@ -103,13 +158,7 @@ def process_bucket_merged(self, ixs, bucket, no_sum=False): for i in range(len(tensors)): tensors[i] = tensors[i].type(torch.complex128) - expr = get_einsum_expr(bucket, all_indices_list, result_indices) - # print("expr:", expr) - if expr not in self.exprs.keys(): - self.exprs[expr] = 1 - else: - self.exprs[expr] += 1 - + expr = get_einsum_expr_bucket(bucket, all_indices_list, result_indices) expect = len(result_indices) result_data = torch.einsum(expr, *tensors) @@ -122,8 +171,6 @@ def process_bucket_merged(self, ixs, bucket, no_sum=False): result = qtree.optimizer.Tensor(f'E{tag}', result_indices, data=result_data) - # print("summary:",sorted(self.exprs.items(), key=lambda x: x[1], reverse=True)) - # print("# distinct buckets:", len(self.exprs)) return result def get_sliced_buckets(self, buckets, data_dict, slice_dict): @@ -133,41 +180,25 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): for tensor in bucket: # get data # sort tensor dimensions - transpose_order = np.argsort(list(map(int, tensor.indices))) + out_indices = list(sorted(tensor.indices, key=int, reverse=True)) data = data_dict[tensor.data_key] + # Works for torch tensors just fine if not isinstance(data, torch.Tensor): if self.device == 'gpu' and torch.cuda.is_available(): cuda = torch.device('cuda') - data = torch.from_numpy(data).to(cuda) + data = torch.from_numpy(data.astype(np.complex128)).to(cuda) else: - data = torch.from_numpy(data) - - data = data.permute(tuple(transpose_order)) - # transpose indices - indices_sorted = [tensor.indices[pp] - for pp in transpose_order] - + data = torch.from_numpy(data.astype(np.complex128)) + else: + data = data.type(torch.complex128) # slice data - slice_bounds = [] - for idx in indices_sorted: - try: - slice_bounds.append(slice_dict[idx]) - except KeyError: - slice_bounds.append(slice(None)) - - data = data[tuple(slice_bounds)] - - # update indices - indices_sliced = [idx.copy(size=size) for idx, size in - zip(indices_sorted, data.shape)] - indices_sliced = [i for sl, i in zip(slice_bounds, indices_sliced) if not isinstance(sl, int)] - assert len(data.shape) == len(indices_sliced) + data, new_indices = slice_torch_tensor(data, tensor.indices, out_indices, slice_dict) sliced_bucket.append( - tensor.copy(indices=indices_sliced, data=data)) + tensor.copy(indices=new_indices, data=data)) sliced_buckets.append(sliced_bucket) return sliced_buckets def get_result_data(self, result): - return result.data + return torch.permute(result.data, tuple(reversed(range(result.data.ndim)))) diff --git a/qtensor/merged_indices/__init__.py b/qtensor/merged_indices/__init__.py deleted file mode 100644 index 5d2dadbf..00000000 --- a/qtensor/merged_indices/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .bucket_elimination import bucket_elimination diff --git a/qtensor/optimisation/Optimizer.py b/qtensor/optimisation/Optimizer.py index e53f68d8..dfa38957 100644 --- a/qtensor/optimisation/Optimizer.py +++ b/qtensor/optimisation/Optimizer.py @@ -15,7 +15,7 @@ class Optimizer: - def _get_ordering_ints(self, graph, inplace=True): + def get_ordering_ints(self, graph, inplace=True): raise NotImplementedError def _get_ordering(self, graph: nx.Graph, inplace=True): @@ -27,7 +27,7 @@ def _get_ordering(self, graph: nx.Graph, inplace=True): """ node_names = nx.get_node_attributes(graph, 'name') node_sizes = nx.get_node_attributes(graph, 'size') - peo, path = self._get_ordering_ints(graph, inplace=inplace) + peo, path = 
self.get_ordering_ints(graph, inplace=inplace) # compatibility with slicing self.peo_ints = [int(x) for x in peo] @@ -69,7 +69,7 @@ def optimize(self, tensor_net): class WithoutOptimizer(Optimizer): - def _get_ordering_ints(self, graph, inplace=True): + def get_ordering_ints(self, graph, inplace=True): peo = sorted([int(v) for v in graph.nodes()]) # magic line peo = list(reversed(peo)) @@ -77,7 +77,7 @@ def _get_ordering_ints(self, graph, inplace=True): return peo, path class GreedyOptimizer(Optimizer): - def _get_ordering_ints(self, graph, free_vars=[]): + def get_ordering_ints(self, graph, free_vars=[]): #mapping = {a:b for a,b in zip(graph.nodes(), reversed(list(graph.nodes())))} #graph = nx.relabel_nodes(graph, mapping) peo_ints, path = utils.get_neighbors_peo(graph) @@ -165,10 +165,12 @@ class SlicesOptimizer(Optimizer): def __init__(self, tw_bias=2, max_tw=None, max_slice=None , base_ordering='greedy' + , peo_after_slice_strategy='run-again' , **kwargs): self.tw_bias = tw_bias self.max_tw = max_tw self.max_slice = max_slice + self.peo_after_slice_strategy = peo_after_slice_strategy if isinstance(base_ordering, str): self.base_ordering = qtensor.toolbox.get_ordering_algo(base_ordering) else: @@ -187,15 +189,53 @@ def _get_max_tw(self): # tw = log(cost/16) = log(cost) - 4 return int(np.log2(avail)) - 4 + def _update_peo_after_slice(self, p_graph, slice_vars): + if self.peo_after_slice_strategy == 'run-again': + peo_ints, path = self.base_ordering.get_ordering_ints(p_graph) + elif self.peo_after_slice_strategy == 'TD-reuse': + # Remove sliced vars from TD graph. Then, reconstruct peo from this TD + peo_old = self.peo_ints + peo_ints = [i for i in peo_old if i not in slice_vars] + nodes, path = qtensor.utils.get_neighbors_path(p_graph, peo_ints) + # -- Tree re-peo + g_components = list(nx.connected_components(p_graph)) + print(f"# of components: {len(g_components)}, # of nodes total: {p_graph.number_of_nodes()}, # of nodes per component: {[len(c) for c in g_components]}") + from qtree.graph_model.clique_trees import ( + get_tree_from_peo, get_peo_from_tree) + tree = get_tree_from_peo(p_graph, peo_ints) + clique_vertices = [] + print("Calling get_peo_from_tree") + # ---- re-create peo from tree + peo_recreate = [] + components = list(nx.connected_components(tree)) + print("# of components: ", len(components)) + for subtree in components: + peo_recreate += get_peo_from_tree(tree.subgraph(subtree).copy(), clique_vertices=clique_vertices) + # ---- + nodes, path_recreate = qtensor.utils.get_neighbors_path(p_graph, peo_recreate) + log.info(f"Re-created peo width from tree: {max(path_recreate)}") + if max(path_recreate) < max(path): + log.info("Re-created peo is better than old peo. 
Using new peo.") + peo_ints = peo_recreate + path = path_recreate + # -- + + else: + raise ValueError('Unknown peo_after_slice_strategy: {}' + .format(self.peo_after_slice_strategy)) + + self.peo_ints = peo_ints + self.treewidth = max(path) + log.info('Treewidth after slice: {}', self.treewidth) + return peo_ints, path + def _split_graph(self, p_graph, max_tw): - peo_ints, path = self.base_ordering._get_ordering_ints(p_graph) searcher = GreedyParvars(p_graph) while True: #nodes, path = utils.get_neighbors_path(graph, peo=peo_ints) tw = self.treewidth - log.info('Treewidth: {}', tw) if tw < max_tw: - log.info('Found parvars: {}', searcher.result) + log.info(f'Found {len(searcher.result)} parvars: {searcher.result}') break if self.max_slice is not None: if len(searcher.result) > self.max_slice: @@ -207,22 +247,21 @@ def _split_graph(self, p_graph, max_tw): log.error('Memory is not enough. Max tw: {}', max_tw) raise Exception('Estimated OOM') - peo_ints, path = self.base_ordering._get_ordering_ints(p_graph) - self.treewidth = max(path) + self._update_peo_after_slice(p_graph, searcher.result) - return peo_ints, searcher.result + return self.peo_ints, searcher.result def optimize(self, tensor_net): peo, tn = super().optimize(tensor_net) return peo+self.parallel_vars, self.parallel_vars, tn - def _get_ordering_ints(self, graph, inplace=True): + def get_ordering_ints(self, graph, inplace=True): p_graph = copy.deepcopy(graph) max_tw = self._get_max_tw() max_tw = max_tw - self.tw_bias log.info('Maximum treewidth: {}', max_tw) - self.peo_ints, path = self.base_ordering._get_ordering_ints(p_graph) + self.peo_ints, path = self.base_ordering.get_ordering_ints(p_graph) self.treewidth = max(path) peo, par_vars = self._split_graph(p_graph, max_tw) @@ -235,24 +274,26 @@ def _get_ordering_ints(self, graph, inplace=True): #log.info('peo {}', self.peo) return peo, [self.treewidth] -class TamakiOptimizer(GreedyOptimizer): +class TamakiOptimizer(Optimizer): def __init__(self, max_width=None, *args, wait_time=5, **kwargs): super().__init__(*args, **kwargs) self.wait_time = wait_time self.max_width = max_width - def _get_ordering(self, graph, inplace=True): - node_names = nx.get_node_attributes(graph, 'name') - node_sizes = nx.get_node_attributes(graph, 'size') + def get_ordering_ints(self, graph, inplace=True): peo, tw = qtree.graph_model.peo_calculation.get_upper_bound_peo_pace2017_interactive( graph, method="tamaki", max_time=self.wait_time, max_width=self.max_width) + return peo, [tw] - + def _get_ordering(self, graph, inplace=True): + node_names = nx.get_node_attributes(graph, 'name') + node_sizes = nx.get_node_attributes(graph, 'size') + peo, path = self.get_ordering_ints(graph, inplace=inplace) peo = [qtree.optimizer.Var(var, size=node_sizes[var], name=node_names[var]) for var in peo] - self.treewidth = tw - return peo, [tw] + self.treewidth = max(path) + return peo, path class TamakiExactOptimizer(GreedyOptimizer): def __init__(self, *args, **kwargs): @@ -277,7 +318,7 @@ def _split_graph(self, p_graph, max_tw): peo_ints = self.peo_ints tw = self.treewidth self._slice_hist = [] - self._slice_hist.append([0, tw]) + self._slice_hist.append([0, tw, peo_ints]) log.info('Treewidth: {}', tw) log.info('Target treewidth: {}', max_tw) result = [] @@ -311,14 +352,10 @@ def _split_graph(self, p_graph, max_tw): pv_cnt = len(result) log.info('Parvars count: {}. 
Amps count: {}', pv_cnt, 2**pv_cnt) - peo_ints, path = self.base_ordering._get_ordering_ints(p_graph) + peo_ints, path = self._update_peo_after_slice(p_graph, result) tw = max(path) - log.info('Treewidth: {}', tw) - self._slice_hist.append([pv_cnt, tw]) - + self._slice_hist.append([pv_cnt, tw, peo_ints]) delta = tw - max_tw - self.treewidth = tw - return peo_ints, result diff --git a/qtensor/optimisation/RGreedy.py b/qtensor/optimisation/RGreedy.py index 86a4dd02..ef46973d 100644 --- a/qtensor/optimisation/RGreedy.py +++ b/qtensor/optimisation/RGreedy.py @@ -34,7 +34,7 @@ def _get_ordering(self, graph, **kwargs): #graph = nx.convert_node_labels_to_integers(graph) node_names = nx.get_node_attributes(graph, 'name') node_sizes = nx.get_node_attributes(graph, 'size') - peo, path = self._get_ordering_ints(graph) + peo, path = self.get_ordering_ints(graph) peo = [qtree.optimizer.Var(var, size=node_sizes[var], name=node_names[var]) @@ -42,7 +42,7 @@ def _get_ordering(self, graph, **kwargs): #print('tw=', max(path)) return peo, path - def _get_ordering_ints(self, old_graph, free_vars=[]): + def get_ordering_ints(self, old_graph, free_vars=[]): best_peo = None best_width = np.inf best_widths = None @@ -94,7 +94,7 @@ def _get_ordering_ints(self, old_graph, free_vars=[]): class RGreedyOptimizerNk(RGreedyOptimizer): - def _get_ordering_ints(self, old_graph, free_vars=[]): + def get_ordering_ints(self, old_graph, free_vars=[]): best_peo = None best_width = np.inf best_widths = None diff --git a/qtensor/optimisation/__init__.py b/qtensor/optimisation/__init__.py index f36361c1..b9323eb5 100644 --- a/qtensor/optimisation/__init__.py +++ b/qtensor/optimisation/__init__.py @@ -2,7 +2,7 @@ from qtensor.optimisation.Optimizer import TamakiTrimSlicing, TamakiOptimizer from qtensor.optimisation.Optimizer import GreedyOptimizer, WithoutOptimizer -from qtensor.optimisation.Optimizer import Optimizer, SlicesOptimizer +from qtensor.optimisation.Optimizer import Optimizer, SlicesOptimizer, TreeTrimSplitter from qtensor.optimisation.Greedy import GreedyParvars from qtensor.optimisation.late_parallelisation import LateParOptimizer diff --git a/qtensor/optimisation/adaptive.py b/qtensor/optimisation/adaptive.py index 3381017f..10864347 100644 --- a/qtensor/optimisation/adaptive.py +++ b/qtensor/optimisation/adaptive.py @@ -58,56 +58,55 @@ def __init__(self, max_time=np.inf, opt_sim_ratio=1.5): self.max_time = max_time self.opt_sim_ratio = opt_sim_ratio - def log_progress(self, rt, opt, etime): - width = opt.treewidth + def log_progress(self, rt, opt, etime, width): opt_name = opt.__class__.__name__ if hasattr(self, 'verbose'): print(f"Qtensor adaptive optimizer: Time={rt:.4f}, width={width}, optimizer={opt_name}, expected contraction time={etime}") - def optimize(self, tensor_net): + def get_ordering_ints(self, graph, inplace=False): start = time.time() naive = WithoutOptimizer() # first, optimize with naive ordering and check treewidth - res = naive.optimize(tensor_net) + peo, path = naive.get_ordering_ints(graph) + width = max(path) - e1 = expected_contraction_time(naive.treewidth) - self.log_progress(time.time()-start, naive, e1) + e1 = expected_contraction_time(width) + self.log_progress(time.time()-start, naive, e1, width) if not should_optimize_more(e1, time.time()-start, self.opt_sim_ratio): - self.treewidth = naive.treewidth - return res + return peo, path # Next, greedy opt = GreedyOptimizer() - res = opt.optimize(tensor_net) + peo, path = opt.get_ordering_ints(graph) + width = max(path) - e1 = 
expected_contraction_time(opt.treewidth) - self.log_progress(time.time()-start, opt, e1) + e1 = expected_contraction_time(width) + self.log_progress(time.time()-start, opt, e1, width) if not should_optimize_more(e1, time.time()-start, self.opt_sim_ratio): - self.treewidth = opt.treewidth - return res + return peo, path # Next, rgreedy - rgreedy_time = expected_contraction_time(opt.treewidth-1) + rgreedy_time = expected_contraction_time(width-1) while rgreedy_time<5: opt = RGreedyOptimizer(temp=.02, max_time=rgreedy_time) - res = opt.optimize(tensor_net) + peo, path = opt.get_ordering_ints(graph) + width = max(path) - e1 = expected_contraction_time(opt.treewidth) - self.log_progress(time.time()-start, opt, e1) + e1 = expected_contraction_time(width) + self.log_progress(time.time()-start, opt, e1, width) if not should_optimize_more(e1, time.time()-start, self.opt_sim_ratio): - self.treewidth = opt.treewidth - return res + return peo, path - rgreedy_time = expected_contraction_time(opt.treewidth-1) + rgreedy_time = expected_contraction_time(width-1) # Next, Tamaki max_simulatable = 32 - width = min(max_simulatable, opt.treewidth-1) + target_width = min(max_simulatable, width-1) while True: # terminate if reached max time - 1. No sense in running tamaki for 1 second # at this scale. @@ -115,7 +114,7 @@ def optimize(self, tensor_net): if spent_so_far > self.max_time: print("Adaptive ordering algo exceeded budget of", f"{self.max_time} seconds. Returning prematurely") - return res + return peo, path wait_time = min( expected_contraction_time(width), # reserve a second for tamaki overhead @@ -124,29 +123,31 @@ def optimize(self, tensor_net): # Tamaki may fail to process very large graphs if the budget is too small wait_time += 1 - opt = TamakiOptimizer(max_width=width, wait_time=wait_time) + opt = TamakiOptimizer(max_width=target_width, wait_time=wait_time) # Detect termination reason. # If terminated because reached max_width, then reduce the width # Othervise need more time start_opt = time.time() - t_out = opt.optimize(tensor_net) + t_peo, t_path = opt.get_ordering_ints(graph) + t_width = max(t_path) opt_duration = time.time() - start_opt # Record result if it's better than what we already have # (Sometimes it can decrease if we are close to time budget) - if opt.treewidth <= width: - res = t_out + if t_width <= target_width: + peo = t_peo + path = t_path + width = t_width - self.treewidth = opt.treewidth - e1 = expected_contraction_time(opt.treewidth) - self.log_progress(time.time()-start, opt, e1) + e1 = expected_contraction_time(width) + self.log_progress(time.time()-start, opt, e1, width) if not should_optimize_more(e1, time.time() - start, self.opt_sim_ratio): - return res + return peo, path # Do not reduce target treewidth if failed to converge to the previous one. 
if opt_duration < wait_time - 1: - width = opt.treewidth - 1 + target_width = width - 1 - return res + return peo, path diff --git a/qtensor/optimisation/late_parallelisation.py b/qtensor/optimisation/late_parallelisation.py index 7ce2a3ce..121b99d0 100644 --- a/qtensor/optimisation/late_parallelisation.py +++ b/qtensor/optimisation/late_parallelisation.py @@ -17,7 +17,7 @@ def slice_greedy(graph, p_bunch, ordering_algo='greedy'): """ Slice greedy and inplece """ orderer = qtn.toolbox.get_ordering_algo(ordering_algo) searcher = GreedyParvars(graph) - peo_ints, path = orderer._get_ordering_ints(graph) + peo_ints, path = orderer.get_ordering_ints(graph) for _ in range(p_bunch): error = searcher.step() pv_cnt = len(searcher.result) @@ -104,7 +104,7 @@ def find_slice_at_step(self, ordering, graph, p_bunch): # Room for optimization: do not copy graph sliced_graph = graph.copy() slice_vars = self.slicer(sliced_graph, p_bunch=p_bunch) - _peo, _path = self.orderer._get_ordering_ints(sliced_graph) + _peo, _path = self.orderer.get_ordering_ints(sliced_graph) step_tw = qtn.utils.n_neighbors(graph, node) + 1 largest_tw = max(step_tw, largest_tw) _tw = max(largest_tw, max(_path)) @@ -145,7 +145,7 @@ def optimize(self, tensor_net): else: current_graph = line_graph - current_ordering, tw_path = self.orderer._get_ordering_ints(current_graph) + current_ordering, tw_path = self.orderer.get_ordering_ints(current_graph) contraction_schedule = [] log.info(f"Initial treewidth: {max(tw_path)}") diff --git a/qtensor/tests/__init__.py b/qtensor/tests/__init__.py index c5d2abba..30d4274c 100644 --- a/qtensor/tests/__init__.py +++ b/qtensor/tests/__init__.py @@ -1,7 +1,14 @@ import networkx as nx +import qtensor import numpy as np from functools import lru_cache +def get_test_qaoa_ansatz_circ(n=10, p=2, d=3, type='random'): + G, gamma, beta = get_test_problem(n, p, d, type) + composer = qtensor.DefaultQAOAComposer( + graph=G, gamma=gamma, beta=beta) + composer.ansatz_state() + return composer.circuit @lru_cache(maxsize=2**12) def get_test_problem(n=10, p=2, d=3, type='random'): diff --git a/qtensor/tests/test_bucket_backends.py b/qtensor/tests/test_bucket_backends.py index f5facce9..d26ba8a5 100644 --- a/qtensor/tests/test_bucket_backends.py +++ b/qtensor/tests/test_bucket_backends.py @@ -1,10 +1,16 @@ from qtensor import QtreeQAOAComposer from qtensor.contraction_backends import PerfNumpyBackend - +from qtensor.contraction_backends import CuPyBackend, NumpyBackend, CompressionBackend +from qtensor.compression import NumpyCompressor, CUSZCompressor from qtensor.Simulate import CirqSimulator, QtreeSimulator + +import pytest +import qtensor import numpy as np import networkx as nx -from qtensor.tests import get_test_problem +from qtensor.tests import get_test_problem, get_test_qaoa_ansatz_circ + +from qtensor.contraction_algos import is_reverse_order_backend def test_profiled(capsys): @@ -25,3 +31,37 @@ def test_profiled(capsys): qtree_amp = result assert qtree_amp + +def test_reverse_order_switch(): + backend = qtensor.contraction_backends.get_backend('torch') + reverse = is_reverse_order_backend(backend) + assert reverse + + backend = qtensor.contraction_backends.get_backend('einsum') + reverse = is_reverse_order_backend(backend) + assert not reverse + +ref_backend_name = 'cupy' +@pytest.mark.parametrize('circ', [ + get_test_qaoa_ansatz_circ(n=6, p=3), + get_test_qaoa_ansatz_circ(n=12, p=4), +]) +@pytest.mark.parametrize(['backend', 'atol'], [ + ('cupy', 1e-10), + ('torch', 1e-10), + ('cupy_compressed', 1e-10), + 
(CompressionBackend( + CuPyBackend(), + CUSZCompressor(r2r_error=1e-4, r2r_threshold=1e-5), + 11 ), + 1e-5) +]) +def test_backends(circ, backend, atol): + ref_backend = qtensor.contraction_backends.get_backend(ref_backend_name) + if isinstance(backend, str): + backend = qtensor.contraction_backends.get_backend(backend) + sim = QtreeSimulator(backend=backend) + res = sim.simulate(circ) + sim_ref = QtreeSimulator(backend=ref_backend) + res_ref = sim_ref.simulate(circ) + assert np.allclose(res, res_ref, atol=atol) diff --git a/qtensor/tools/benchmarking/simulators.py b/qtensor/tools/benchmarking/simulators.py index 659bc775..fc06126f 100644 --- a/qtensor/tools/benchmarking/simulators.py +++ b/qtensor/tools/benchmarking/simulators.py @@ -299,7 +299,7 @@ def simulate_qaoa_energy(self, G, p, opt): with profiles.mem_util() as m: # should not consume all iterator at once for edge, (ibunch, merged_buckets) in zip(self.iterate_edges(G, p), opt): - edge_contribution = qtensor.merged_indices.bucket_elimination( + edge_contribution = qtensor.contraction_algos.merged_bucket_elimination( [x.copy() for x in merged_buckets], ibunch, sim.backend.process_bucket_merged, diff --git a/qtree b/qtree index bfe253df..7b038d5a 160000 --- a/qtree +++ b/qtree @@ -1 +1 @@ -Subproject commit bfe253df1cbaae6af0a5fd5198f237f3654819d6 +Subproject commit 7b038d5a4cc1f9b5e0ede4b0e5740bff4b22153e diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2279da1b..00000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -mongocat -PyInquirer diff --git a/setup.py b/setup.py index be3da9b8..641d2156 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ ,'click' ,'qtensor-qtree' ,'lazy-import' - ,'pynauty-nice' + ,'pynauty' ,'sarge' ,'cartesian-explorer' diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 00000000..720fe010 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1,5 @@ +mongocat +pytest-xdist +cirq +qiskit +tabulate
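Usage note (not part of the patch): the central interface change across the ordering code is that every `Optimizer` now exposes the integer-ordering entry point publicly as `get_ordering_ints(graph)` (formerly `_get_ordering_ints`), returning a `(peo, path)` pair whose treewidth is `max(path)`; the adaptive orderer, `SlicesOptimizer`, and `late_parallelisation` all call it through this public name. Below is a minimal sketch of calling it directly, assuming this patch is installed; the random graph and its parameters are illustrative stand-ins for a tensor network's line graph.

```python
# Minimal sketch: exercising the now-public Optimizer.get_ordering_ints interface.
import qtensor
from qtensor.optimisation import GreedyOptimizer

# Illustrative stand-in for a tensor network's line graph.
G = qtensor.toolbox.random_graph(seed=10, degree=3, nodes=24, type='random')

opt = GreedyOptimizer()               # or qtensor.toolbox.get_ordering_algo('greedy')
peo, path = opt.get_ordering_ints(G)  # formerly the private _get_ordering_ints
print('peo length:', len(peo), 'treewidth:', max(path))
```

The same call shape applies to `TamakiOptimizer`, `RGreedyOptimizer`, and `SlicesOptimizer` (the latter additionally taking the new `peo_after_slice_strategy='run-again'` or `'TD-reuse'` option introduced in this patch).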