"""Lossy compress velocity models for archival outputs.

Quantises the vp, vs and rho properties of an HDF5 velocity model to
uint8 (CF-style ``scale_factor``/``add_offset`` packing) and writes the
result as a chunked, zlib-compressed NetCDF file.
"""

from pathlib import Path
from typing import Annotated

import h5py
import numpy as np
import typer
import xarray as xr

from qcore import cli

app = typer.Typer()


def get_extrema(h5_dataset: h5py.Dataset) -> tuple[float, float]:
    """Get extreme values of an HDF5 dataset.

    Parameters
    ----------
    h5_dataset : h5py.Dataset
        HDF5 dataset to find extrema for.

    Returns
    -------
    tuple[float, float]
        (min, max) values for the dataset. NaNs are ignored.
    """
    min_v, max_v = np.inf, -np.inf
    # Iterate over the dataset's storage chunks so that only one chunk
    # is resident in memory at a time.
    for chunk in h5_dataset.iter_chunks():
        chunk_data = h5_dataset[chunk]
        min_v = min(min_v, np.nanmin(chunk_data))
        max_v = max(max_v, np.nanmax(chunk_data))
    return min_v, max_v


def compress_quality(file: h5py.File, quality: str) -> xr.DataArray:
    """Compress a velocity model quality using a quantisation method.

    Parameters
    ----------
    file : h5py.File
        File to read the quality from.
    quality : str
        Quality to read, e.g. ``rho``.

    Returns
    -------
    xr.DataArray
        A quantised data array with uint8 values. The ``scale_factor``
        and ``add_offset`` attributes record the quantisation scale and
        the minimum value respectively; ``_FillValue`` (255) marks
        missing (NaN) cells.
    """
    quality_array = file["properties"][quality]
    shape = (nz, ny, nx) = quality_array.shape
    quantised_array = np.zeros(shape, dtype=np.uint8)
    int_max = np.iinfo(np.uint8).max
    # Named min_value/max_value so the builtins min/max are not shadowed.
    min_value, max_value = get_extrema(quality_array)
    # Scale valid data onto 0..254 so the top code (255) is reserved
    # exclusively for _FillValue; otherwise cells exactly equal to
    # max_value would quantise to 255 and be masked as missing on decode.
    scale = (max_value - min_value) / (int_max - 1)
    if scale == 0.0:
        # Constant field: every value quantises to zero; any positive
        # scale avoids a division by zero below.
        scale = 1.0
    for chunk in quality_array.iter_chunks():
        # Work on one storage chunk at a time to bound memory use.
        values = quality_array[chunk].astype(np.float32)
        # quantised = round((values - min_value) / scale), computed with
        # `out` parameters to avoid extra unneeded copies.
        np.subtract(values, min_value, out=values)
        np.divide(values, scale, out=values)
        np.round(values, out=values)
        # Map NaNs to the fill value: casting NaN to uint8 is undefined.
        values[np.isnan(values)] = int_max
        quantised_array[chunk] = values.astype(np.uint8)

    attrs = dict(quality_array.attrs)
    attrs["scale_factor"] = scale
    attrs["add_offset"] = min_value
    attrs["_FillValue"] = int_max
    return xr.DataArray(
        quantised_array,
        dims=("z", "y", "x"),
        coords=dict(z=np.arange(nz), y=np.arange(ny), x=np.arange(nx)),
        attrs=attrs,
    )


def read_inbasin(file: h5py.File) -> xr.DataArray:
    """Read the inbasin array from a velocity model.

    Parameters
    ----------
    file : h5py.File
        Velocity model to read from.

    Returns
    -------
    xr.DataArray
        Data array for the inbasin quality.
    """
    inbasin = np.array(file["properties"]["inbasin"])
    (nz, ny, nx) = inbasin.shape
    attrs = dict(file["properties"]["inbasin"].attrs)
    return xr.DataArray(
        inbasin,
        dims=("z", "y", "x"),
        coords=dict(z=np.arange(nz), y=np.arange(ny), x=np.arange(nx)),
        attrs=attrs,
    )


def compressed_vm_as_dataset(file: h5py.File) -> xr.Dataset:
    """Convert an HDF5 velocity model into a compressed and quantised xarray dataset.

    Parameters
    ----------
    file : h5py.File
        Velocity model to quantise.

    Returns
    -------
    xr.Dataset
        Compressed and quantised dataset representing the read velocity
        model, with lat/lon/depth coordinates attached.
    """
    compressed_vp = compress_quality(file, "vp")
    compressed_vs = compress_quality(file, "vs")
    compressed_rho = compress_quality(file, "rho")
    inbasin = read_inbasin(file)
    lat = np.array(file["mesh"]["lat"])
    lon = np.array(file["mesh"]["lon"])

    # Physical depth coordinate from the model's vertical resolution.
    z_resolution = float(file["config"].attrs["h_depth"])
    nz = compressed_vp.shape[0]
    z = np.arange(nz) * z_resolution

    ds = xr.Dataset(
        {
            "vp": compressed_vp,
            "vs": compressed_vs,
            "rho": compressed_rho,
            "inbasin": inbasin,
        },
    )
    ds.attrs.update(file["config"].attrs)
    # NOTE(review): mesh lat/lon are assumed to be stored with (x, y)
    # ordering, as in the original implementation — confirm against the
    # velocity model writer.
    ds = ds.assign_coords(
        dict(lon=(("x", "y"), lon), lat=(("x", "y"), lat), depth=(("z"), z)),
    )
    return ds


@cli.from_docstring(app)
def compress_vm(
    vm_path: Path,
    output: Path,
    complevel: Annotated[int, typer.Option(min=1, max=9)] = 4,
    chunk_x: Annotated[int | None, typer.Option(min=1)] = 256,
    chunk_y: Annotated[int | None, typer.Option(min=1)] = 256,
    chunk_z: Annotated[int | None, typer.Option(min=1)] = 64,
    shuffle: bool = True,
) -> None:
    """Compress a velocity model for archival storage.

    Parameters
    ----------
    vm_path : Path
        Path to velocity model to compress.
    output : Path
        Path to store compressed velocity model.
    complevel : int
        Compression level for zlib compression (zlib accepts 1-9).
    chunk_x : int | None, optional
        Chunk size in the x direction. Set to ``None`` to use the dataset size.
    chunk_y : int | None, optional
        Chunk size in the y direction. Set to ``None`` to use the dataset size.
    chunk_z : int | None, optional
        Chunk size in the z direction. Set to ``None`` to use the dataset size.
    shuffle : bool
        If set, enable byte-level shuffling for dataset compression.
    """
    with h5py.File(vm_path) as vm:
        dset = compressed_vm_as_dataset(vm)
        nz = dset.sizes["z"]
        ny = dset.sizes["y"]
        nx = dset.sizes["x"]
        common_options = dict(
            dtype="uint8",
            zlib=True,
            complevel=complevel,
            shuffle=shuffle,
            # Chunk sizes follow the (z, y, x) dimension order of the data
            # variables and are clamped so they never exceed the dataset.
            chunksizes=(
                min(chunk_z or nz, nz),
                min(chunk_y or ny, ny),
                min(chunk_x or nx, nx),
            ),
        )
        dset.to_netcdf(
            output,
            encoding={
                "vp": common_options,
                "vs": common_options,
                "rho": common_options,
                "inbasin": common_options,
            },
            engine="h5netcdf",
        )