Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1359aec
Add performance test of partial shard reads
aldenks May 1, 2025
c726994
WIP Consolidate reads of multiple chunks in the same shard
aldenks Apr 19, 2025
44d9ce4
Add changes/3004.feature.rst
aldenks Jun 2, 2025
009ce6a
Consistently return None on failure and test partial shard read failu…
aldenks Jun 3, 2025
c65cf82
Fix and test for case where some chunks in shard are all fill
aldenks Jul 21, 2025
c7ddb0e
Merge branch 'main' into coalesce-shard-reads
aldenks Jul 22, 2025
501e7a5
Self review
aldenks Jul 22, 2025
12c3308
Removing profiling code masquerading as a skipped test
aldenks Jul 22, 2025
6322ca6
revert change to indexing.py, not needed
aldenks Jul 22, 2025
d9a7842
Add test for duplicate integer indexing into a coalesced group
aldenks Jul 22, 2025
8469e9c
Undo change to fill value when initializing shard arrays
aldenks Jul 22, 2025
baf1062
Undo change to set mypy_path = "src"
aldenks Jul 22, 2025
50d8822
Commenting and revert unnecessary changes to files for smaller diff
aldenks Jul 22, 2025
78313aa
Merge branch 'main' into coalesce-shard-reads
d-v-b Aug 6, 2025
6a04238
Merge remote-tracking branch 'upstream/main' into coalesce-shard-reads
aldenks Dec 5, 2025
904240b
remove now redundant cast
aldenks Dec 5, 2025
5283f1a
Document runtime config keys
aldenks Dec 5, 2025
f7d5de3
Improve changelog entry and .rst -> .md
aldenks Dec 5, 2025
f4b2bcc
.coords -> .chunk_coords in _ChunkCoordsByteSlice dataclass
aldenks Dec 5, 2025
ba77e87
Merge branch 'main' into coalesce-shard-reads
aldenks Jan 19, 2026
04c5cde
Update test env in docs/contributing.md
aldenks Jan 19, 2026
43e326f
Move `config.get` calls up into `_decode_partial_single`
aldenks Jan 19, 2026
b71302f
Ensure no change in behavior when ByteGetter.get returns None + comment
aldenks Jan 19, 2026
24f6f1c
Add test_sharing_unit.py, focusing on coalesce behavior, but with ba…
aldenks Jan 19, 2026
6437fb6
Fix typing errors in test_sharing_unit.py
aldenks Jan 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions changes/3004.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Optimizes reading multiple chunks from a shard. Reads of nearby chunks within
the same shard are coalesced to reduce the number of calls to the store.
After any coalescing, the resulting byte ranges are read in parallel.

Coalescing respects two config options. Reads are coalesced if there are fewer
than `sharding.read.coalesce_max_gap_bytes` bytes between chunks and the total
size of the coalesced read is no more than `sharding.read.coalesce_max_bytes`.
2 changes: 1 addition & 1 deletion docs/contributing.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ hatch env show # list all available environments
To verify that your development environment is working, you can run the unit tests for one of the test environments, e.g.:

```bash
hatch env run --env test.py3.12-2.2-optional run-pytest
hatch env run --env test.py3.13-optional run-pytest
```

### Creating a branch
Expand Down
1 change: 1 addition & 0 deletions docs/user-guide/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Configuration options include the following:
- Async and threading options, e.g. `async.concurrency` and `threading.max_workers`
- Selections of implementations of codecs, codec pipelines and buffers
- Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more.
- Control request merging when reading multiple chunks from the same shard with `sharding.read.coalesce_max_gap_bytes` and `sharding.read.coalesce_max_bytes`

For selecting custom implementations of codecs, pipelines, buffers and ndbuffers,
first register the implementations in the registry and then select them in the config.
Expand Down
178 changes: 150 additions & 28 deletions src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from enum import Enum
from functools import lru_cache
from operator import itemgetter
from typing import TYPE_CHECKING, Any, NamedTuple, cast
from typing import TYPE_CHECKING, Any, NamedTuple

import numpy as np
import numpy.typing as npt
Expand Down Expand Up @@ -37,11 +37,13 @@
from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid
from zarr.core.common import (
ShapeLike,
concurrent_map,
parse_enum,
parse_named_configuration,
parse_shapelike,
product,
)
from zarr.core.config import config
from zarr.core.dtype.npy.int import UInt64
from zarr.core.indexing import (
BasicIndexer,
Expand Down Expand Up @@ -114,9 +116,7 @@ class _ShardIndex(NamedTuple):

@property
def chunks_per_shard(self) -> tuple[int, ...]:
result = tuple(self.offsets_and_lengths.shape[0:-1])
# The cast is required until https://github.com/numpy/numpy/pull/27211 is merged
return cast("tuple[int, ...]", result)
return tuple(self.offsets_and_lengths.shape[0:-1])

def _localize_chunk(self, chunk_coords: tuple[int, ...]) -> tuple[int, ...]:
return tuple(
Expand Down Expand Up @@ -220,9 +220,19 @@ def __iter__(self) -> Iterator[tuple[int, ...]]:
return c_order_iter(self.index.offsets_and_lengths.shape[:-1])


@dataclass(frozen=True)
class _ChunkCoordsByteSlice:
    """Holds a core.indexing.ChunkProjection.chunk_coords and its byte range in a serialized shard."""

    # N-dimensional coordinates identifying the chunk within its shard.
    chunk_coords: tuple[int, ...]
    # Start/stop byte offsets of the chunk's encoded data within the serialized shard.
    byte_slice: slice


@dataclass(frozen=True)
class ShardingCodec(
ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin
ArrayBytesCodec,
ArrayBytesCodecPartialDecodeMixin,
ArrayBytesCodecPartialEncodeMixin,
):
"""Sharding codec"""

Expand Down Expand Up @@ -400,32 +410,31 @@ async def _decode_partial_single(
all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks}

# reading bytes of all requested chunks
shard_dict: ShardMapping = {}
shard_dict_maybe: ShardMapping | None = {}
if self._is_total_shard(all_chunk_coords, chunks_per_shard):
# read entire shard
shard_dict_maybe = await self._load_full_shard_maybe(
byte_getter=byte_getter,
prototype=chunk_spec.prototype,
chunks_per_shard=chunks_per_shard,
byte_getter, chunk_spec.prototype, chunks_per_shard
)
if shard_dict_maybe is None:
return None
shard_dict = shard_dict_maybe
else:
# read some chunks within the shard
shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard)
if shard_index is None:
return None
shard_dict = {}
for chunk_coords in all_chunk_coords:
chunk_byte_slice = shard_index.get_chunk_slice(chunk_coords)
if chunk_byte_slice:
chunk_bytes = await byte_getter.get(
prototype=chunk_spec.prototype,
byte_range=RangeByteRequest(chunk_byte_slice[0], chunk_byte_slice[1]),
)
if chunk_bytes:
shard_dict[chunk_coords] = chunk_bytes
max_gap_bytes = config.get("sharding.read.coalesce_max_gap_bytes")
coalesce_max_bytes = config.get("sharding.read.coalesce_max_bytes")
async_concurrency = config.get("async.concurrency")

shard_dict_maybe = await self._load_partial_shard_maybe(
byte_getter,
chunk_spec.prototype,
chunks_per_shard,
all_chunk_coords,
max_gap_bytes,
coalesce_max_bytes,
async_concurrency,
)

if shard_dict_maybe is None:
return None
shard_dict = shard_dict_maybe

# decoding chunks and writing them into the output buffer
await self.codec_pipeline.read(
Expand Down Expand Up @@ -509,7 +518,9 @@ async def _encode_partial_single(

indexer = list(
get_indexer(
selection, shape=shard_shape, chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape)
selection,
shape=shard_shape,
chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape),
)
)

Expand Down Expand Up @@ -624,7 +635,8 @@ def _shard_index_size(self, chunks_per_shard: tuple[int, ...]) -> int:
get_pipeline_class()
.from_codecs(self.index_codecs)
.compute_encoded_size(
16 * product(chunks_per_shard), self._get_index_chunk_spec(chunks_per_shard)
16 * product(chunks_per_shard),
self._get_index_chunk_spec(chunks_per_shard),
)
)

Expand Down Expand Up @@ -669,7 +681,8 @@ async def _load_shard_index_maybe(
)
else:
index_bytes = await byte_getter.get(
prototype=numpy_buffer_prototype(), byte_range=SuffixByteRequest(shard_index_size)
prototype=numpy_buffer_prototype(),
byte_range=SuffixByteRequest(shard_index_size),
)
if index_bytes is not None:
return await self._decode_shard_index(index_bytes, chunks_per_shard)
Expand All @@ -693,6 +706,115 @@ async def _load_full_shard_maybe(
else None
)

async def _load_partial_shard_maybe(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would happen if we got rid of _load_full_shard_maybe and instead treated loading the full shard as a special case of loading some of the shard?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed it seems like it should be. I started going down this road but

  1. we do want to avoid making an extra request for the shard index if it's a full shard
  2. without having the shard index, I don't see a way to go down the partial shard path that doesn't make that code messy

self,
byte_getter: ByteGetter,
prototype: BufferPrototype,
chunks_per_shard: tuple[int, ...],
all_chunk_coords: set[tuple[int, ...]],
max_gap_bytes: int,
coalesce_max_bytes: int,
async_concurrency: int,
) -> ShardMapping | None:
"""
Read chunks from `byte_getter` for the case where the read is less than a full shard.
Returns a mapping of chunk coordinates to bytes or None.

Reads are coalesced if there are fewer than `max_gap_bytes` bytes between chunks
and the total size of the coalesced read is no more than `coalesce_max_bytes`.
"""
shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this will fetch the shard index every time. Should we push reading the shard index higher up in the stack, and have this function take the content of the index as a parameter? This might be out of scope for this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(we could also build caching into the _load_shard_index function)

Copy link
Contributor Author

@aldenks aldenks Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed we should do this, but it's not entirely trivial because at this point we only have a ByteGetter that doesn't expose the path to the shard that bytes are being retrieved from which we need as part of the cache key. I'd propose leaving for a follow up (especially since it took me so long to get back this)

if shard_index is None:
return None # shard index read failure, the ByteGetter returned None

chunks = [
_ChunkCoordsByteSlice(chunk_coords, slice(*chunk_byte_slice))
for chunk_coords in all_chunk_coords
# Drop chunks where index lookup fails
# e.g. empty chunks when write_empty_chunks = False
if (chunk_byte_slice := shard_index.get_chunk_slice(chunk_coords))
]

groups = self._coalesce_chunks(chunks, max_gap_bytes, coalesce_max_bytes)

shard_dicts = await concurrent_map(
[(group, byte_getter, prototype) for group in groups],
self._get_group_bytes,
async_concurrency,
)

shard_dict: ShardMutableMapping = {}
for d in shard_dicts:
# can be None if the ByteGetter returned None when reading chunk data
if d is not None:
shard_dict.update(d)

return shard_dict

def _coalesce_chunks(
self,
chunks: list[_ChunkCoordsByteSlice],
max_gap_bytes: int,
coalesce_max_bytes: int,
) -> list[list[_ChunkCoordsByteSlice]]:
"""
Combine chunks from a single shard into groups that should be read together
in a single request to the store.
"""
sorted_chunks = sorted(chunks, key=lambda c: c.byte_slice.start)

if len(sorted_chunks) == 0:
return []

groups = []
current_group = [sorted_chunks[0]]

for chunk in sorted_chunks[1:]:
gap_to_chunk = chunk.byte_slice.start - current_group[-1].byte_slice.stop
size_if_coalesced = chunk.byte_slice.stop - current_group[0].byte_slice.start
if gap_to_chunk < max_gap_bytes and size_if_coalesced < coalesce_max_bytes:
current_group.append(chunk)
else:
groups.append(current_group)
current_group = [chunk]

groups.append(current_group)

return groups

async def _get_group_bytes(
    self,
    group: list[_ChunkCoordsByteSlice],
    byte_getter: ByteGetter,
    prototype: BufferPrototype,
) -> ShardMapping | None:
    """
    Fetch a possibly coalesced group of one or more chunks of a shard with a
    single store request.

    Returns a mapping from each chunk's coordinates to its bytes, or None when
    ``byte_getter`` signals a failed read by returning None.
    """
    # _coalesce_chunks never emits an empty group, so indexing is safe.
    start = group[0].byte_slice.start
    stop = group[-1].byte_slice.stop

    # One request covering the whole group, including any gaps between chunks.
    fetched = await byte_getter.get(
        prototype=prototype,
        byte_range=RangeByteRequest(start, stop),
    )
    if fetched is None:
        return None

    # Slice each chunk back out of the group's buffer, rebasing its byte
    # offsets onto the start of the fetched range.
    return {
        chunk.chunk_coords: fetched[
            slice(chunk.byte_slice.start - start, chunk.byte_slice.stop - start)
        ]
        for chunk in group
    }

def compute_encoded_size(self, input_byte_length: int, shard_spec: ArraySpec) -> int:
chunks_per_shard = self._get_chunks_per_shard(shard_spec)
return input_byte_length + self._shard_index_size(chunks_per_shard)
6 changes: 6 additions & 0 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ def enable_gpu(self) -> ConfigSet:
},
"async": {"concurrency": 10, "timeout": None},
"threading": {"max_workers": None},
"sharding": {
"read": {
"coalesce_max_bytes": 100 * 2**20, # 100MiB
"coalesce_max_gap_bytes": 2**20, # 1MiB
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think 1MiB default for coalesce_max_gap_bytes is roughly a good choice right now (if you're using object storage via fsspec implementations for example, a very common case). But with the combination of I/O latencies (networks, SSDs, object stores) going down and high-performance stores (obstore, icechunk, etc.) becoming more popular, I'm thinking we may want a lower default to set us up for the "lots of small requests barely cost more than a few big ones" world. As the per-request latency goes down, the time to download the extra bytes in a gap between required chunks increases as a fraction of the total time. 256KiB sounds to me like a very conservative default (although I can also see arguments for going all the way to zero). Anyone have Strong Opinions?

}
},
"json_indent": 2,
"codec_pipeline": {
"path": "zarr.core.codec_pipeline.BatchedCodecPipeline",
Expand Down
Loading