Skip to content

Commit e116f63

Browse files
committed
handle run length encoded chunk grid spec in the top level api
1 parent 79ecee9 commit e116f63

File tree

3 files changed

+540
-15
lines changed

3 files changed

+540
-15
lines changed

src/zarr/core/chunk_grids.py

Lines changed: 110 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,67 @@ def _expand_run_length_encoding(spec: Sequence[ChunkEdgeLength]) -> tuple[int, .
109109
return tuple(result)
110110

111111

112+
def _compress_run_length_encoding(chunks: tuple[int, ...]) -> list[int | list[int]]:
113+
"""
114+
Compress a sequence of chunk sizes to RLE format where beneficial.
115+
116+
This function automatically detects runs of identical values and compresses them
117+
using the [value, count] format. Single values or short runs are kept as-is.
118+
119+
Parameters
120+
----------
121+
chunks : tuple[int, ...]
122+
Sequence of chunk sizes along one dimension
123+
124+
Returns
125+
-------
126+
list[int | list[int]]
127+
Compressed representation using RLE where beneficial
128+
129+
Examples
130+
--------
131+
>>> _compress_run_length_encoding((10, 10, 10, 10, 10, 10))
132+
[[10, 6]]
133+
>>> _compress_run_length_encoding((10, 20, 30))
134+
[10, 20, 30]
135+
>>> _compress_run_length_encoding((10, 10, 10, 20, 20, 30))
136+
[[10, 3], [20, 2], 30]
137+
>>> _compress_run_length_encoding((5, 5, 10, 10, 10, 10, 15))
138+
[[5, 2], [10, 4], 15]
139+
"""
140+
if not chunks:
141+
return []
142+
143+
result: list[int | list[int]] = []
144+
current_value = chunks[0]
145+
current_count = 1
146+
147+
for value in chunks[1:]:
148+
if value == current_value:
149+
current_count += 1
150+
else:
151+
# Decide whether to use RLE or explicit value
152+
# Use RLE if count >= 3 to save space (tradeoff: [v,c] vs v,v,v)
153+
if current_count >= 3:
154+
result.append([current_value, current_count])
155+
elif current_count == 2:
156+
# For count=2, RLE doesn't save space, but use it for consistency
157+
result.append([current_value, current_count])
158+
else:
159+
result.append(current_value)
160+
161+
current_value = value
162+
current_count = 1
163+
164+
# Handle the last run
165+
if current_count >= 3 or current_count == 2:
166+
result.append([current_value, current_count])
167+
else:
168+
result.append(current_value)
169+
170+
return result
171+
172+
112173
def _parse_chunk_shapes(
113174
data: Sequence[Sequence[ChunkEdgeLength]],
114175
) -> tuple[tuple[int, ...], ...]:
@@ -554,21 +615,33 @@ def _from_dict(cls, data: dict[str, JSON]) -> Self:
554615

555616
def to_dict(self) -> dict[str, JSON]:
556617
"""
557-
Convert to metadata dict format.
618+
Convert to metadata dict format with automatic RLE compression.
619+
620+
This method automatically compresses chunk shapes using run-length encoding
621+
where beneficial (runs of 2 or more identical values). This reduces metadata
622+
size for arrays with many uniform chunks.
558623
559624
Returns
560625
-------
561626
dict[str, JSON]
562627
Metadata dictionary with 'name' and 'configuration' keys
628+
629+
Examples
630+
--------
631+
>>> grid = RectilinearChunkGrid(chunk_shapes=[[10, 10, 10, 10, 10, 10], [5, 5, 5, 5, 5]])
632+
>>> grid.to_dict()['configuration']['chunk_shapes']
633+
[[[10, 6]], [[5, 5]]]
563634
"""
564-
# Convert to list for JSON serialization
565-
chunk_shapes_list = [list(axis_chunks) for axis_chunks in self.chunk_shapes]
635+
# Compress each dimension using RLE where beneficial
636+
chunk_shapes_compressed = [
637+
_compress_run_length_encoding(axis_chunks) for axis_chunks in self.chunk_shapes
638+
]
566639

567640
return {
568641
"name": "rectilinear",
569642
"configuration": {
570643
"kind": "inline",
571-
"chunk_shapes": chunk_shapes_list,
644+
"chunk_shapes": chunk_shapes_compressed,
572645
},
573646
}
574647

@@ -1116,15 +1189,21 @@ def _is_nested_sequence(chunks: Any) -> bool:
11161189

11171190

11181191
def _normalize_rectilinear_chunks(
1119-
chunks: Sequence[Sequence[int]], shape: tuple[int, ...]
1192+
chunks: Sequence[Sequence[int | Sequence[int]]], shape: tuple[int, ...]
11201193
) -> tuple[tuple[int, ...], ...]:
11211194
"""
11221195
Normalize and validate variable chunks for RectilinearChunkGrid.
11231196
1197+
Supports both explicit chunk sizes and run-length encoding (RLE).
1198+
RLE format: [[value, count]] expands to 'count' repetitions of 'value'.
1199+
11241200
Parameters
11251201
----------
1126-
chunks : Sequence[Sequence[int]]
1202+
chunks : Sequence[Sequence[int | Sequence[int]]]
11271203
Nested sequence where each element is a sequence of chunk sizes along that dimension.
1204+
Each chunk size can be:
1205+
- An integer: explicit chunk size
1206+
- A sequence [value, count]: RLE format (expands to 'count' chunks of size 'value')
11281207
shape : tuple[int, ...]
11291208
The shape of the array.
11301209
@@ -1137,13 +1216,23 @@ def _normalize_rectilinear_chunks(
11371216
------
11381217
ValueError
11391218
If chunks don't match shape or sum incorrectly.
1219+
TypeError
1220+
If chunk specification format is invalid.
1221+
1222+
Examples
1223+
--------
1224+
>>> _normalize_rectilinear_chunks([[10, 20, 30], [25, 25]], (60, 50))
1225+
((10, 20, 30), (25, 25))
1226+
>>> _normalize_rectilinear_chunks([[[10, 6]], [[10, 5]]], (60, 50))
1227+
((10, 10, 10, 10, 10, 10), (10, 10, 10, 10, 10))
11401228
"""
1141-
# Convert to tuple of tuples
1229+
# Expand RLE for each dimension
11421230
try:
1143-
chunk_shapes = tuple(tuple(int(c) for c in dim) for dim in chunks)
1231+
chunk_shapes = tuple(_expand_run_length_encoding(dim) for dim in chunks)
11441232
except (TypeError, ValueError) as e:
11451233
raise TypeError(
1146-
f"Invalid variable chunks: {chunks}. Expected nested sequence of integers."
1234+
f"Invalid variable chunks: {chunks}. Expected nested sequence of integers "
1235+
f"or RLE format [[value, count]]."
11471236
) from e
11481237

11491238
# Validate dimensionality
@@ -1179,6 +1268,7 @@ def parse_chunk_grid(
11791268
returns a concrete ChunkGrid instance:
11801269
- ChunkGrid instances: Returned as-is
11811270
- Nested sequences (e.g., [[10, 20], [5, 5]]): Converted to RectilinearChunkGrid (Zarr v3 only)
1271+
- Nested sequences with RLE (e.g., [[[10, 6]], [[10, 5]]]): Expanded and converted to RectilinearChunkGrid
11821272
- Regular tuples/ints (e.g., (10, 10) or 10): Converted to RegularChunkGrid
11831273
- Literal "auto": Computed using auto-chunking heuristics and converted to RegularChunkGrid
11841274
@@ -1187,10 +1277,13 @@ def parse_chunk_grid(
11871277
chunks : tuple[int, ...] | Sequence[Sequence[int]] | ChunkGrid | Literal["auto"] | int
11881278
The chunks parameter to parse. Can be:
11891279
- A ChunkGrid instance
1190-
- A nested sequence for variable-sized chunks
1280+
- A nested sequence for variable-sized chunks (supports RLE format)
11911281
- A tuple of integers for uniform chunks
11921282
- A single integer (for 1D arrays or uniform chunks across all dimensions)
11931283
- The literal "auto"
1284+
1285+
RLE (Run-Length Encoding) format: [[value, count]] expands to 'count' repetitions of 'value'.
1286+
Example: [[[10, 6]]] creates 6 chunks of size 10 each.
11941287
shape : ShapeLike
11951288
The shape of the array. Required to create RegularChunkGrid for "auto" or tuple inputs.
11961289
item_size : int, default=1
@@ -1227,6 +1320,13 @@ def parse_chunk_grid(
12271320
>>> result.chunk_shapes
12281321
((10, 20, 30), (5, 5))
12291322
1323+
>>> # RLE format for RectilinearChunkGrid
1324+
>>> result = parse_chunk_grid([[[10, 6]], [[10, 5]]], shape=(60, 50), zarr_format=3)
1325+
>>> type(result).__name__
1326+
'RectilinearChunkGrid'
1327+
>>> result.chunk_shapes
1328+
((10, 10, 10, 10, 10, 10), (10, 10, 10, 10, 10))
1329+
12301330
>>> # Regular tuple
12311331
>>> result = parse_chunk_grid((10, 10), shape=(100, 100))
12321332
>>> type(result).__name__

src/zarr/testing/strategies.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,12 @@
1414
from zarr.abc.store import RangeByteRequest, Store
1515
from zarr.codecs.bytes import BytesCodec
1616
from zarr.core.array import Array
17-
from zarr.core.chunk_grids import ChunkGrid, RectilinearChunkGrid, RegularChunkGrid
17+
from zarr.core.chunk_grids import (
18+
ChunkGrid,
19+
RectilinearChunkGrid,
20+
RegularChunkGrid,
21+
_expand_run_length_encoding,
22+
)
1823
from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding
1924
from zarr.core.common import JSON, ZarrFormat
2025
from zarr.core.dtype import get_data_type_from_native_dtype
@@ -568,8 +573,9 @@ def chunk_paths(draw: st.DrawFn, ndim: int, numblocks: tuple[int, ...], subset:
568573
def complex_chunk_grids(draw: st.DrawFn) -> RectilinearChunkGrid:
569574
ndim = draw(st.integers(min_value=1, max_value=3))
570575
nchunks = draw(st.integers(min_value=10, max_value=100))
576+
# Don't require unique chunk sizes - rectilinear grids can have repeated sizes
571577
dim_chunks = st.lists(
572-
st.integers(min_value=1, max_value=10), unique=True, min_size=nchunks, max_size=nchunks
578+
st.integers(min_value=1, max_value=10), min_size=nchunks, max_size=nchunks
573579
)
574580
if draw(st.booleans()):
575581
event("using RectilinearChunkGrid")
@@ -585,7 +591,11 @@ def complex_chunk_grids(draw: st.DrawFn) -> RectilinearChunkGrid:
585591
[[c, r] for c, r in zip(draw(dim_chunks), draw(repeats), strict=True)]
586592
for _ in range(ndim)
587593
]
588-
return RectilinearChunkGrid(chunk_shapes=chunk_shapes_rle)
594+
# Expand RLE to explicit chunk shapes before passing to __init__
595+
chunk_shapes_expanded = [
596+
_expand_run_length_encoding(dim_rle) for dim_rle in chunk_shapes_rle
597+
]
598+
return RectilinearChunkGrid(chunk_shapes=chunk_shapes_expanded)
589599

590600

591601
@st.composite

0 commit comments

Comments
 (0)