@@ -109,6 +109,67 @@ def _expand_run_length_encoding(spec: Sequence[ChunkEdgeLength]) -> tuple[int, .
109109 return tuple (result )
110110
111111
112+ def _compress_run_length_encoding (chunks : tuple [int , ...]) -> list [int | list [int ]]:
113+ """
114+ Compress a sequence of chunk sizes to RLE format where beneficial.
115+
116+ This function automatically detects runs of identical values and compresses them
117+ using the [value, count] format. Single values or short runs are kept as-is.
118+
119+ Parameters
120+ ----------
121+ chunks : tuple[int, ...]
122+ Sequence of chunk sizes along one dimension
123+
124+ Returns
125+ -------
126+ list[int | list[int]]
127+ Compressed representation using RLE where beneficial
128+
129+ Examples
130+ --------
131+ >>> _compress_run_length_encoding((10, 10, 10, 10, 10, 10))
132+ [[10, 6]]
133+ >>> _compress_run_length_encoding((10, 20, 30))
134+ [10, 20, 30]
135+ >>> _compress_run_length_encoding((10, 10, 10, 20, 20, 30))
136+ [[10, 3], [20, 2], 30]
137+ >>> _compress_run_length_encoding((5, 5, 10, 10, 10, 10, 15))
138+ [[5, 2], [10, 4], 15]
139+ """
140+ if not chunks :
141+ return []
142+
143+ result : list [int | list [int ]] = []
144+ current_value = chunks [0 ]
145+ current_count = 1
146+
147+ for value in chunks [1 :]:
148+ if value == current_value :
149+ current_count += 1
150+ else :
151+ # Decide whether to use RLE or explicit value
152+ # Use RLE if count >= 3 to save space (tradeoff: [v,c] vs v,v,v)
153+ if current_count >= 3 :
154+ result .append ([current_value , current_count ])
155+ elif current_count == 2 :
156+ # For count=2, RLE doesn't save space, but use it for consistency
157+ result .append ([current_value , current_count ])
158+ else :
159+ result .append (current_value )
160+
161+ current_value = value
162+ current_count = 1
163+
164+ # Handle the last run
165+ if current_count >= 3 or current_count == 2 :
166+ result .append ([current_value , current_count ])
167+ else :
168+ result .append (current_value )
169+
170+ return result
171+
172+
112173def _parse_chunk_shapes (
113174 data : Sequence [Sequence [ChunkEdgeLength ]],
114175) -> tuple [tuple [int , ...], ...]:
@@ -554,21 +615,33 @@ def _from_dict(cls, data: dict[str, JSON]) -> Self:
554615
def to_dict(self) -> dict[str, JSON]:
    """
    Serialize this chunk grid to its metadata dictionary.

    The chunk sizes of each axis in ``self.chunk_shapes`` are passed through
    ``_compress_run_length_encoding``, so runs of two or more identical
    chunk sizes are stored as ``[value, count]`` pairs rather than being
    written out in full.

    Returns
    -------
    dict[str, JSON]
        Dictionary with a ``'name'`` key (``"rectilinear"``) and a
        ``'configuration'`` key holding ``'kind'`` (``"inline"``) and the
        run-length encoded ``'chunk_shapes'``, one entry per axis.

    Examples
    --------
    >>> grid = RectilinearChunkGrid(chunk_shapes=[[10, 10, 10, 10, 10, 10], [5, 5, 5, 5, 5]])
    >>> grid.to_dict()['configuration']['chunk_shapes']
    [[[10, 6]], [[5, 5]]]
    """
    # Encode one axis at a time; the axis order is preserved.
    encoded_axes = []
    for axis_chunks in self.chunk_shapes:
        encoded_axes.append(_compress_run_length_encoding(axis_chunks))

    configuration = {
        "kind": "inline",
        "chunk_shapes": encoded_axes,
    }
    return {"name": "rectilinear", "configuration": configuration}
574647
@@ -1116,15 +1189,21 @@ def _is_nested_sequence(chunks: Any) -> bool:
11161189
11171190
11181191def _normalize_rectilinear_chunks (
1119- chunks : Sequence [Sequence [int ]], shape : tuple [int , ...]
1192+ chunks : Sequence [Sequence [int | Sequence [ int ] ]], shape : tuple [int , ...]
11201193) -> tuple [tuple [int , ...], ...]:
11211194 """
11221195 Normalize and validate variable chunks for RectilinearChunkGrid.
11231196
1197+ Supports both explicit chunk sizes and run-length encoding (RLE).
1198+ RLE format: [[value, count]] expands to 'count' repetitions of 'value'.
1199+
11241200 Parameters
11251201 ----------
1126- chunks : Sequence[Sequence[int]]
1202+ chunks : Sequence[Sequence[int | Sequence[int] ]]
11271203 Nested sequence where each element is a sequence of chunk sizes along that dimension.
1204+ Each chunk size can be:
1205+ - An integer: explicit chunk size
1206+ - A sequence [value, count]: RLE format (expands to 'count' chunks of size 'value')
11281207 shape : tuple[int, ...]
11291208 The shape of the array.
11301209
@@ -1137,13 +1216,23 @@ def _normalize_rectilinear_chunks(
11371216 ------
11381217 ValueError
11391218 If chunks don't match shape or sum incorrectly.
1219+ TypeError
1220+ If chunk specification format is invalid.
1221+
1222+ Examples
1223+ --------
1224+ >>> _normalize_rectilinear_chunks([[10, 20, 30], [25, 25]], (60, 50))
1225+ ((10, 20, 30), (25, 25))
1226+ >>> _normalize_rectilinear_chunks([[[10, 6]], [[10, 5]]], (60, 50))
1227+ ((10, 10, 10, 10, 10, 10), (10, 10, 10, 10, 10))
11401228 """
1141- # Convert to tuple of tuples
1229+ # Expand RLE for each dimension
11421230 try :
1143- chunk_shapes = tuple (tuple ( int ( c ) for c in dim ) for dim in chunks )
1231+ chunk_shapes = tuple (_expand_run_length_encoding ( dim ) for dim in chunks )
11441232 except (TypeError , ValueError ) as e :
11451233 raise TypeError (
1146- f"Invalid variable chunks: { chunks } . Expected nested sequence of integers."
1234+ f"Invalid variable chunks: { chunks } . Expected nested sequence of integers "
1235+ f"or RLE format [[value, count]]."
11471236 ) from e
11481237
11491238 # Validate dimensionality
@@ -1179,6 +1268,7 @@ def parse_chunk_grid(
11791268 returns a concrete ChunkGrid instance:
11801269 - ChunkGrid instances: Returned as-is
11811270 - Nested sequences (e.g., [[10, 20], [5, 5]]): Converted to RectilinearChunkGrid (Zarr v3 only)
1271+ - Nested sequences with RLE (e.g., [[[10, 6]], [[10, 5]]]): Expanded and converted to RectilinearChunkGrid
11821272 - Regular tuples/ints (e.g., (10, 10) or 10): Converted to RegularChunkGrid
11831273 - Literal "auto": Computed using auto-chunking heuristics and converted to RegularChunkGrid
11841274
@@ -1187,10 +1277,13 @@ def parse_chunk_grid(
11871277 chunks : tuple[int, ...] | Sequence[Sequence[int]] | ChunkGrid | Literal["auto"] | int
11881278 The chunks parameter to parse. Can be:
11891279 - A ChunkGrid instance
1190- - A nested sequence for variable-sized chunks
1280+ - A nested sequence for variable-sized chunks (supports RLE format)
11911281 - A tuple of integers for uniform chunks
11921282 - A single integer (for 1D arrays or uniform chunks across all dimensions)
11931283 - The literal "auto"
1284+
1285+ RLE (Run-Length Encoding) format: [[value, count]] expands to 'count' repetitions of 'value'.
1286+ Example: [[[10, 6]]] creates 6 chunks of size 10 each.
11941287 shape : ShapeLike
11951288 The shape of the array. Required to create RegularChunkGrid for "auto" or tuple inputs.
11961289 item_size : int, default=1
@@ -1227,6 +1320,13 @@ def parse_chunk_grid(
12271320 >>> result.chunk_shapes
12281321 ((10, 20, 30), (5, 5))
12291322
1323+ >>> # RLE format for RectilinearChunkGrid
1324+ >>> result = parse_chunk_grid([[[10, 6]], [[10, 5]]], shape=(60, 50), zarr_format=3)
1325+ >>> type(result).__name__
1326+ 'RectilinearChunkGrid'
1327+ >>> result.chunk_shapes
1328+ ((10, 10, 10, 10, 10, 10), (10, 10, 10, 10, 10))
1329+
12301330 >>> # Regular tuple
12311331 >>> result = parse_chunk_grid((10, 10), shape=(100, 100))
12321332 >>> type(result).__name__
0 commit comments