-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Add chunks='auto' support for cftime datasets #10527
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
eb1a967
852476d
c921c59
1aba531
9429c3d
3c9d27e
5153d2d
62e71e6
cfdc31b
2f16bc7
ce720fa
4fa58c1
e58d6d7
590e503
f953976
6706524
4e56acd
0d008cd
49c4e9c
4594099
5d00b0a
80421ef
d1f7ad3
1b7de62
4407185
d8f45b2
20226c1
11ac9f0
8485df5
2c27877
0983261
c4ec31f
adbf5b2
6c93bc4
74bc0ea
0b9bbd0
e58322f
dbc6ebd
5680663
b5933ed
5db9225
600c0fd
9fcc6eb
dc83692
1e1bbf3
9443815
db52c62
85ebafd
e2627c6
0bca828
a930a65
cbcb640
70208e0
3f0d3aa
e944eb4
1393351
90242d1
92bb538
1e3a015
861cc57
16ccc78
1bd2f32
9dead77
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,9 +1,12 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import importlib | ||
| import itertools | ||
| import sys | ||
| import warnings | ||
| from collections.abc import Hashable, Iterable, Iterator, Mapping | ||
| from functools import lru_cache | ||
| from numbers import Number | ||
| from typing import TYPE_CHECKING, Any, TypeVar, cast | ||
|
|
||
| import numpy as np | ||
|
|
@@ -23,7 +26,9 @@ | |
| DaskArray = NDArray # type: ignore[assignment, misc] | ||
| DaskCollection: Any = NDArray # type: ignore[no-redef] | ||
|
|
||
| from xarray.namedarray._typing import _Dim, duckarray | ||
| from xarray.core.types import T_ChunkDim | ||
| from xarray.namedarray._typing import DuckArray, _Dim, duckarray | ||
| from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint | ||
|
|
||
|
|
||
| K = TypeVar("K") | ||
|
|
@@ -195,6 +200,106 @@ def either_dict_or_kwargs( | |
| return pos_kwargs | ||
|
|
||
|
|
||
def _get_chunk(  # type: ignore[no-untyped-def]
    data: DuckArray[Any],
    chunks,
    chunkmanager: ChunkManagerEntrypoint[Any],
    *,
    preferred_chunks,
    dims=None,
) -> Mapping[Any, T_ChunkDim]:
    """
    Return map from each dim to chunk sizes, accounting for backend's preferred chunks.
    """
    from xarray.core.common import _contains_cftime_datetimes
    from xarray.core.utils import emit_user_level_warning
    from xarray.structure.chunks import _get_breaks_cached

    if dims is None:
        dims = chunks.keys()
    shape = data.shape

    # Backend-preferred chunk sizes per dimension (None where unspecified).
    preferred_shape = tuple(
        itertools.starmap(preferred_chunks.get, zip(dims, shape, strict=True))
    )

    # A scalar or "auto" spec applies uniformly to every dimension.
    if isinstance(chunks, Number) or chunks == "auto":
        chunks = dict.fromkeys(dims, chunks)

    # Determine the explicit requested chunks: an explicit request wins,
    # otherwise fall back to the backend's preferred chunking.
    requested_shape = tuple(
        chunks.get(dim, None) or preferred
        for dim, preferred in zip(dims, preferred_shape, strict=True)
    )

    # cftime arrays have object dtype, which byte-based chunk sizing cannot
    # handle; substitute float64 together with a proportionally rescaled limit.
    limit: int | None
    if _contains_cftime_datetimes(data):
        limit, dtype = fake_target_chunksize(data, chunkmanager.get_auto_chunk_size())
    else:
        limit = None
        dtype = data.dtype

    normalized_shape = chunkmanager.normalize_chunks(
        requested_shape,
        shape=shape,
        dtype=dtype,
        limit=limit,
        previous_chunks=preferred_shape,
    )

    # Warn where requested chunks break preferred chunks, provided that the
    # variable contains data.
    if data.size:  # type: ignore[unused-ignore,attr-defined] # DuckArray protocol doesn't include 'size' - should it?
        for dim, size, chunk_sizes in zip(dims, shape, normalized_shape, strict=True):
            if preferred_chunk_sizes := preferred_chunks.get(dim):
                disagreement = _get_breaks_cached(
                    size=size,
                    chunk_sizes=chunk_sizes,
                    preferred_chunk_sizes=preferred_chunk_sizes,
                )
                if disagreement:
                    emit_user_level_warning(
                        "The specified chunks separate the stored chunks along "
                        f'dimension "{dim}" starting at index {disagreement}. This could '
                        "degrade performance. Instead, consider rechunking after loading.",
                    )

    return dict(zip(dims, normalized_shape, strict=True))
|
|
||
|
|
||
def fake_target_chunksize(
    data: DuckArray[Any],
    limit: int,
) -> tuple[int, np.dtype[Any]]:
    """
    The `normalize_chunks` algorithm takes a size `limit` in bytes, but will not
    work for object dtypes. So we rescale the `limit` to an appropriate one based
    on `float64` dtype, and pass that to `normalize_chunks`.

    Arguments
    ---------
    data : Variable or ChunkedArray
        The data for which we want to determine chunk sizes.
    limit : int
        The target chunk size in bytes. Passed to the chunk manager's `normalize_chunks` method.
    """
    from xarray.core.common import _contains_cftime_datetimes

    # Short circuit: non-cftime data keeps its own dtype and the limit as given.
    if not _contains_cftime_datetimes(data):
        return limit, data.dtype

    from xarray.core.formatting import first_n_items

    proxy_dtype = np.dtype(np.float64)

    # NOTE(review): sys.getsizeof of one extracted item includes Python object
    # overhead, so this only approximates the per-element memory of a cftime
    # value — see the review discussion about using getsizeof here.
    per_item_nbytes: int = sys.getsizeof(first_n_items(data, 1))  # type: ignore[no-untyped-call]

    # Shrink the byte limit by float64-size / cftime-object-size so that sizing
    # chunks as if the data were float64 targets the same real memory budget.
    scaled_limit = int(limit * (proxy_dtype.itemsize / per_item_nbytes))

    return scaled_limit, proxy_dtype
|
|
||
|
|
||
| class ReprObject: | ||
| """Object that prints as the given value, for use with sentinel values.""" | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
@tomwhite is there an equivalent for cubed? I didn't see it in the docs...