|
| 1 | +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import fsspec |
| 6 | + |
| 7 | +import awkward as ak |
| 8 | +from awkward._dispatch import high_level_function |
| 9 | + |
| 10 | +__all__ = ("from_safetensors",) |
| 11 | + |
| 12 | + |
@high_level_function()
def from_safetensors(
    source,
    *,
    storage_options=None,
    virtual=False,
    # ak.from_buffers kwargs
    buffer_key="{form_key}-{attribute}",
    backend="cpu",
    byteorder="<",
    allow_noncanonical_form=False,
    highlevel=True,
    behavior=None,
    attrs=None,
):
    """
    Args:
        source (path-like): Local file name/path or remote URL of the
            safetensors file. It is resolved with
            [fsspec.core.url_to_fs](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.url_to_fs).
        storage_options (None or dict): Extra keyword options forwarded to
            [fsspec.core.url_to_fs](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.url_to_fs)
            when resolving `source`. None means no extra options.
        virtual (bool, optional): If True, build a virtual (lazy) Awkward
            Array; each buffer is handed to the reconstruction step as a
            zero-argument callable instead of a plain array. Defaults to False.
        buffer_key (str, optional): Template used to look up buffer names,
            with the placeholders `{form_key}` and `{attribute}`.
            Defaults to "{form_key}-{attribute}".
        backend (str, optional): Name of the backend that should own the
            reconstructed array (e.g., "cpu"). Defaults to "cpu".
        byteorder (str, optional): Byte order of the stored buffers: "<"
            (little-endian, the default) or ">".
        allow_noncanonical_form (bool, optional): If True, accept a stored
            form that is not in canonical shape and normalize it during
            reconstruction. Defaults to False.
        highlevel (bool, optional): If True, return a high-level #ak.Array;
            if False, return the low-level layout. Defaults to True.
        behavior (Mapping | None, optional): Custom behavior mapping for the
            returned array.
        attrs (Mapping | None, optional): Custom attributes (metadata) for
            the returned array.

    Returns:
        ak.Array or ak.contents.Content: The array (or, with
        `highlevel=False`, the layout) rebuilt from the safetensors buffers.

    Load a safetensors file as an Awkward Array.

    Ref: https://huggingface.co/docs/safetensors/.

    The tensors stored in the file are used as the buffers of the array;
    they are matched to the nodes of the form through the `buffer_key`
    template. Optional `behavior` and `attrs` are attached to the result.

    The metadata of the safetensors file **must contain** the entries `form`
    and `length`; they describe the structure and the length of the array
    to reconstruct.

    Example:

        >>> import awkward as ak
        >>> arr = ak.from_safetensors("out.safetensors")
        >>> arr  # doctest: +SKIP
        <Array [[1, 2, 3], [], [4]] type='3 * var * int64'>

    Create a virtual (lazy) array that references buffers without materializing them:

        >>> virtual_arr = ak.from_safetensors("out.safetensors", virtual=True)
        >>> virtual_arr  # doctest: +SKIP
        <Array [??, ??, ??] type='3 * var * int64'>

    See also #ak.to_safetensors.
    """
    # Delegate to the implementation; keywords keep the mapping explicit.
    return _impl(
        source=source,
        storage_options=storage_options,
        virtual=virtual,
        buffer_key=buffer_key,
        backend=backend,
        byteorder=byteorder,
        allow_noncanonical_form=allow_noncanonical_form,
        highlevel=highlevel,
        behavior=behavior,
        attrs=attrs,
    )
| 94 | + |
| 95 | + |
| 96 | +def _impl( |
| 97 | + source, |
| 98 | + storage_options, |
| 99 | + virtual, |
| 100 | + buffer_key, |
| 101 | + backend, |
| 102 | + byteorder, |
| 103 | + allow_noncanonical_form, |
| 104 | + highlevel, |
| 105 | + behavior, |
| 106 | + attrs, |
| 107 | +): |
| 108 | + try: |
| 109 | + from safetensors import _safe_open_handle |
| 110 | + except ImportError as err: |
| 111 | + raise ImportError( |
| 112 | + """to use ak.from_tensorflow, you must install the 'safetensors' package with: |
| 113 | +
|
| 114 | + pip install safetensors |
| 115 | +or |
| 116 | + conda install -c huggingface safetensors""" |
| 117 | + ) from err |
| 118 | + |
| 119 | + fs, source = fsspec.core.url_to_fs(source, **(storage_options or {})) |
| 120 | + |
| 121 | + buffers = {} |
| 122 | + |
| 123 | + def maybe_virtualize(x): |
| 124 | + return (lambda: x) if virtual else x |
| 125 | + |
| 126 | + with fs.open(source, "rb") as f: |
| 127 | + with _safe_open_handle(f, framework="np") as g: |
| 128 | + metadata = g.metadata() |
| 129 | + for k in g.offset_keys(): |
| 130 | + buffers[k] = maybe_virtualize(g.get_tensor(k)) |
| 131 | + |
| 132 | + if "form" not in metadata or "length" not in metadata: |
| 133 | + raise RuntimeError( |
| 134 | + "Missing required metadata in safetensors file: 'form' and 'length' are required." |
| 135 | + ) |
| 136 | + form = ak.forms.from_json(metadata["form"]) |
| 137 | + length = int(metadata["length"]) |
| 138 | + |
| 139 | + # reconstruct array |
| 140 | + return ak.ak_from_buffers._impl( |
| 141 | + form, |
| 142 | + length, |
| 143 | + buffers, |
| 144 | + buffer_key=buffer_key, |
| 145 | + backend=backend, |
| 146 | + byteorder=byteorder, |
| 147 | + simplify=allow_noncanonical_form, |
| 148 | + highlevel=highlevel, |
| 149 | + behavior=behavior, |
| 150 | + attrs=attrs, |
| 151 | + ) |
0 commit comments