Move load_npy to C++ (#849)

mthrok · web-flow · commit 5d181f63ec91 · 2025-07-25T07:43:10.000-04:00
So that the GIL is released entirely. Performance (based on [data_formats.py](https://github.com/facebookresearch/spdl/blob/c7db5be1512f8c5f17b07b047196cfb22e01624c/examples/data_formats.py)) * QPS of loading NPY files with multi-threading | Concurrency | Baseline | C++ | Improvement | |-------------|----------|------|-------------| | 32 | 1577 | 1891 | 17% | | 16 | 1561 | 1717 | 9% | | 8 | 1592 | 1859 | 14% | | 4 | 1590 | 1812 | 12% | | 2 | 1769 | 1820 | 3% | | 1 | 1690 | 1948 | 13% | <img width="668" height="410" alt="Screenshot 2025-07-25 at 4 59 08 PM" src="https://github.com/user-attachments/assets/d8cbcccf-7607-4b84-8b41-612e6ea8a300" /> * QPS of loading NPZ (no compression) files with multi-threading | Concurrency | Baseline | C++ | Improvement | |-------------|----------|------|-------------| | 32 | 1577 | 1726 | 9% | | 16 | 1495 | 1755 | 15% | | 8 | 1607 | 1754 | 8% | | 4 | 1591 | 1677 | 5% | | 2 | 1637 | 1781 | 8% | | 1 | 1658 | 1829 | 9% | <img width="686" height="420" alt="Screenshot 2025-07-25 at 5 02 11 PM" src="https://github.com/user-attachments/assets/26a99e09-38c4-4f5f-b88b-8f9034b914d9" /> * QPS of loading NPZ (with compression) files with multi-threading | Concurrency | Baseline | C++ | Improvement | |-------------|----------|------|-------------| | 32 | 1192 | 1473 | 19% | | 16 | 1241 | 1633 | 24% | | 8 | 1230 | 1640 | 25% | | 4 | 1305 | 1548 | 16% | | 2 | 1379 | 1547 | 11% | | 1 | 1277 | 1571 | 19% | <img width="682" height="418" alt="Screenshot 2025-07-25 at 5 05 32 PM" src="https://github.com/user-attachments/assets/41f33bb6-76f7-46de-b5e3-8bc035b5fe93" /> The performance of multiprocessing stays roughly the same.
diff --git a/src/spdl/io/_array.py b/src/spdl/io/_array.py
@@ -9,14 +9,9 @@
     "load_npz",
     "NpzFile",
 ]
-import ast
-import struct
 from collections.abc import Iterator, Mapping
-from dataclasses import dataclass
-from typing import Any
 
 import numpy as np
-from numpy.lib.format import MAGIC_LEN, MAGIC_PREFIX
 from numpy.typing import NDArray
 
 # Importing `spdl.io.lib` instead of `spdl.io.lilb._archive`
@@ -26,35 +21,8 @@
 # pyre-strict
 
 
-@dataclass
-class _ArrayInterface:
-    shape: tuple[int, ...]  # pyre-ignore: [35]
-    typestr: str  # pyre-ignore: [35]
-    data: memoryview  # pyre-ignore: [35]
-    offset: int = 0  # pyre-ignore: [35]
-    version: int = 3  # pyre-ignore: [35]
-
-    @property
-    def __array_interface__(self) -> dict[str, Any]:
-        return {
-            "shape": self.shape,
-            "typestr": self.typestr,
-            "data": self.data,
-            "offset": self.offset,
-            "version": self.version,
-        }
-
-
-def _get_header_size_info(version: tuple[int, int]) -> tuple[str, str]:
-    match version:
-        case (1, 0):
-            return ("<H", "latin1")
-        case (2, 0):
-            return ("<I", "latin1")
-        case (3, 0):
-            return ("<I", "utf8")
-        case _:
-            raise ValueError(f"Unexpected version {version}.")
+def _get_pointer(data: bytes) -> int:
+    """Return the memory address of the first byte of ``data`` as an integer.
+
+    Only the address is returned. It does not hold a reference to ``data``,
+    so the caller must keep ``data`` alive for as long as the address is used.
+    """
+    view = np.frombuffer(data, dtype=np.byte)
+    return view.ctypes.data
 
 
 def load_npy(
@@ -111,50 +79,8 @@ def load_npy(
        `creates a new array <https://github.com/numpy/numpy/blob/v2.2.0/numpy/_core/records.py#L935-L939>`_.
 
     """
-    if len(data) < MAGIC_LEN:
-        raise ValueError("The input data is too short.")
-
-    view = memoryview(data)
-    magic_str = view[:MAGIC_LEN].tobytes()
-    if not magic_str.startswith(MAGIC_PREFIX):
-        raise ValueError(rf"Expected the data to start with {MAGIC_PREFIX}.")
-
-    major, minor = magic_str[-2:]
-    hlength_type, encoding = _get_header_size_info((major, minor))
-
-    info_length_size = struct.calcsize(hlength_type)
-    info_start = MAGIC_LEN + info_length_size
-
-    if len(data) < info_start:
-        raise ValueError("Failed to parse info. The input data is invalid.")
-    info_length_str = data[MAGIC_LEN:info_start]
-    info_length = struct.unpack(hlength_type, info_length_str)[0]
-
-    data_start = info_start + info_length
-    if len(data) < data_start:
-        raise ValueError(
-            "Failed to parse data. The recorded data size exceeds the provided data size."
-        )
-    info_str = view[info_start:data_start].tobytes()
-
-    info = ast.literal_eval(info_str.decode(encoding))
-
-    if info.get("fortran_order"):
-        raise ValueError(
-            "Array saved with `format_order=True is not supported. Please use `numpy.load`."
-        )
-
-    # TODO: Try `numpy.frombuffer``
-    # https://github.com/numpy/numpy/blob/e20317a43d3714f9085ad959f68c1ba6bc998fcd/numpy/_core/src/multiarray/ctors.c#L3711
-    aif = _ArrayInterface(
-        shape=info["shape"],
-        typestr=info["descr"],
-        data=view,
-        offset=data_start,
-        version=2,
-    )
-
-    return np.array(aif, copy=copy)
+    buffer = _libspdl._archive.load_npy(_get_pointer(data), len(data))
+    return np.array(buffer, copy=copy)
 
 
 class NpzFile(Mapping):
@@ -168,7 +94,8 @@ class NpzFile(Mapping):
     """
 
     def __init__(self, data: bytes, meta: dict[str, tuple[int, int, int, int]]) -> None:
-        self._data = memoryview(data)  # pyre-ignore
+        self._data: int = _get_pointer(data)
+        self._len: int = len(data)
         self._meta = meta
         self.files: list[str] = [f.removesuffix(".npy") for f in meta]
 
@@ -192,16 +119,18 @@ def __getitem__(self, key: str) -> NDArray:
         else:
             raise KeyError(f"{key} is not a file in the archive")
 
-        start, compressed_size, uncompressed_size, compression_method = self._meta[key]
+        offset, compressed_size, uncompressed_size, compression_method = self._meta[key]
         match compression_method:
             case 0:
-                return load_npy(self._data[start : start + compressed_size])
+                buffer = _libspdl._archive.load_npy(
+                    self._data, size=compressed_size, offset=offset
+                )
+                return np.array(buffer, copy=False)
             case 8:
-                return load_npy(
-                    _libspdl._archive.inflate(
-                        self._data.obj, start, compressed_size, uncompressed_size
-                    )
+                buffer = _libspdl._archive.load_npy_compressed(
+                    self._data, offset, compressed_size, uncompressed_size
                 )
+                return np.array(buffer, copy=False)
             case _:
                 raise ValueError(
                     "Compression method other than DEFLATE is not supported."
diff --git a/src/spdl/io/lib/archive/CMakeLists.txt b/src/spdl/io/lib/archive/CMakeLists.txt
@@ -11,7 +11,7 @@ message(STATUS "########################################")
 set(name _archive)
 message(STATUS "Building ${name}")
 
-set(srcs register.cpp zip_impl.cpp)
+set(srcs register.cpp zip_impl.cpp numpy_support.cpp)
 set(deps ZLIB::ZLIB fmt::fmt glog::glog)
 set(nb_options
   STABLE_ABI
diff --git a/src/spdl/io/lib/archive/numpy_support.cpp b/src/spdl/io/lib/archive/numpy_support.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "numpy_support.h"
+#include "zip_impl.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <utility>
+
+namespace spdl::archive {
+
+//////////////////////////////////////////////////////////////////////////////
+// load_npy
+//////////////////////////////////////////////////////////////////////////////
+
+namespace {
+
+// Verify that the buffer begins with the NPY magic string and step past it.
+//
+// On success, `*data` is advanced by the length of the magic string and
+// `*size` is reduced by the same amount.
+// Throws `std::runtime_error` if the buffer is shorter than the magic string
+// or does not start with it.
+void check_magic(const char** data, size_t* size) {
+  static const char* const kMagic = "\x93NUMPY";
+  static const size_t kMagicLen = std::strlen(kMagic);
+  const char* const cursor = *data;
+  const size_t remaining = *size;
+  if (remaining < kMagicLen) {
+    throw std::runtime_error(
+        "Failed to parse the magic prefix. (data too short)");
+  }
+  // The magic string has no embedded NUL, so a bytewise comparison gives the
+  // same verdict as a C-string comparison bounded by its length.
+  if (std::memcmp(cursor, kMagic, kMagicLen) != 0) {
+    throw std::runtime_error(
+        "The data must start with the prefix '\\x93NUMPY'");
+  }
+  *data = cursor + kMagicLen;
+  *size = remaining - kMagicLen;
+}
+
+// Decode an `n`-byte little-endian unsigned integer starting at `p`.
+//
+// Every byte is converted through `unsigned char` before widening. Plain
+// `char` may be signed, in which case a byte >= 0x80 would be sign-extended
+// and corrupt the decoded value (e.g. a header length of 182 = 0xB6).
+size_t read_le_uint(const char* p, size_t n) {
+  size_t value = 0;
+  for (size_t i = 0; i < n; ++i) {
+    value |= static_cast<size_t>(static_cast<unsigned char>(p[i])) << (8 * i);
+  }
+  return value;
+}
+
+// Read the format version and header length that follow the magic string and
+// return a view of the header text.
+//
+// Expected layout at `*data`: 1 byte major version, 1 byte minor version
+// (ignored), then the header length in little endian (2 bytes for version 1,
+// 4 bytes for versions 2 and 3), then the header itself.
+//
+// On success, `*data` is advanced to the first byte after the header and
+// `*size` is reduced accordingly. The returned view refers to the caller's
+// buffer; nothing is copied.
+// Throws `std::runtime_error` if the buffer is truncated, the major version
+// is not 1, 2 or 3, or a version 2/3 header length is zero.
+std::string_view extract_header(const char** data, size_t* size) {
+  auto s = (*size);
+  auto* d = (*data);
+  if (s < 2) {
+    throw std::runtime_error("Failed to parse version number.");
+  }
+  const int major = static_cast<unsigned char>(d[0]);
+  // d[1] is the minor version, which is not needed to locate the header.
+  s -= 2;
+  d += 2;
+
+  // Width, in bytes, of the little-endian header-length field.
+  size_t len_bytes = 0;
+  switch (major) {
+    case 1:
+      len_bytes = 2;
+      break;
+    case 2:
+    case 3:
+      len_bytes = 4;
+      break;
+    default:
+      throw std::runtime_error(
+          "Unexpected format version. Only 1, 2 and 3 are supported.");
+  }
+  if (s < len_bytes) {
+    throw std::runtime_error("Failed to parse header length.");
+  }
+  const size_t len = read_le_uint(d, len_bytes);
+  if (major != 1 && len == 0) {
+    throw std::runtime_error(
+        "Invalid data. The header length must be greater than 0.");
+  }
+  s -= len_bytes;
+  d += len_bytes;
+  if (s < len) {
+    throw std::runtime_error("Failed to parse header");
+  }
+  *data = d + len;
+  *size = s - len;
+  return std::string_view{d, len};
+}
+
+// Parse the textual NPY header into an `NPYArray`.
+//
+// The header is the string form of a Python dictionary with the following
+// keys. See:
+// https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html
+//
+// - "descr": Format description. e.g. "'<i8'", "'<f4'"
+// - "fortran_order": "True" or "False".
+// - "shape": Tuple of int. e.g.  "()", "(3, 4, 5)"
+//
+// This is a keyword scan, not a general Python-literal parser. Only a
+// single-quoted string `descr` is supported; anything else (for example the
+// list form used by structured dtypes) is rejected rather than misread.
+//
+// Returns an `NPYArray` with `descr`, `shape` and `fortran_order` filled in;
+// `data` and `buffer` are left at their defaults. `fortran_order` keeps its
+// default when the key is absent or its value starts with neither 'T' nor
+// 'F'.
+// Throws `std::runtime_error` when `descr` or `shape` is missing or
+// malformed, or when a dimension is negative. Exceptions thrown by
+// `std::stoull` (`std::invalid_argument`, `std::out_of_range`) propagate for
+// dimensions that are not numbers or do not fit.
+NPYArray parse_header(const std::string_view header) {
+  // `std::isspace` has undefined behavior for negative `char` values, so the
+  // argument is taken as `unsigned char`.
+  const auto is_space = [](unsigned char c) { return std::isspace(c) != 0; };
+
+  NPYArray ret;
+  {
+    constexpr std::string_view key = "'descr':";
+    size_t pos = header.find(key);
+    if (pos == std::string_view::npos) {
+      throw std::runtime_error("Failed to parse header `'descr'`.");
+    }
+    pos += key.size();
+    while (pos < header.size() && is_space(header[pos])) {
+      ++pos;
+    }
+    // The value must be a quoted string. Without this check a list-valued
+    // `descr` would yield its first quoted token as the dtype.
+    if (pos >= header.size() || header[pos] != '\'') {
+      throw std::runtime_error("Failed to parse header `'descr'`.");
+    }
+    const size_t end_pos = header.find('\'', pos + 1);
+    if (end_pos == std::string_view::npos) {
+      throw std::runtime_error("Failed to parse header `'descr'`.");
+    }
+    ret.descr = header.substr(pos + 1, end_pos - pos - 1);
+  }
+  {
+    size_t pos = header.find("'shape':");
+    if (pos == std::string_view::npos) {
+      throw std::runtime_error("Failed to parse header `'shape'`.");
+    }
+    pos = header.find('(', pos);
+    if (pos == std::string_view::npos) {
+      throw std::runtime_error("Failed to parse header `'shape'`.");
+    }
+    const size_t end_pos = header.find(')', pos);
+    if (end_pos == std::string_view::npos) {
+      throw std::runtime_error("Failed to parse header `'shape'`.");
+    }
+    std::string shape_str(header.substr(pos + 1, end_pos - pos - 1));
+    std::istringstream shape_stream(shape_str);
+    std::string number;
+    while (std::getline(shape_stream, number, ',')) {
+      number.erase(
+          std::remove_if(number.begin(), number.end(), is_space),
+          number.end());
+      // Empty pieces come from "()" and from the trailing comma in "(3,)".
+      if (number.empty()) {
+        continue;
+      }
+      // `std::stoull` accepts a leading '-' and wraps the result, which
+      // would turn a negative dimension into a huge one.
+      if (number.front() == '-') {
+        throw std::runtime_error("Failed to parse header `'shape'`.");
+      }
+      // Parse at full width so dimensions beyond the range of `int` are
+      // accepted.
+      ret.shape.push_back(static_cast<size_t>(std::stoull(number)));
+    }
+  }
+  {
+    const std::string key = "'fortran_order':";
+    size_t pos = header.find(key);
+    if (pos != std::string_view::npos) {
+      pos += key.length();
+      while (pos < header.size() && is_space(header[pos])) {
+        ++pos;
+      }
+      if (pos < header.size() && header[pos] == 'T') {
+        ret.fortran_order = true;
+      } else if (pos < header.size() && header[pos] == 'F') {
+        ret.fortran_order = false;
+      }
+    }
+  }
+  return ret;
+}
+} // namespace
+
+// Parse an in-memory NPY blob.
+//
+// `data`/`size` describe the whole blob, starting at the magic string. The
+// returned `NPYArray::data` points into the caller's buffer just past the
+// header; no array bytes are copied here.
+NPYArray load_npy(const char* data, size_t size) {
+  const char* cursor = data;
+  size_t remaining = size;
+  check_magic(&cursor, &remaining);
+  NPYArray array = parse_header(extract_header(&cursor, &remaining));
+  // `NPYArray::data` is a non-const `void*`, hence the const_cast.
+  array.data = const_cast<char*>(cursor);
+  return array;
+}
+
+// Decompress an NPY blob with `zip::inflate` and parse the result.
+//
+// `data` and `compressed_size` are handed to `zip::inflate` together with a
+// freshly allocated buffer of `uncompressed_size` bytes. That buffer is then
+// parsed by `load_npy`, and its ownership is transferred to the returned
+// `NPYArray` (`buffer`), whose `data` pointer refers into it.
+NPYArray load_npy_compressed(
+    const char* data,
+    uint32_t compressed_size,
+    uint32_t uncompressed_size) {
+  auto inflated = std::make_unique<char[]>(uncompressed_size);
+  zip::inflate(data, compressed_size, inflated.get(), uncompressed_size);
+  NPYArray array = load_npy(inflated.get(), uncompressed_size);
+  // Moving the unique_ptr does not relocate the allocation, so `array.data`
+  // stays valid.
+  array.buffer = std::move(inflated);
+  return array;
+}
+
+} // namespace spdl::archive
diff --git a/src/spdl/io/lib/archive/numpy_support.h b/src/spdl/io/lib/archive/numpy_support.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace spdl::archive {
+
+// Result of parsing an NPY blob: the header fields plus a pointer to the
+// array bytes.
+struct NPYArray {
+  // Value of the header's 'descr' entry, e.g. "<i8" or "<f4".
+  std::string descr;
+
+  // Value of the header's 'fortran_order' entry.
+  bool fortran_order{false};
+
+  // Values of the header's 'shape' tuple; empty for "()".
+  std::vector<size_t> shape;
+
+  // Pointer to the array data (not owned)
+  void* data{nullptr};
+
+  // Owned data (optional)
+  std::unique_ptr<char[]> buffer;
+};
+
+NPYArray load_npy(const char*, size_t);
+NPYArray load_npy_compressed(const char*, uint32_t, uint32_t);
+
+} // namespace spdl::archive
diff --git a/src/spdl/io/lib/archive/register.cpp b/src/spdl/io/lib/archive/register.cpp
diff --git a/tests/spdl_unittest/io/array_test.py b/tests/spdl_unittest/io/array_test.py