Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
118cb0e
dont build custom or quantized ops on msvc cuda
JacobSzwejbka Oct 18, 2025
cd93bda
dll attempt 1
JacobSzwejbka Oct 18, 2025
0af3f62
add export macro to header definitions
JacobSzwejbka Oct 18, 2025
3368248
voxtral cmake
JacobSzwejbka Oct 18, 2025
450b6fb
cmake dll install changes
JacobSzwejbka Oct 18, 2025
0bbcd71
more cmake stuff to try
JacobSzwejbka Oct 18, 2025
efcdfa1
hacky work to get backend registration
JacobSzwejbka Oct 18, 2025
0b000cd
more cmake checks
JacobSzwejbka Oct 18, 2025
6fca29a
static init hacks
JacobSzwejbka Oct 18, 2025
afadb48
add log
JacobSzwejbka Oct 18, 2025
357bf9f
more hacks
JacobSzwejbka Oct 18, 2025
d2d8acf
hacks
JacobSzwejbka Oct 18, 2025
3f56f13
config specific lib and bin locations
JacobSzwejbka Oct 18, 2025
b3b654d
hacks
JacobSzwejbka Oct 18, 2025
cac7849
hacks
JacobSzwejbka Oct 18, 2025
6040e3a
defs hack
JacobSzwejbka Oct 19, 2025
25667f1
remove cuda ifdef
JacobSzwejbka Oct 19, 2025
1ae9ded
missing semicolon
JacobSzwejbka Oct 19, 2025
53dc564
trying something else
JacobSzwejbka Oct 19, 2025
daae0c0
var name
JacobSzwejbka Oct 19, 2025
b9d1a39
new approach
JacobSzwejbka Oct 19, 2025
c0915f9
test
JacobSzwejbka Oct 19, 2025
971bf49
test
JacobSzwejbka Oct 19, 2025
d1b26ce
remove const
JacobSzwejbka Oct 19, 2025
9dc28d9
hacks
JacobSzwejbka Oct 19, 2025
8b46964
move vectors
JacobSzwejbka Oct 19, 2025
cbacd6c
lint
JacobSzwejbka Oct 19, 2025
bec6c5f
stub
JacobSzwejbka Oct 19, 2025
d8fafd3
stubbed shims
JacobSzwejbka Oct 19, 2025
d0b9034
undo internal change
JacobSzwejbka Oct 19, 2025
e44d74b
we dont have warn level
JacobSzwejbka Oct 19, 2025
ae61575
hacky mmap change
JacobSzwejbka Oct 19, 2025
1d661c4
more hacky mmap edits
JacobSzwejbka Oct 19, 2025
0ab4746
minmax issue
JacobSzwejbka Oct 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,13 @@ announce_configured_options(CCACHE_PROGRAM)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Force logging to always be enabled for this build.
if(NOT EXECUTORCH_ENABLE_LOGGING)
# Avoid pulling in the logging strings, which can be large. Note that this
# will set the compiler flag for all targets in this directory, and for all
# subdirectories included after this point.
add_definitions(-DET_LOG_ENABLED=0)
message(STATUS "EXECUTORCH_ENABLE_LOGGING was OFF; forcing it to ON.")
set(EXECUTORCH_ENABLE_LOGGING
ON
CACHE BOOL "Build with ET_LOG_ENABLED" FORCE
)
endif()

add_definitions(-DET_MIN_LOG_LEVEL=${ET_MIN_LOG_LEVEL})
Expand Down
76 changes: 69 additions & 7 deletions backends/aoti/common_shims.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ namespace aoti {

namespace internal {
// Global storage for tensor metadata
//
// Cached per-tensor sizes/strides vectors, keyed by tensor pointer, so the
// aoti_torch_get_sizes/aoti_torch_get_strides shims can hand out stable
// int64_t* views into them. AOTI_SHIM_EXPORT marks the definitions for
// export across the DLL boundary when building the CUDA backend as a
// shared library; outside that build the macro expands to nothing.
AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
    tensor_to_sizes;
AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
    tensor_to_strides;
} // namespace internal

extern "C" {
Expand Down Expand Up @@ -74,9 +76,7 @@ AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
for (int i = 0; i < tensor->dim(); i++) {
strides[i] = tensor_strides[i];
}
it =
internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides))
.first;
it = internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first;
}

// For 0D tensors, data() returns nullptr on empty vectors, but we need to
Expand Down Expand Up @@ -122,8 +122,7 @@ AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
for (int i = 0; i < tensor->dim(); i++) {
sizes[i] = tensor_sizes[i];
}
it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes))
.first;
it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first;
}

// For 0D tensors, data() returns nullptr on empty vectors, but we need to
Expand Down Expand Up @@ -200,6 +199,69 @@ void cleanup_tensor_metadata() {
internal::tensor_to_strides.clear();
}

// C-ABI warning hook exposed to AOTI-generated code.
// NOTE(review): logged at Error severity — the log levels available here
// appear to lack a dedicated Warn level (per the "we dont have warn level"
// change in this PR) — confirm before adding a Warn level later.
void aoti_torch_warn(
    const char* func,
    const char* file,
    uint32_t line,
    const char* msg) {
  ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg);
}

// Stub: querying a tensor's storage size is not implemented in this
// runtime yet. Fails loudly so generated code cannot silently proceed.
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_storage_size(Tensor* /*tensor*/, int64_t* /*ret_size*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

// Stub: stride-preserving clone is not implemented in this runtime yet.
// Fails loudly so generated code cannot silently proceed.
AOTI_SHIM_EXPORT AOTITorchError aoti_torch_clone_preserve_strides(
    Tensor* /*self*/,
    Tensor** /*ret_new_tensor*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

// Stub: tensor clone is not implemented in this runtime yet.
// Fails loudly so generated code cannot silently proceed.
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_clone(Tensor* /*self*/, Tensor** /*ret_new_tensor*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

// Stub: duplicating a tensor handle is not implemented in this runtime
// yet. Fails loudly so generated code cannot silently proceed.
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_new_tensor_handle(Tensor* /*orig_handle*/, Tensor** /*new_handle*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

// Stub: wrapping an existing buffer in a tensor handle is not implemented
// in this runtime yet. Fails loudly so generated code cannot silently
// proceed.
AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
    void* /*data_ptr*/,
    int64_t /*ndim*/,
    const int64_t* /*sizes*/,
    const int64_t* /*strides*/,
    int64_t /*storage_offset*/,
    int32_t /*dtype*/,
    int32_t /*device_type*/,
    int32_t /*device_index*/,
    Tensor** /*ret_new_tensor*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

} // extern "C"

} // namespace aoti
Expand Down
96 changes: 68 additions & 28 deletions backends/aoti/common_shims.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@
#include <unordered_map>
#include <vector>

// Export-visibility macro for the AOTI shim symbols. When these shims are
// compiled into the CUDA backend shared library (BUILDING_CUDA_BACKEND is
// defined by that target's CMake), reuse its AOTI_CUDA_EXPORT macro —
// presumably __declspec(dllexport/dllimport) on Windows; see
// backends/cuda/runtime/export.h to confirm. In all other builds the
// macro expands to nothing and declarations are plain.
#if defined(BUILDING_CUDA_BACKEND)
#include <executorch/backends/cuda/runtime/export.h>
#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT
#else
#define AOTI_SHIM_EXPORT
#endif

namespace executorch {
namespace backends {
namespace aoti {
Expand All @@ -23,56 +30,89 @@ namespace aoti {
using executorch::runtime::Error;
using executorch::runtime::etensor::Tensor;

// Global storage for tensor metadata
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;

extern "C" {

// Common AOTI type aliases
using AOTIRuntimeError = Error;
using AOTITorchError = Error;

// Global storage for tensor metadata
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;

// Attribute-related operations (memory-irrelevant)
AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr);

AOTITorchError aoti_torch_get_storage_offset(
Tensor* tensor,
int64_t* ret_storage_offset);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset);

AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides);

AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype);

AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes);

AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);

AOTITorchError aoti_torch_get_device_index(
Tensor* tensor,
int32_t* ret_device_index);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index);

AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);

// Utility functions for device and layout information
int32_t aoti_torch_device_type_cpu();
int32_t aoti_torch_layout_strided();
int32_t aoti_torch_dtype_float32();
int32_t aoti_torch_dtype_bfloat16();
int32_t aoti_torch_dtype_int8();
int32_t aoti_torch_dtype_int16();
int32_t aoti_torch_dtype_int32();
int32_t aoti_torch_dtype_int64();
AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu();
AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64();

// Dtype utility function needed by Metal backend
size_t aoti_torch_dtype_element_size(int32_t dtype);
AOTI_SHIM_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype);

// Autograd mode functions
int32_t aoti_torch_grad_mode_is_enabled();
void aoti_torch_grad_mode_set_enabled(bool enabled);
AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled();
AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled);

// Cleanup functions for clearing global state
void cleanup_tensor_metadata();
AOTI_SHIM_EXPORT void cleanup_tensor_metadata();

// Warning hook invoked by AOTI-generated code; forwarded to ET logging.
AOTI_SHIM_EXPORT void aoti_torch_warn(
    const char* func,
    const char* file,
    uint32_t line,
    const char* msg);

// NOTE(review): aoti_torch_get_storage_size is already declared earlier in
// this extern "C" block; this second declaration is harmless but redundant
// — consider removing one of the two.
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);

// The four shims below are currently stubbed: their definitions throw
// "Not implemented" (see common_shims.cpp).
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor);

AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor);

AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle);

AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
    void* data_ptr,
    int64_t ndim,
    const int64_t* sizes,
    const int64_t* strides,
    int64_t storage_offset,
    int32_t dtype,
    int32_t device_type,
    int32_t device_index,
    Tensor** ret_new_tensor);

} // extern "C"

Expand Down
34 changes: 30 additions & 4 deletions backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,28 @@ set(_aoti_cuda_sources
runtime/shims/cuda_guard.cpp
runtime/shims/int4mm.cu
runtime/platform/platform.cpp
${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp
)
add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC
if(MSVC)
  add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
  # Define export macros for Windows DLL.
  # BUILDING_CUDA_BACKEND also switches backends/aoti/common_shims.h onto
  # AOTI_SHIM_EXPORT so the shim symbols are exported from this DLL.
  target_compile_definitions(
    aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS BUILDING_CUDA_BACKEND
  )
  # Ensure proper DLL import/export library naming on Windows with
  # config-specific paths ($<CONFIG> keeps Debug/Release outputs from
  # colliding under multi-config generators).
  set_target_properties(
    aoti_cuda
    PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via
                                              # AOTI_CUDA_EXPORT
               RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$<CONFIG>
               LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
               ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
  )
else()
  add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
endif()
target_include_directories(
aoti_cuda
PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
Expand All @@ -64,11 +84,15 @@ target_link_options(

# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
target_link_libraries(
aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
aoti_cuda PUBLIC extension_tensor CUDA::cudart ${CMAKE_DL_LIBS}
)
# If you need other CUDA libraries, link them similarly:
# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
executorch_target_link_options_shared_lib(aoti_cuda)

# Only apply shared lib options on non-Windows platforms
if(NOT MSVC)
executorch_target_link_options_shared_lib(aoti_cuda)
endif()

if(BUILD_TESTING)
# Add runtime
Expand All @@ -82,5 +106,7 @@ endif()
install(
TARGETS aoti_cuda
EXPORT ExecuTorchTargets
DESTINATION lib
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
RUNTIME DESTINATION bin
)
46 changes: 46 additions & 0 deletions backends/cuda/runtime/CudaBackend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#pragma once

#include <executorch/backends/cuda/runtime/export.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>

namespace executorch::backends::cuda {

/**
 * CUDA delegate backend implementing the ExecuTorch BackendInterface.
 *
 * AOTI_CUDA_EXPORT marks the class for export/import across the DLL
 * boundary when the backend is built as a shared library — presumably
 * __declspec-based on Windows; see runtime/export.h to confirm.
 */
class AOTI_CUDA_EXPORT ET_EXPERIMENTAL CudaBackend final
    : public ::executorch::runtime::BackendInterface {
 public:
  /**
   * Check if the CUDA backend is available.
   */
  bool is_available() const override;

  /**
   * Initialize the backend with the given context and compile specs.
   * Called once per loaded binary blob.
   *
   * Returns an opaque DelegateHandle passed back to execute()/destroy(),
   * or an error on failure.
   */
  ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init(
      ::executorch::runtime::BackendInitContext& context,
      ::executorch::runtime::FreeableBuffer* processed,
      ::executorch::runtime::ArrayRef<::executorch::runtime::CompileSpec>
          compile_specs) const override;

  /**
   * Execute the backend with the given context and arguments.
   * Called once per execution.
   *
   * `handle` is the value produced by init(); `args` carries both inputs
   * and outputs as EValue pointers.
   */
  ::executorch::runtime::Error execute(
      ::executorch::runtime::BackendExecutionContext& context,
      ::executorch::runtime::DelegateHandle* handle,
      ::executorch::runtime::Span<::executorch::runtime::EValue*> args)
      const override;

  /**
   * Destroy the backend handle and clean up resources.
   */
  void destroy(::executorch::runtime::DelegateHandle* handle) const override;
};

} // namespace executorch::backends::cuda
Loading
Loading