Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
118cb0e
dont build custom or quantized ops on msvc cuda
JacobSzwejbka Oct 18, 2025
cd93bda
dll attempt 1
JacobSzwejbka Oct 18, 2025
0af3f62
add export macro to header definitions
JacobSzwejbka Oct 18, 2025
3368248
voxtral cmake
JacobSzwejbka Oct 18, 2025
450b6fb
cmake dll install changes
JacobSzwejbka Oct 18, 2025
0bbcd71
more cmake stuff to try
JacobSzwejbka Oct 18, 2025
efcdfa1
hacky work to get backend registration
JacobSzwejbka Oct 18, 2025
0b000cd
more cmake checks
JacobSzwejbka Oct 18, 2025
6fca29a
static init hacks
JacobSzwejbka Oct 18, 2025
afadb48
add log
JacobSzwejbka Oct 18, 2025
357bf9f
more hacks
JacobSzwejbka Oct 18, 2025
d2d8acf
hacks
JacobSzwejbka Oct 18, 2025
3f56f13
config specific lib and bin locations
JacobSzwejbka Oct 18, 2025
b3b654d
hacks
JacobSzwejbka Oct 18, 2025
cac7849
hacks
JacobSzwejbka Oct 18, 2025
6040e3a
defs hack
JacobSzwejbka Oct 19, 2025
25667f1
remove cuda ifdef
JacobSzwejbka Oct 19, 2025
1ae9ded
missing semicolon
JacobSzwejbka Oct 19, 2025
53dc564
trying something else
JacobSzwejbka Oct 19, 2025
daae0c0
var name
JacobSzwejbka Oct 19, 2025
b9d1a39
new approach
JacobSzwejbka Oct 19, 2025
c0915f9
test
JacobSzwejbka Oct 19, 2025
971bf49
test
JacobSzwejbka Oct 19, 2025
d1b26ce
remove const
JacobSzwejbka Oct 19, 2025
9dc28d9
hacks
JacobSzwejbka Oct 19, 2025
8b46964
move vectors
JacobSzwejbka Oct 19, 2025
cbacd6c
lint
JacobSzwejbka Oct 19, 2025
bec6c5f
stub
JacobSzwejbka Oct 19, 2025
d8fafd3
stubbed shims
JacobSzwejbka Oct 19, 2025
d0b9034
undo internal change
JacobSzwejbka Oct 19, 2025
e44d74b
we dont have warn level
JacobSzwejbka Oct 19, 2025
ae61575
hacky mmap change
JacobSzwejbka Oct 19, 2025
1d661c4
more hacky mmap edits
JacobSzwejbka Oct 19, 2025
0ab4746
minmax issue
JacobSzwejbka Oct 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,13 @@ announce_configured_options(CCACHE_PROGRAM)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Force logging to always be enabled for this build.
if(NOT EXECUTORCH_ENABLE_LOGGING)
# Avoid pulling in the logging strings, which can be large. Note that this
# will set the compiler flag for all targets in this directory, and for all
# subdirectories included after this point.
add_definitions(-DET_LOG_ENABLED=0)
message(STATUS "EXECUTORCH_ENABLE_LOGGING was OFF; forcing it to ON.")
set(EXECUTORCH_ENABLE_LOGGING
ON
CACHE BOOL "Build with ET_LOG_ENABLED" FORCE
)
endif()

add_definitions(-DET_MIN_LOG_LEVEL=${ET_MIN_LOG_LEVEL})
Expand Down
76 changes: 69 additions & 7 deletions backends/aoti/common_shims.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ namespace aoti {

namespace internal {
// Global storage for tensor metadata
//
// Cached per-tensor sizes/strides vectors, keyed by tensor pointer, so the
// aoti_torch_get_sizes/aoti_torch_get_strides shims can hand out stable
// int64_t* views into them. AOTI_SHIM_EXPORT marks the definitions for
// export across the DLL boundary when building the CUDA backend as a
// shared library; outside that build the macro expands to nothing.
AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
    tensor_to_sizes;
AOTI_SHIM_EXPORT std::unordered_map<Tensor*, std::vector<int64_t>>
    tensor_to_strides;
} // namespace internal

extern "C" {
Expand Down Expand Up @@ -74,9 +76,7 @@ AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
for (int i = 0; i < tensor->dim(); i++) {
strides[i] = tensor_strides[i];
}
it =
internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides))
.first;
it = internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)).first;
}

// For 0D tensors, data() returns nullptr on empty vectors, but we need to
Expand Down Expand Up @@ -122,8 +122,7 @@ AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
for (int i = 0; i < tensor->dim(); i++) {
sizes[i] = tensor_sizes[i];
}
it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes))
.first;
it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)).first;
}

// For 0D tensors, data() returns nullptr on empty vectors, but we need to
Expand Down Expand Up @@ -200,6 +199,69 @@ void cleanup_tensor_metadata() {
internal::tensor_to_strides.clear();
}

// C-ABI warning hook exposed to AOTI-generated code.
// NOTE(review): logged at Error severity — the log levels available here
// appear to lack a dedicated Warn level (per the "we dont have warn level"
// change in this PR) — confirm before adding a Warn level later.
void aoti_torch_warn(
    const char* func,
    const char* file,
    uint32_t line,
    const char* msg) {
  ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg);
}

// Stub: querying a tensor's storage size is not implemented in this
// runtime yet. Fails loudly so generated code cannot silently proceed.
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_storage_size(Tensor* /*tensor*/, int64_t* /*ret_size*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

// Stub: stride-preserving clone is not implemented in this runtime yet.
// Fails loudly so generated code cannot silently proceed.
AOTI_SHIM_EXPORT AOTITorchError aoti_torch_clone_preserve_strides(
    Tensor* /*self*/,
    Tensor** /*ret_new_tensor*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

// Stub: tensor clone is not implemented in this runtime yet.
// Fails loudly so generated code cannot silently proceed.
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_clone(Tensor* /*self*/, Tensor** /*ret_new_tensor*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

// Stub: duplicating a tensor handle is not implemented in this runtime
// yet. Fails loudly so generated code cannot silently proceed.
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_new_tensor_handle(Tensor* /*orig_handle*/, Tensor** /*new_handle*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

// Stub: wrapping an existing buffer in a tensor handle is not implemented
// in this runtime yet. Fails loudly so generated code cannot silently
// proceed.
AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
    void* /*data_ptr*/,
    int64_t /*ndim*/,
    const int64_t* /*sizes*/,
    const int64_t* /*strides*/,
    int64_t /*storage_offset*/,
    int32_t /*dtype*/,
    int32_t /*device_type*/,
    int32_t /*device_index*/,
    Tensor** /*ret_new_tensor*/) {
  throw std::runtime_error("Not implemented");
  return Error::Internal; // unreachable; keeps every path returning a value
}

} // extern "C"

} // namespace aoti
Expand Down
96 changes: 68 additions & 28 deletions backends/aoti/common_shims.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@
#include <unordered_map>
#include <vector>

// Export-visibility macro for the AOTI shim symbols. When these shims are
// compiled into the CUDA backend shared library (BUILDING_CUDA_BACKEND is
// defined by that target's CMake), reuse its AOTI_CUDA_EXPORT macro —
// presumably __declspec(dllexport/dllimport) on Windows; see
// backends/cuda/runtime/export.h to confirm. In all other builds the
// macro expands to nothing and declarations are plain.
#if defined(BUILDING_CUDA_BACKEND)
#include <executorch/backends/cuda/runtime/export.h>
#define AOTI_SHIM_EXPORT AOTI_CUDA_EXPORT
#else
#define AOTI_SHIM_EXPORT
#endif

namespace executorch {
namespace backends {
namespace aoti {
Expand All @@ -23,56 +30,89 @@ namespace aoti {
using executorch::runtime::Error;
using executorch::runtime::etensor::Tensor;

// Global storage for tensor metadata
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;

extern "C" {

// Common AOTI type aliases
using AOTIRuntimeError = Error;
using AOTITorchError = Error;

// Global storage for tensor metadata
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;

// Attribute-related operations (memory-irrelevant)
AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr);

AOTITorchError aoti_torch_get_storage_offset(
Tensor* tensor,
int64_t* ret_storage_offset);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_storage_offset(Tensor* tensor, int64_t* ret_storage_offset);

AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides);

AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype);

AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes);

AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);

AOTITorchError aoti_torch_get_device_index(
Tensor* tensor,
int32_t* ret_device_index);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index);

AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);

// Utility functions for device and layout information
int32_t aoti_torch_device_type_cpu();
int32_t aoti_torch_layout_strided();
int32_t aoti_torch_dtype_float32();
int32_t aoti_torch_dtype_bfloat16();
int32_t aoti_torch_dtype_int8();
int32_t aoti_torch_dtype_int16();
int32_t aoti_torch_dtype_int32();
int32_t aoti_torch_dtype_int64();
AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu();
AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32();
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int64();

// Dtype utility function needed by Metal backend
size_t aoti_torch_dtype_element_size(int32_t dtype);
AOTI_SHIM_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype);

// Autograd mode functions
int32_t aoti_torch_grad_mode_is_enabled();
void aoti_torch_grad_mode_set_enabled(bool enabled);
AOTI_SHIM_EXPORT int32_t aoti_torch_grad_mode_is_enabled();
AOTI_SHIM_EXPORT void aoti_torch_grad_mode_set_enabled(bool enabled);

// Cleanup functions for clearing global state
void cleanup_tensor_metadata();
AOTI_SHIM_EXPORT void cleanup_tensor_metadata();

// Warning hook invoked by AOTI-generated code; forwarded to ET logging.
AOTI_SHIM_EXPORT void aoti_torch_warn(
    const char* func,
    const char* file,
    uint32_t line,
    const char* msg);

// NOTE(review): aoti_torch_get_storage_size is already declared earlier in
// this extern "C" block; this second declaration is harmless but redundant
// — consider removing one of the two.
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);

// The four shims below are currently stubbed: their definitions throw
// "Not implemented" (see common_shims.cpp).
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_clone_preserve_strides(Tensor* self, Tensor** ret_new_tensor);

AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_clone(Tensor* self, Tensor** ret_new_tensor);

AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle);

AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob(
    void* data_ptr,
    int64_t ndim,
    const int64_t* sizes,
    const int64_t* strides,
    int64_t storage_offset,
    int32_t dtype,
    int32_t device_type,
    int32_t device_index,
    Tensor** ret_new_tensor);

} // extern "C"

Expand Down
34 changes: 30 additions & 4 deletions backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,28 @@ set(_aoti_cuda_sources
runtime/shims/cuda_guard.cpp
runtime/shims/int4mm.cu
runtime/platform/platform.cpp
${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp
)
add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
# Build as SHARED library (.dll) on Windows MSVC, otherwise STATIC
if(MSVC)
  add_library(aoti_cuda SHARED ${_aoti_cuda_sources})
  # Define export macros for Windows DLL.
  # BUILDING_CUDA_BACKEND also switches backends/aoti/common_shims.h onto
  # AOTI_SHIM_EXPORT so the shim symbols are exported from this DLL.
  target_compile_definitions(
    aoti_cuda PRIVATE EXPORT_AOTI_FUNCTIONS BUILDING_CUDA_BACKEND
  )
  # Ensure proper DLL import/export library naming on Windows with
  # config-specific paths ($<CONFIG> keeps Debug/Release outputs from
  # colliding under multi-config generators).
  set_target_properties(
    aoti_cuda
    PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF # We use explicit exports via
                                              # AOTI_CUDA_EXPORT
               RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/$<CONFIG>
               LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
               ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/$<CONFIG>
  )
else()
  add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
endif()
target_include_directories(
aoti_cuda
PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
Expand All @@ -64,11 +84,15 @@ target_link_options(

# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
target_link_libraries(
aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
aoti_cuda PUBLIC extension_tensor CUDA::cudart ${CMAKE_DL_LIBS}
)
# If you need other CUDA libraries, link them similarly:
# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
executorch_target_link_options_shared_lib(aoti_cuda)

# Only apply shared lib options on non-Windows platforms
if(NOT MSVC)
executorch_target_link_options_shared_lib(aoti_cuda)
endif()

if(BUILD_TESTING)
# Add runtime
Expand All @@ -82,5 +106,7 @@ endif()
install(
TARGETS aoti_cuda
EXPORT ExecuTorchTargets
DESTINATION lib
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
RUNTIME DESTINATION bin
)
46 changes: 46 additions & 0 deletions backends/cuda/runtime/CudaBackend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#pragma once

#include <executorch/backends/cuda/runtime/export.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>

namespace executorch::backends::cuda {

/**
 * CUDA delegate backend implementing the ExecuTorch BackendInterface.
 *
 * AOTI_CUDA_EXPORT marks the class for export/import across the DLL
 * boundary when the backend is built as a shared library — presumably
 * __declspec-based on Windows; see runtime/export.h to confirm.
 */
class AOTI_CUDA_EXPORT ET_EXPERIMENTAL CudaBackend final
    : public ::executorch::runtime::BackendInterface {
 public:
  /**
   * Check if the CUDA backend is available.
   */
  bool is_available() const override;

  /**
   * Initialize the backend with the given context and compile specs.
   * Called once per loaded binary blob.
   *
   * Returns an opaque DelegateHandle passed back to execute()/destroy(),
   * or an error on failure.
   */
  ::executorch::runtime::Result<::executorch::runtime::DelegateHandle*> init(
      ::executorch::runtime::BackendInitContext& context,
      ::executorch::runtime::FreeableBuffer* processed,
      ::executorch::runtime::ArrayRef<::executorch::runtime::CompileSpec>
          compile_specs) const override;

  /**
   * Execute the backend with the given context and arguments.
   * Called once per execution.
   *
   * `handle` is the value produced by init(); `args` carries both inputs
   * and outputs as EValue pointers.
   */
  ::executorch::runtime::Error execute(
      ::executorch::runtime::BackendExecutionContext& context,
      ::executorch::runtime::DelegateHandle* handle,
      ::executorch::runtime::Span<::executorch::runtime::EValue*> args)
      const override;

  /**
   * Destroy the backend handle and clean up resources.
   */
  void destroy(::executorch::runtime::DelegateHandle* handle) const override;
};

} // namespace executorch::backends::cuda
Loading
Loading