60
60
#include < stdlib.h>
61
61
#include < string>
62
62
#include < vector>
63
+ #include < cufile.h>
64
+ #ifdef _WIN32
65
+ #else
66
+ #include < fcntl.h>
67
+ #include < unistd.h>
68
+ #endif
63
69
64
70
// Compile-time check: `half` and `ggml_fp16_t` must have the same size.
static_assert (sizeof (half) == sizeof (ggml_fp16_t ), " wrong fp16 size" );
65
71
@@ -3410,6 +3416,68 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
3410
3416
GGML_UNUSED (reg);
3411
3417
}
3412
3418
3419
+ static bool ggml_backend_cuda_buffer_load_tensor (ggml_backend_buffer_t buffer, ggml_tensor * tensor, const char * path, size_t file_offset, size_t tensor_offset, size_t size) {
3420
+ #ifdef _WIN32
3421
+ GGML_UNUSED (buffer);
3422
+ GGML_UNUSED (tensor);
3423
+ GGML_UNUSED (path);
3424
+ GGML_UNUSED (file_offset);
3425
+ GGML_UNUSED (tensor_offset);
3426
+ GGML_UNUSED (size);
3427
+ return false ;
3428
+ #else
3429
+ static bool initialized = false ;
3430
+ static bool use_cufile = false ;
3431
+ if (!initialized) {
3432
+ CUfileError_t err = cuFileDriverOpen ();
3433
+ initialized = true ;
3434
+ if (err.err != CU_FILE_SUCCESS) {
3435
+ use_cufile = false ;
3436
+ return false ;
3437
+ }
3438
+ CUfileDrvProps_t props;
3439
+ err = cuFileDriverGetProperties (&props);
3440
+ if (err.err != CU_FILE_SUCCESS) {
3441
+ use_cufile = false ;
3442
+ return false ;
3443
+ }
3444
+ if (props.nvfs .dcontrolflags & (1 << CU_FILE_ALLOW_COMPAT_MODE)) {
3445
+ // do not use CUfile if the driver is in compatibility mode
3446
+ // as we have faster mechanisms in llama-model-loader
3447
+ use_cufile = false ;
3448
+ return false ;
3449
+ }
3450
+ use_cufile = true ;
3451
+ }
3452
+ if (!use_cufile) {
3453
+ return false ;
3454
+ }
3455
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context ;
3456
+ ggml_cuda_set_device (ctx->device );
3457
+
3458
+ int fd = open (path, O_RDONLY | O_DIRECT);
3459
+ if (fd < 0 ) {
3460
+ return false ;
3461
+ }
3462
+ CUfileDescr_t cf_descr;
3463
+ CUfileHandle_t cf_handle;
3464
+ memset ((void *)&cf_descr, 0 , sizeof (CUfileDescr_t));
3465
+ cf_descr.handle .fd = fd;
3466
+ cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
3467
+ CUfileError_t status = cuFileHandleRegister (&cf_handle, &cf_descr);
3468
+ if (status.err != CU_FILE_SUCCESS) {
3469
+ return false ;
3470
+ }
3471
+ ssize_t ret = cuFileRead (cf_handle, (char *)tensor->data , size, file_offset, tensor_offset);
3472
+ if (ret < 0 ) {
3473
+ return false ;
3474
+ }
3475
+ cuFileHandleDeregister (cf_handle);
3476
+ close (fd);
3477
+ return true ;
3478
+ #endif
3479
+ }
3480
+
3413
3481
static void * ggml_backend_cuda_reg_get_proc_address (ggml_backend_reg_t reg, const char * name) {
3414
3482
GGML_UNUSED (reg);
3415
3483
if (strcmp (name, " ggml_backend_split_buffer_type" ) == 0 ) {
@@ -3424,6 +3492,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
3424
3492
if (strcmp (name, " ggml_backend_get_features" ) == 0 ) {
3425
3493
return (void *)ggml_backend_cuda_get_features;
3426
3494
}
3495
+ if (strcmp (name, " ggml_backend_tensor_load" ) == 0 ) {
3496
+ return (void *)ggml_backend_cuda_buffer_load_tensor;
3497
+ }
3427
3498
return nullptr ;
3428
3499
}
3429
3500
0 commit comments