
Commit a3a1ef6

Remove shared_array!
It's wildly unsound, as far as I can tell:

- The use of `UnsafeCell`/`unsafe impl Send`/`unsafe impl Sync` is very dodgy.
- The use of `MaybeUninit<[T; N]>` is completely wrong. It should be `[MaybeUninit<T>; N]`, which allows for partial initialization.
- The examples don't use `write`/`assume_init` with `MaybeUninit`, which is UB.

The alternative is to use `#[address_space(shared)] static mut ...: [MaybeUninit<T>; N]` directly instead. That avoids all the unsoundness, is more ergonomic because you don't have to work with a raw pointer, and is clearer because the details aren't hidden inside a macro.

I moved the comments on `shared_array` (with some modifications) to the `address_space` proc macro, because there were some useful details in there.
1 parent b0c4c71 commit a3a1ef6
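As a quick illustration of the `MaybeUninit` point above, here is a minimal host-side Rust sketch (not part of the commit; the buffer name and length are made up) of why an array of `MaybeUninit<T>` elements supports the element-by-element initialization that shared memory needs, using `write` to fill slots and `assume_init` only after they have been written:

    use core::mem::MaybeUninit;

    fn main() {
        // Each element is its own MaybeUninit, so slots can be initialized independently;
        // this mirrors a kernel in which every thread writes only its own slot.
        let mut buf: [MaybeUninit<u32>; 8] = [const { MaybeUninit::uninit() }; 8];

        for (i, slot) in buf.iter_mut().enumerate() {
            // `write` initializes the slot without ever reading the uninitialized bytes.
            slot.write(i as u32 * 2);
        }

        // Only after every element has been written is it sound to `assume_init` them.
        let values: [u32; 8] = buf.map(|slot| unsafe { slot.assume_init() });
        assert_eq!(values[3], 6);
    }

A single `MaybeUninit<[u32; 8]>`, by contrast, can only be treated as initialized as a whole, which is why the commit calls the old wrapper type wrong for this use case.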

File tree

4 files changed (+57 / -80 lines)


Cargo.lock

Lines changed: 1 addition & 0 deletions

crates/cuda_std/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ Notable changes to this project will be documented in this file.
 - Added warp shuffles, matches, reductions, and votes in the `warp` module.
 - Added `activemask` in the `warp` module to query a mask of the active threads.
 - Fixed `lane_id` generating invalid ptx.
+- Removed `shared_array!` due to unsoundness.
 
 ## 0.2.2 - 2/7/22

crates/cuda_std/src/shared.rs

Lines changed: 3 additions & 77 deletions
@@ -1,83 +1,9 @@
-//! Static and Dynamic shared memory handling.
+//! Dynamic shared memory handling.
+//!
+//! Static shared memory is done via `#[address_space(shared)] static mut ...;`.
 
 use crate::gpu_only;
 
-/// Statically allocates a buffer large enough for `len` elements of `array_type`,
-/// yielding a `*mut array_type` that points to uninitialized shared memory. `len` must
-/// be a constant expression.
-///
-/// Note that this allocates the memory __statically__, it expands to a static in the
-/// `shared` address space. Therefore, calling this macro multiple times in a loop will
-/// always yield the same data. However, separate invocations of the macro will yield
-/// different buffers.
-///
-/// The data is uninitialized by default, therefore, you must be careful to not read the
-/// data before it is written to. The semantics of what "uninitialized" actually means
-/// on the GPU (i.e. if it yields unknown data or if it is UB to read it whatsoever) are
-/// not well known, so even if the type is valid for any backing memory, make sure to
-/// not read uninitialized data.
-///
-/// # Safety
-///
-/// Shared memory usage is fundamentally extremely unsafe and impossible to statically
-/// prove, therefore the burden of correctness is on the user. Some of the things you
-/// must ensure in your usage of shared memory are:
-///
-/// - Shared memory is only shared across __thread blocks__, not the entire device,
-/// therefore it is unsound to try and rely on sharing data across more than one
-/// block.
-/// - You must write to the shared buffer before reading from it as the data is
-/// uninitialized by default.
-/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before
-/// relying on the results of other threads, this ensures every thread has reached
-/// that point before going on. For example, reading another thread's data after
-/// writing to the buffer.
-/// - No access may be out of bounds, this usually means making sure the amount of
-/// threads and their dimensions are correct.
-///
-/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of
-/// shared memory are right.
-///
-/// # Examples
-///
-/// ```no_run
-/// # use cuda_std::kernel;
-/// # use cuda_std::shared_array;
-/// # use cuda_std::thread;
-/// ##[kernel]
-/// pub unsafe fn reverse_array(d: *mut i32, n: usize) {
-///     let s = shared_array![i32; 64];
-///     let t = thread::thread_idx_x() as usize;
-///     let tr = n - t - 1;
-///     *s.add(t) = *d.add(t);
-///     thread::sync_threads();
-///     *d.add(t) = *s.add(tr);
-/// }
-/// ```
-#[macro_export]
-macro_rules! shared_array {
-    ($array_type:ty; $len:expr) => {{
-        #[$crate::gpu_only]
-        #[inline(always)]
-        fn shared_array() -> *mut $array_type {
-            use ::core::{cell::UnsafeCell, mem::MaybeUninit};
-            struct SyncWrapper(UnsafeCell<MaybeUninit<[$array_type; $len]>>);
-            // SAFETY: it is up to the user to verify sound shared memory usage, we cannot
-            // fundamentally check it for soundness.
-            unsafe impl Send for SyncWrapper {}
-            // SAFETY: see above
-            unsafe impl Sync for SyncWrapper {}
-
-            // the initializer is discarded when declaring shared globals, so it is unimportant.
-            #[$crate::address_space(shared)]
-            static SHARED: SyncWrapper = SyncWrapper(UnsafeCell::new(MaybeUninit::uninit()));
-
-            SHARED.0.get() as *mut $array_type
-        }
-        shared_array()
-    }};
-}
-
 /// Gets a pointer to the dynamic shared memory that was allocated by the caller of the kernel. The
 /// data is left uninitialized.
 ///
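For users migrating off the removed macro, the following sketch restates the `reverse_array` example from the deleted docs above using the replacement pattern this commit recommends; it is essentially the example the commit adds to the `address_space` docs below, so the names (`S`, `d`, `n`) and the fixed length of 64 come from those examples rather than from any new API:

    use core::mem::MaybeUninit;
    use cuda_std::*;

    #[kernel]
    pub unsafe fn reverse_array(d: *mut u32, n: usize) {
        // Previously: `let s = shared_array![i32; 64];` yielded a raw pointer into
        // uninitialized shared memory. Now the shared static is declared directly.
        #[address_space(shared)]
        static mut S: [MaybeUninit<u32>; 64] = [const { MaybeUninit::uninit() }; 64];

        let i = thread::thread_idx_x() as usize;
        let ir = n - i - 1;

        // Each thread initializes exactly one slot, so no uninitialized data is read.
        unsafe { S[i].write(*d.add(i)) };
        // Every thread in the block must reach this barrier before any thread reads
        // a slot written by another thread.
        thread::sync_threads();
        unsafe { *d.add(i) = S[ir].assume_init() };
    }

The barrier, block-only sharing, and bounds caveats from the removed safety notes still apply unchanged.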

crates/cuda_std_macros/src/lib.rs

Lines changed: 52 additions & 3 deletions
@@ -205,12 +205,61 @@ pub fn externally_visible(
 }
 
 /// Notifies the codegen to put a `static`/`static mut` inside of a specific memory address space.
-/// This is mostly for internal use and/or advanced users, as the codegen and `cuda_std` handle address space placement
-/// implicitly. **Improper use of this macro could yield weird or undefined behavior**.
+/// This is mostly for internal use and/or advanced users, as the codegen and `cuda_std` handle
+/// address space placement implicitly. **Improper use of this macro could yield weird or undefined
+/// behavior**.
 ///
-/// This macro takes a single argument which can either be `global`, `shared`, `constant`, or `local`.
+/// This macro takes a single argument which can either be `global`, `shared`, `constant`, or
+/// `local`.
 ///
 /// This macro does nothing on the CPU.
+///
+/// # Shared memory
+///
+/// The item `#[address_space(shared)] static mut FOO: [MaybeUninit<T>; N];` statically allocates a
+/// buffer large enough for `N` elements of type `T`, yielding an uninitialized array in shared
+/// memory.
+///
+/// Note that this allocates the memory __statically__, i.e. it expands to a static in the `shared`
+/// address space. Therefore, calling this macro multiple times in a loop will always yield the
+/// same data. However, separate invocations of the macro will yield different buffers.
+///
+/// Because the data is uninitialized by default, the type within the array must be `MaybeUninit`,
+/// and uses must follow the usual rules of `MaybeUninit`, such as using `write`/`assume_init`.
+/// Using a non-`MaybeUninit` type is undefined behaviour.
+///
+/// # Safety
+///
+/// Shared memory usage is fundamentally unsafe and much of the burden of correctness is on the
+/// user. For example:
+/// - Shared memory is only shared across __thread blocks__, not the entire device, therefore it is
+/// unsound to rely on sharing data across more than one block.
+/// - You must write to the shared buffer before reading from it as the data is uninitialized by
+/// default.
+/// - `cuda_std::thread::sync_threads` must be called before relying on the results of other
+/// threads. This ensures every thread has reached that point before going on. For example, when
+/// reading another thread's data after writing to the buffer.
+///
+/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of
+/// shared memory are right.
+///
+/// # Examples
+///
+/// ```ignore
+/// use core::mem::MaybeUninit;
+/// use cuda_std::*;
+///
+/// ##[kernel]
+/// pub unsafe fn reverse_array(d: *mut u32, n: usize) {
+///     ##[address_space(shared)]
+///     static mut S: [MaybeUninit<u32>; 64] = [const { MaybeUninit::uninit() }; 64];
+///     let i = thread::thread_idx_x() as usize;
+///     let ir = n - i - 1;
+///     unsafe { S[i].write(*d.add(i)); };
+///     thread::sync_threads();
+///     unsafe { *d.add(i) = S[ir].assume_init(); }
+/// }
+/// ```
 #[proc_macro_attribute]
 pub fn address_space(attr: proc_macro::TokenStream, item: proc_macro::TokenStream) -> TokenStream {
     let mut global = syn::parse_macro_input!(item as syn::ItemStatic);
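The new docs cover the `shared` case in detail; for the other accepted arguments the usage shape is the same attribute on a static. Below is a hypothetical, unverified sketch (the `WEIGHTS` table and `scale` kernel are invented for illustration, not taken from the crate) of placing a read-only lookup table in the `constant` address space:

    use cuda_std::*;

    // Hypothetical lookup table; per the docs above, the attribute only affects
    // placement when compiled for the GPU and does nothing on the CPU.
    #[address_space(constant)]
    static WEIGHTS: [f32; 4] = [0.1, 0.2, 0.3, 0.4];

    #[kernel]
    pub unsafe fn scale(out: *mut f32, n: usize) {
        let i = thread::thread_idx_x() as usize;
        if i < n {
            // Reading an immutable static is safe Rust; only the raw pointer write is unsafe.
            *out.add(i) = WEIGHTS[i % 4];
        }
    }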
