Revert "[libc][NFC] refactor Cortex memcpy code" #149035

Merged: 1 commit, Jul 16, 2025
1 change: 0 additions & 1 deletion libc/src/string/memory_utils/CMakeLists.txt
@@ -7,7 +7,6 @@ add_header_library(
     aarch64/inline_memcpy.h
     aarch64/inline_memmove.h
     aarch64/inline_memset.h
-    arm/common.h
     arm/inline_memcpy.h
     generic/aligned_access.h
     generic/byte_per_byte.h
52 changes: 0 additions & 52 deletions libc/src/string/memory_utils/arm/common.h

This file was deleted.
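The 52 deleted lines are not expanded in this view. For orientation, here is a plausible sketch of what the header provided, reconstructed from how the removed code below uses it; the guard name, includes, and exact definitions are assumptions rather than the verbatim file:

// arm/common.h (hypothetical reconstruction, not the verbatim file)
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H // assumed guard name
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H

#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR
#include "src/__support/macros/config.h"     // LIBC_NAMESPACE_DECL

#include <stddef.h> // size_t
#include <stdint.h> // uint32_t

namespace LIBC_NAMESPACE_DECL {

// The removed code below uses `kWordSize` without a local definition.
LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);

// Enumerations referenced by the removed `copy` and
// `copy_block_and_bump_pointers` templates.
enum class AssumeAccess { kUnknown, kAligned };
enum class BlockOp { kFull, kByWord };

} // namespace LIBC_NAMESPACE_DECL

// [[likely]] / [[unlikely]] wrappers named in the include comment below.
#define LIBC_ATTR_LIKELY [[likely]]
#define LIBC_ATTR_UNLIKELY [[unlikely]]

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H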

195 changes: 98 additions & 97 deletions libc/src/string/memory_utils/arm/inline_memcpy.h
@@ -5,56 +5,63 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The functions defined in this file are annotated with approximate code sizes. These sizes
// assume the following configuration options:
// - LIBC_CONF_KEEP_FRAME_POINTER = false
// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false
// - LIBC_ADD_NULL_CHECKS = false
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H

#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL
#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY
#include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align

#include <stddef.h> // size_t

// https://libc.llvm.org/compiler_support.html
// Support for [[likely]] / [[unlikely]]
// [X] GCC 12.2
// [X] Clang 12
// [ ] Clang 11
#define LIBC_ATTR_LIKELY [[likely]]
#define LIBC_ATTR_UNLIKELY [[unlikely]]

#if defined(LIBC_COMPILER_IS_CLANG)
#if LIBC_COMPILER_CLANG_VER < 1200
#undef LIBC_ATTR_LIKELY
#undef LIBC_ATTR_UNLIKELY
#define LIBC_ATTR_LIKELY
#define LIBC_ATTR_UNLIKELY
#endif
#endif

namespace LIBC_NAMESPACE_DECL {

namespace {

// Performs a copy of `bytes` bytes from `src` to `dst`. This function has the
// semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is
// free to use whatever instruction is best for the size and assumed access.
template <size_t bytes, AssumeAccess access>
LIBC_INLINE void copy(void *dst, const void *src) {
if constexpr (access == AssumeAccess::kAligned) {
constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes;
memcpy_inline<bytes>(assume_aligned<alignment>(dst),
assume_aligned<alignment>(src));
} else if constexpr (access == AssumeAccess::kUnknown) {
memcpy_inline<bytes>(dst, src);
} else {
static_assert(false);
}
}
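// (Editor sketch: a usage illustration of the removed `copy` helper above,
// assuming the `AssumeAccess` enum from the deleted arm/common.h:
//   copy<16, AssumeAccess::kAligned>(dst, src); // may lower to LDM/STM
//   copy<4, AssumeAccess::kUnknown>(dst, src);  // alignment-agnostic lowering
// For bytes > kWordSize the aligned variant only assumes word alignment.)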
LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);

template <size_t bytes, BlockOp block_op = BlockOp::kFull,
AssumeAccess access = AssumeAccess::kUnknown>
LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
if constexpr (block_op == BlockOp::kFull) {
copy<bytes, access>(dst, src);
} else if constexpr (block_op == BlockOp::kByWord) {
enum Strategy {
ForceWordLdStChain,
AssumeWordAligned,
AssumeUnaligned,
};

template <size_t bytes, Strategy strategy = AssumeUnaligned>
LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
if constexpr (strategy == AssumeUnaligned) {
memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
} else if constexpr (strategy == AssumeWordAligned) {
static_assert(bytes >= kWordSize);
memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
assume_aligned<kWordSize>(src));
} else if constexpr (strategy == ForceWordLdStChain) {
// We restrict loads/stores to 4 byte to prevent the use of load/store
// multiple (LDM, STM) and load/store double (LDRD, STRD).
// multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
// fault (see notes below) and second, they use more registers which in turn
// adds push/pop instructions in the hot path.
static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
LIBC_LOOP_UNROLL
for (size_t offset = 0; offset < bytes; offset += kWordSize) {
copy<kWordSize, access>(dst + offset, src + offset);
for (size_t i = 0; i < bytes / kWordSize; ++i) {
const size_t offset = i * kWordSize;
memcpy_inline<kWordSize>(dst + offset, src + offset);
}
} else {
static_assert(false, "Invalid BlockOp");
}
// In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
// into the load/store instructions.
@@ -65,27 +72,39 @@ LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
src += bytes;
}

template <size_t bytes, BlockOp block_op, AssumeAccess access>
LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) {
LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
const size_t size) {
LIBC_LOOP_NOUNROLL
for (size_t i = 0; i < size / bytes; ++i)
copy_block_and_bump_pointers<bytes, block_op, access>(dst, src);
size %= bytes;
for (size_t i = 0; i < size; ++i)
*dst++ = *src++;
}

[[maybe_unused]] LIBC_INLINE void
copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
template <size_t block_size, Strategy strategy>
LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
size_t &size) {
LIBC_LOOP_NOUNROLL
for (size_t i = 0; i < size; ++i)
*dst++ = *src++;
for (size_t i = 0; i < size / block_size; ++i)
copy_and_bump_pointers<block_size, strategy>(dst, src);
// Update `size` once at the end instead of once per iteration.
size %= block_size;
}
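// (Editor sketch: a worked example of the greedy cascade this helper enables.
// With size = 87 and the 64/16/4 chain used further down:
//   the 64-byte pass copies one block and leaves size = 23,
//   the 16-byte pass copies one block and leaves size = 7,
//   the 4-byte pass copies one block and leaves size = 3,
// and the remaining 3 bytes fall through to the 1- and 2-byte tail copies.)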

LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
cpp::bit_cast<uintptr_t>(b));
}

LIBC_INLINE auto misaligned(CPtr a) {
return distance_to_align_down<kWordSize>(a);
}
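// (Editor note: OR-ing the two addresses folds both alignment checks into a
// single test, since (src | dst) has a low bit set iff either pointer does.
// For example, src % 4 == 2 and dst % 4 == 0 gives (src | dst) % 4 == 2, so
// misaligned(bitwise_or(src, dst)) == 0 exactly when both are word-aligned.)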

} // namespace

// Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned
// loads/stores. It compiles down to 208 bytes when used through `memcpy` that
// also needs to return the `dst` ptr.
// Note:
// Implementation for Cortex-M0, M0+, M1.
// Notes:
// - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
// that also needs to return the `dst` ptr.
// - These cores do not allow for unaligned loads/stores.
// - When `src` and `dst` are coaligned, we start by aligning them and perform
// bulk copies. We let the compiler know the pointers are aligned so it can
// use load/store multiple (LDM, STM). This significantly increases throughput
@@ -106,18 +125,9 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
if (src_alignment == 0)
LIBC_ATTR_LIKELY {
// Both `src` and `dst` are now word-aligned.
// We first copy by blocks of 64 bytes, the compiler will use 4
// load/store multiple (LDM, STM), each of 4 words. This requires more
// registers so additional push/pop are needed but the speedup is worth
// it.
consume_by_block<64, BlockOp::kFull, AssumeAccess::kAligned>(dst, src,
size);
// Then we use blocks of 4 word load/store.
consume_by_block<16, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
size);
// Then we use word by word copy.
consume_by_block<4, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
size);
copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
}
else {
// `dst` is aligned but `src` is not.
@@ -128,7 +138,7 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
src_alignment == 2
? load_aligned<uint32_t, uint16_t, uint16_t>(src)
: load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
copy<kWordSize, AssumeAccess::kAligned>(dst, &value);
memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
dst += kWordSize;
src += kWordSize;
size -= kWordSize;
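// (Editor sketch of what load_aligned<uint32_t, uint16_t, uint16_t>(src)
// composes for src_alignment == 2, assuming a little-endian target; the real
// helper lives in memory_utils/utils.h:
//   const uint32_t lo = *reinterpret_cast<const uint16_t *>(src);
//   const uint32_t hi = *reinterpret_cast<const uint16_t *>(src + 2);
//   const uint32_t value = lo | (hi << 16);)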
@@ -141,69 +151,56 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
}

// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
// support for unaligned loads and stores. It compiles down to 272 bytes when
// used through `memcpy` that also needs to return the `dst` ptr.
// support for unaligned loads and stores.
// Notes:
// - It compiles down to 266 bytes.
// - `dst` and `src` are not `__restrict` to prevent the compiler from
// reordering loads/stores.
// - We keep state variables to a strict minimum to keep everything in the free
// registers and prevent costly push / pop.
// - If unaligned single loads/stores to normal memory are supported, unaligned
// accesses for load/store multiple (LDM, STM) and load/store double (LDRD,
// STRD) instructions are generally not supported and will still fault so we
// make sure to restrict unrolling to word loads/stores.
[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
size_t size) {
if (misaligned(bitwise_or(src, dst)))
LIBC_ATTR_UNLIKELY {
if (size < 8)
LIBC_ATTR_UNLIKELY {
if (size & 1)
copy_block_and_bump_pointers<1>(dst, src);
copy_and_bump_pointers<1>(dst, src);
if (size & 2)
copy_block_and_bump_pointers<2>(dst, src);
copy_and_bump_pointers<2>(dst, src);
if (size & 4)
copy_block_and_bump_pointers<4>(dst, src);
copy_and_bump_pointers<4>(dst, src);
return;
}
if (misaligned(src))
LIBC_ATTR_UNLIKELY {
const size_t offset = distance_to_align_up<kWordSize>(dst);
if (offset & 1)
copy_block_and_bump_pointers<1>(dst, src);
copy_and_bump_pointers<1>(dst, src);
if (offset & 2)
copy_block_and_bump_pointers<2>(dst, src);
copy_and_bump_pointers<2>(dst, src);
size -= offset;
}
}
// `dst` and `src` are not necessarily both aligned at that point but this
// implementation assumes hardware support for unaligned loads and stores so
// it is still fast to perform unrolled word by word copy. Note that wider
// accesses through the use of load/store multiple (LDM, STM) and load/store
// double (LDRD, STRD) instructions are generally not supported and can fault.
// By forcing decomposition of 64 bytes copy into word by word copy, the
// compiler can use the first load to prefetch memory:
// ldr r3, [r1, #64]! <- prefetch next cache line
// str r3, [r0]
// ldr r3, [r1, #0x4]
// str r3, [r0, #0x4]
// ...
// ldr r3, [r1, #0x3c]
// str r3, [r0, #0x3c]
// This is a bit detrimental for sizes between 64 and 256 (less than 10%
// penalty) but the prefetch yields better throughput for larger copies.
consume_by_block<64, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
size);
consume_by_block<16, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
size);
consume_by_block<4, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, size);
copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
if (size & 1)
copy_block_and_bump_pointers<1>(dst, src);
copy_and_bump_pointers<1>(dst, src);
if (size & 2)
copy_block_and_bump_pointers<2>(dst, src);
LIBC_ATTR_UNLIKELY
copy_and_bump_pointers<2>(dst, src);
}

[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src,
[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
const void *__restrict src_,
size_t size) {
// The compiler performs alias analysis and is able to prove that `dst` and
// `src` do not alias by propagating the `__restrict` keyword from the
// `memcpy` prototype. This allows the compiler to merge consecutive
// load/store (LDR, STR) instructions generated in
// `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store
// double (LDRD, STRD) instructions. This is undesirable, so we prevent the
// compiler from inferring `__restrict` with the following line.
asm volatile("" : "+r"(dst), "+r"(src));
Ptr dst = cpp::bit_cast<Ptr>(dst_);
CPtr src = cpp::bit_cast<CPtr>(src_);
#ifdef __ARM_FEATURE_UNALIGNED
return inline_memcpy_arm_mid_end(dst, src, size);
#else
@@ -213,4 +210,8 @@

} // namespace LIBC_NAMESPACE_DECL

// Cleanup local macros
#undef LIBC_ATTR_LIKELY
#undef LIBC_ATTR_UNLIKELY

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
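
For context on the notes above about the extra bytes when used "through `memcpy` that also needs to return the `dst` ptr", a minimal caller sketch follows; this wiring is an editor assumption and is not part of the diff:

LLVM_LIBC_FUNCTION(void *, memcpy,
                   (void *__restrict dst, const void *__restrict src,
                    size_t size)) {
  inline_memcpy_arm(dst, src, size); // copy the bytes,
  return dst;                        // then return the destination pointer
}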
1 change: 0 additions & 1 deletion utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -4448,7 +4448,6 @@ libc_support_library(
         "src/string/memory_utils/aarch64/inline_memcpy.h",
         "src/string/memory_utils/aarch64/inline_memmove.h",
         "src/string/memory_utils/aarch64/inline_memset.h",
-        "src/string/memory_utils/arm/common.h",
         "src/string/memory_utils/arm/inline_memcpy.h",
         "src/string/memory_utils/generic/aligned_access.h",
         "src/string/memory_utils/generic/byte_per_byte.h",