Commit e0cce5c

[libc] Improve Cortex memset and memcpy functions (#149044)
The code for `memcpy` is the same as in #148204, but it fixes the build bot error by using `static_assert(cpp::always_false<decltype(access)>)` instead of `static_assert(false)` (older compilers fail on `static_assert(false)` in `constexpr` `else` bodies). The code for `memset` is new and vastly improves performance over the current byte-per-byte implementation. Both `memset` and `memcpy` implementations use prefetching for sizes >= 64. This slightly lowers performance for sizes between 64 and 256 but improves throughput for larger sizes.
1 parent 8b553c4 commit e0cce5c
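For readers unfamiliar with the workaround mentioned above, here is a minimal, self-contained sketch of why the dependent form is needed in a discarded `constexpr` `else` branch. This is not the libc source: `copy_example` and the local `always_false` variable template are illustrative stand-ins for LLVM libc's `cpp::always_false`.

```cpp
// Illustrative only: `always_false` stands in for LLVM libc's cpp::always_false.
// A plain `static_assert(false, ...)` in the final `else` is rejected outright
// by some older compilers, even though the branch is never instantiated.
template <typename T> inline constexpr bool always_false = false;

enum class AssumeAccess { kUnknown, kAligned };

template <AssumeAccess access> void copy_example() {
  if constexpr (access == AssumeAccess::kAligned) {
    // ... aligned code path ...
  } else if constexpr (access == AssumeAccess::kUnknown) {
    // ... unaligned code path ...
  } else {
    // The condition now depends on the template parameter, so it can only
    // fire if this branch is actually instantiated with an unexpected value.
    static_assert(always_false<decltype(access)>, "Invalid AssumeAccess");
  }
}
```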

File tree

6 files changed: +315 -99 lines changed

libc/src/string/memory_utils/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -7,7 +7,9 @@ add_header_library(
     aarch64/inline_memcpy.h
     aarch64/inline_memmove.h
     aarch64/inline_memset.h
+    arm/common.h
     arm/inline_memcpy.h
+    arm/inline_memset.h
     generic/aligned_access.h
     generic/byte_per_byte.h
     inline_bcmp.h

libc/src/string/memory_utils/arm/common.h

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
1+
//===-- Common constants and defines for arm --------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H
10+
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H
11+
12+
#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR
13+
#include "src/string/memory_utils/utils.h" // CPtr, Ptr, distance_to_align
14+
15+
#include <stddef.h> // size_t
16+
17+
// Our minimum supported compiler version does not recognize the standard
18+
// [[likely]] / [[unlikely]] attributes so we use the preprocessor.
19+
20+
// https://libc.llvm.org/compiler_support.html
21+
// Support for [[likely]] / [[unlikely]]
22+
// [X] GCC 12.2
23+
// [X] Clang 12
24+
// [ ] Clang 11
25+
#define LIBC_ATTR_LIKELY [[likely]]
26+
#define LIBC_ATTR_UNLIKELY [[unlikely]]
27+
28+
#if defined(LIBC_COMPILER_IS_CLANG)
29+
#if LIBC_COMPILER_CLANG_VER < 1200
30+
#undef LIBC_ATTR_LIKELY
31+
#undef LIBC_ATTR_UNLIKELY
32+
#define LIBC_ATTR_LIKELY
33+
#define LIBC_ATTR_UNLIKELY
34+
#endif
35+
#endif
36+
37+
namespace LIBC_NAMESPACE_DECL {
38+
39+
LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
40+
41+
enum class AssumeAccess { kUnknown, kAligned };
42+
enum class BlockOp { kFull, kByWord };
43+
44+
LIBC_INLINE auto misaligned(CPtr ptr) {
45+
return distance_to_align_down<kWordSize>(ptr);
46+
}
47+
48+
LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
49+
return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
50+
cpp::bit_cast<uintptr_t>(b));
51+
}
52+
53+
} // namespace LIBC_NAMESPACE_DECL
54+
55+
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H
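These macros are meant to be dropped in exactly where the C++20 statement attributes would go; on compilers without support they expand to nothing and the code still compiles. A hypothetical usage sketch (the `clear` function below is illustrative, not part of the patch):

```cpp
// Assumes LIBC_ATTR_UNLIKELY from arm/common.h is in scope; a local fallback
// is provided so the snippet stands alone.
#ifndef LIBC_ATTR_UNLIKELY
#define LIBC_ATTR_UNLIKELY [[unlikely]]
#endif

// Illustrative function, not part of the patch: the macro sits exactly where
// the C++20 [[unlikely]] statement attribute would, between the condition and
// the compound statement.
inline void clear(char *dst, unsigned size) {
  if (size == 0)
    LIBC_ATTR_UNLIKELY { return; }
  while (size--)
    *dst++ = 0;
}
```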

libc/src/string/memory_utils/arm/inline_memcpy.h

Lines changed: 95 additions & 98 deletions
@@ -5,63 +5,57 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// The functions defined in this file give approximate code size. These sizes
+// assume the following configuration options:
+// - LIBC_CONF_KEEP_FRAME_POINTER = false
+// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false
+// - LIBC_ADD_NULL_CHECKS = false
 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
 
+#include "src/__support/CPP/type_traits.h" // always_false
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL
+#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY
 #include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align
 
 #include <stddef.h> // size_t
 
-// https://libc.llvm.org/compiler_support.html
-// Support for [[likely]] / [[unlikely]]
-//   [X] GCC 12.2
-//   [X] Clang 12
-//   [ ] Clang 11
-#define LIBC_ATTR_LIKELY [[likely]]
-#define LIBC_ATTR_UNLIKELY [[unlikely]]
-
-#if defined(LIBC_COMPILER_IS_CLANG)
-#if LIBC_COMPILER_CLANG_VER < 1200
-#undef LIBC_ATTR_LIKELY
-#undef LIBC_ATTR_UNLIKELY
-#define LIBC_ATTR_LIKELY
-#define LIBC_ATTR_UNLIKELY
-#endif
-#endif
-
 namespace LIBC_NAMESPACE_DECL {
 
 namespace {
 
-LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
-
-enum Strategy {
-  ForceWordLdStChain,
-  AssumeWordAligned,
-  AssumeUnaligned,
-};
+// Performs a copy of `bytes` byte from `src` to `dst`. This function has the
+// semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is
+// free to use whatever instruction is best for the size and assumed access.
+template <size_t bytes, AssumeAccess access>
+LIBC_INLINE void copy(void *dst, const void *src) {
+  if constexpr (access == AssumeAccess::kAligned) {
+    constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes;
+    memcpy_inline<bytes>(assume_aligned<alignment>(dst),
+                         assume_aligned<alignment>(src));
+  } else if constexpr (access == AssumeAccess::kUnknown) {
+    memcpy_inline<bytes>(dst, src);
+  } else {
+    static_assert(cpp::always_false<decltype(access)>, "Invalid AssumeAccess");
+  }
+}
 
-template <size_t bytes, Strategy strategy = AssumeUnaligned>
-LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
-  if constexpr (strategy == AssumeUnaligned) {
-    memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
-  } else if constexpr (strategy == AssumeWordAligned) {
-    static_assert(bytes >= kWordSize);
-    memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
-                         assume_aligned<kWordSize>(src));
-  } else if constexpr (strategy == ForceWordLdStChain) {
+template <size_t bytes, BlockOp block_op = BlockOp::kFull,
+          AssumeAccess access = AssumeAccess::kUnknown>
+LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
+  if constexpr (block_op == BlockOp::kFull) {
+    copy<bytes, access>(dst, src);
+  } else if constexpr (block_op == BlockOp::kByWord) {
     // We restrict loads/stores to 4 byte to prevent the use of load/store
-    // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
-    // fault (see notes below) and second, they use more registers which in turn
-    // adds push/pop instructions in the hot path.
+    // multiple (LDM, STM) and load/store double (LDRD, STRD).
     static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
     LIBC_LOOP_UNROLL
-    for (size_t i = 0; i < bytes / kWordSize; ++i) {
-      const size_t offset = i * kWordSize;
-      memcpy_inline<kWordSize>(dst + offset, src + offset);
+    for (size_t offset = 0; offset < bytes; offset += kWordSize) {
+      copy<kWordSize, access>(dst + offset, src + offset);
     }
+  } else {
+    static_assert(cpp::always_false<decltype(block_op)>, "Invalid BlockOp");
   }
   // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
   // into the load/store instructions.
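`memcpy_inline` and `assume_aligned` are LLVM libc helpers from `memory_utils/utils.h`. As a rough standalone analogue of the aligned branch of `copy` above, an assumption-laden sketch using compiler builtins (it presumes Clang, which provides `__builtin_memcpy_inline`, rather than the libc wrappers) could look like this:

```cpp
#include <cstddef>

// Rough standalone analogue of copy<bytes, AssumeAccess::kAligned>; a sketch
// only (assumes Clang for __builtin_memcpy_inline). The real code uses LLVM
// libc's memcpy_inline / assume_aligned wrappers instead.
template <std::size_t bytes> void copy_aligned(void *dst, const void *src) {
  // bytes is expected to be 1, 2, 4 or a multiple of 4 (the Arm word size).
  constexpr std::size_t alignment = bytes > 4 ? 4 : bytes;
  // Promise the compiler both pointers are `alignment`-aligned so it is free
  // to emit word or multi-word accesses instead of byte-by-byte accesses.
  dst = __builtin_assume_aligned(dst, alignment);
  src = __builtin_assume_aligned(src, alignment);
  // Fixed-size copy that is always expanded inline, never a call to memcpy.
  __builtin_memcpy_inline(dst, src, bytes);
}
```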
@@ -72,39 +66,27 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
   src += bytes;
 }
 
-LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
-                                              const size_t size) {
+template <size_t bytes, BlockOp block_op, AssumeAccess access>
+LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) {
   LIBC_LOOP_NOUNROLL
-  for (size_t i = 0; i < size; ++i)
-    *dst++ = *src++;
+  for (size_t i = 0; i < size / bytes; ++i)
+    copy_block_and_bump_pointers<bytes, block_op, access>(dst, src);
+  size %= bytes;
 }
 
-template <size_t block_size, Strategy strategy>
-LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
-                                             size_t &size) {
+[[maybe_unused]] LIBC_INLINE void
+copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
   LIBC_LOOP_NOUNROLL
-  for (size_t i = 0; i < size / block_size; ++i)
-    copy_and_bump_pointers<block_size, strategy>(dst, src);
-  // Update `size` once at the end instead of once per iteration.
-  size %= block_size;
-}
-
-LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
-  return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
-                             cpp::bit_cast<uintptr_t>(b));
-}
-
-LIBC_INLINE auto misaligned(CPtr a) {
-  return distance_to_align_down<kWordSize>(a);
+  for (size_t i = 0; i < size; ++i)
+    *dst++ = *src++;
 }
 
 } // namespace
 
-// Implementation for Cortex-M0, M0+, M1.
-// Notes:
-//   - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
-//     that also needs to return the `dst` ptr.
-//   - These cores do not allow for unaligned loads/stores.
+// Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned
+// loads/stores. It compiles down to 208 bytes when used through `memcpy` that
+// also needs to return the `dst` ptr.
+// Note:
 //   - When `src` and `dst` are coaligned, we start by aligning them and perform
 //     bulk copies. We let the compiler know the pointers are aligned so it can
 //     use load/store multiple (LDM, STM). This significantly increase throughput
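`consume_by_block` processes as many whole blocks as fit in `size`, advances the pointers, and leaves the remainder in `size` for the next, smaller block size. For intuition, here is a hypothetical trace of the 64/16/4 cascade used in the hunks below, tracking sizes only:

```cpp
#include <cstdio>

// Hypothetical trace of the greedy 64/16/4 decomposition (no memory traffic,
// just the size bookkeeping performed by successive consume_by_block calls).
int main() {
  unsigned size = 150; // arbitrary example size
  for (unsigned block : {64u, 16u, 4u}) {
    std::printf("%u block(s) of %u bytes\n", size / block, block);
    size %= block;
  }
  std::printf("%u byte(s) left for the 1/2-byte tail\n", size);
  // Prints: 2 of 64, 1 of 16, 1 of 4, and 2 bytes left over.
  return 0;
}
```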
@@ -121,13 +103,20 @@ LIBC_INLINE auto misaligned(CPtr a) {
       copy_bytes_and_bump_pointers(dst, src, offset);
       size -= offset;
     }
+  constexpr AssumeAccess kAligned = AssumeAccess::kAligned;
   const auto src_alignment = distance_to_align_down<kWordSize>(src);
   if (src_alignment == 0)
     LIBC_ATTR_LIKELY {
       // Both `src` and `dst` are now word-aligned.
-      copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
-      copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
-      copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
+      // We first copy by blocks of 64 bytes, the compiler will use 4
+      // load/store multiple (LDM, STM), each of 4 words. This requires more
+      // registers so additional push/pop are needed but the speedup is worth
+      // it.
+      consume_by_block<64, BlockOp::kFull, kAligned>(dst, src, size);
+      // Then we use blocks of 4 word load/store.
+      consume_by_block<16, BlockOp::kByWord, kAligned>(dst, src, size);
+      // Then we use word by word copy.
+      consume_by_block<4, BlockOp::kByWord, kAligned>(dst, src, size);
     }
   else {
     // `dst` is aligned but `src` is not.
@@ -138,7 +127,7 @@ LIBC_INLINE auto misaligned(CPtr a) {
           src_alignment == 2
               ? load_aligned<uint32_t, uint16_t, uint16_t>(src)
               : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
-      memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
+      copy<kWordSize, kAligned>(dst, &value);
       dst += kWordSize;
       src += kWordSize;
       size -= kWordSize;
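`load_aligned<uint32_t, uint16_t, uint16_t>` materializes a 32-bit value from a pointer that is only 2-byte aligned by issuing two naturally aligned halfword loads. A standalone sketch of the same idea (assuming little-endian byte order, the common configuration for these Cortex-M cores; the real helper lives in `memory_utils/utils.h`):

```cpp
#include <cstdint>
#include <cstring>

// Sketch of the load_aligned<uint32_t, uint16_t, uint16_t> idea: read a word
// from a 2-byte aligned source with two aligned 16-bit loads and recombine.
// Assumes little-endian byte order.
inline uint32_t load_word_via_halfwords(const unsigned char *src) {
  uint16_t lo, hi;
  std::memcpy(&lo, src, sizeof(lo));     // aligned 16-bit load
  std::memcpy(&hi, src + 2, sizeof(hi)); // aligned 16-bit load
  return static_cast<uint32_t>(lo) | (static_cast<uint32_t>(hi) << 16);
}
```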
@@ -151,56 +140,68 @@ LIBC_INLINE auto misaligned(CPtr a) {
 }
 
 // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
-// support for unaligned loads and stores.
-// Notes:
-//   - It compiles down to 266 bytes.
-//   - `dst` and `src` are not `__restrict` to prevent the compiler from
-//     reordering loads/stores.
-//   - We keep state variables to a strict minimum to keep everything in the free
-//     registers and prevent costly push / pop.
-//   - If unaligned single loads/stores to normal memory are supported, unaligned
-//     accesses for load/store multiple (LDM, STM) and load/store double (LDRD,
-//     STRD) instructions are generally not supported and will still fault so we
-//     make sure to restrict unrolling to word loads/stores.
+// support for unaligned loads and stores. It compiles down to 272 bytes when
+// used through `memcpy` that also needs to return the `dst` ptr.
 [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
                                                             size_t size) {
   if (misaligned(bitwise_or(src, dst)))
     LIBC_ATTR_UNLIKELY {
       if (size < 8)
        LIBC_ATTR_UNLIKELY {
          if (size & 1)
-            copy_and_bump_pointers<1>(dst, src);
+            copy_block_and_bump_pointers<1>(dst, src);
          if (size & 2)
-            copy_and_bump_pointers<2>(dst, src);
+            copy_block_and_bump_pointers<2>(dst, src);
          if (size & 4)
-            copy_and_bump_pointers<4>(dst, src);
+            copy_block_and_bump_pointers<4>(dst, src);
          return;
        }
      if (misaligned(src))
        LIBC_ATTR_UNLIKELY {
          const size_t offset = distance_to_align_up<kWordSize>(dst);
          if (offset & 1)
-            copy_and_bump_pointers<1>(dst, src);
+            copy_block_and_bump_pointers<1>(dst, src);
          if (offset & 2)
-            copy_and_bump_pointers<2>(dst, src);
+            copy_block_and_bump_pointers<2>(dst, src);
          size -= offset;
        }
     }
-  copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
-  copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
-  copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
+  // `dst` and `src` are not necessarily both aligned at that point but this
+  // implementation assumes hardware support for unaligned loads and stores so
+  // it is still fast to perform unrolled word by word copy. Note that wider
+  // accesses through the use of load/store multiple (LDM, STM) and load/store
+  // double (LDRD, STRD) instructions are generally not supported and can fault.
+  // By forcing decomposition of 64 bytes copy into word by word copy, the
+  // compiler uses a load to prefetch the next cache line:
+  //   ldr r3, [r1, #64]!  <- prefetch next cache line
+  //   str r3, [r0]
+  //   ldr r3, [r1, #0x4]
+  //   str r3, [r0, #0x4]
+  //   ...
+  //   ldr r3, [r1, #0x3c]
+  //   str r3, [r0, #0x3c]
+  // This is a bit detrimental for sizes between 64 and 256 (less than 10%
+  // penalty) but the prefetch yields better throughput for larger copies.
+  constexpr AssumeAccess kUnknown = AssumeAccess::kUnknown;
+  consume_by_block<64, BlockOp::kByWord, kUnknown>(dst, src, size);
+  consume_by_block<16, BlockOp::kByWord, kUnknown>(dst, src, size);
+  consume_by_block<4, BlockOp::kByWord, kUnknown>(dst, src, size);
   if (size & 1)
-    copy_and_bump_pointers<1>(dst, src);
+    copy_block_and_bump_pointers<1>(dst, src);
   if (size & 2)
-    LIBC_ATTR_UNLIKELY
-    copy_and_bump_pointers<2>(dst, src);
+    copy_block_and_bump_pointers<2>(dst, src);
 }
 
-[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
-                                                    const void *__restrict src_,
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src,
                                                     size_t size) {
-  Ptr dst = cpp::bit_cast<Ptr>(dst_);
-  CPtr src = cpp::bit_cast<CPtr>(src_);
+  // The compiler performs alias analysis and is able to prove that `dst` and
+  // `src` do not alias by propagating the `__restrict` keyword from the
+  // `memcpy` prototype. This allows the compiler to merge consecutive
+  // load/store (LDR, STR) instructions generated in
+  // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store
+  // double (LDRD, STRD) instructions, this is undesirable so we prevent the
+  // compiler from inferring `__restrict` with the following line.
+  asm volatile("" : "+r"(dst), "+r"(src));
 #ifdef __ARM_FEATURE_UNALIGNED
   return inline_memcpy_arm_mid_end(dst, src, size);
 #else
@@ -210,8 +211,4 @@ LIBC_INLINE auto misaligned(CPtr a) {
 
 } // namespace LIBC_NAMESPACE_DECL
 
-// Cleanup local macros
-#undef LIBC_ATTR_LIKELY
-#undef LIBC_ATTR_UNLIKELY
-
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
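The empty `asm volatile` statement in `inline_memcpy_arm` above is the interesting trick here: it lists both pointers as read-write operands, so the compiler must assume their values changed across the statement and can no longer carry the `__restrict`-derived no-alias fact into the rest of the function. A minimal standalone sketch of the same barrier (illustrative only, GCC/Clang extended-asm syntax; `forget_pointer_facts` is a hypothetical name):

```cpp
// Minimal sketch: an empty inline-asm "barrier" that forces the compiler to
// treat dst and src as freshly produced values, so facts it had proven about
// the incoming pointers (such as non-aliasing) no longer apply afterwards.
inline void forget_pointer_facts(char *&dst, const char *&src) {
  asm volatile("" : "+r"(dst), "+r"(src));
}
```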
