// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+ // The functions defined in this file give approximate code size. These sizes
+ // assume the following configuration options:
+ // - LIBC_CONF_KEEP_FRAME_POINTER = false
+ // - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false
+ // - LIBC_ADD_NULL_CHECKS = false
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H

+ #include "src/__support/CPP/type_traits.h" // always_false
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL
+ #include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY
#include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align

#include <stddef.h> // size_t

- // https://libc.llvm.org/compiler_support.html
- // Support for [[likely]] / [[unlikely]]
- // [X] GCC 12.2
- // [X] Clang 12
- // [ ] Clang 11
- #define LIBC_ATTR_LIKELY [[likely]]
- #define LIBC_ATTR_UNLIKELY [[unlikely]]
-
- #if defined(LIBC_COMPILER_IS_CLANG)
- #if LIBC_COMPILER_CLANG_VER < 1200
- #undef LIBC_ATTR_LIKELY
- #undef LIBC_ATTR_UNLIKELY
- #define LIBC_ATTR_LIKELY
- #define LIBC_ATTR_UNLIKELY
- #endif
- #endif
-
namespace LIBC_NAMESPACE_DECL {

namespace {

- LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
-
- enum Strategy {
-   ForceWordLdStChain,
-   AssumeWordAligned,
-   AssumeUnaligned,
- };
+ // Performs a copy of `bytes` bytes from `src` to `dst`. This function has the
+ // semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is
+ // free to use whatever instruction is best for the size and assumed access.
+ template <size_t bytes, AssumeAccess access>
+ LIBC_INLINE void copy(void *dst, const void *src) {
+ if constexpr (access == AssumeAccess::kAligned) {
+ constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes;
+ memcpy_inline<bytes>(assume_aligned<alignment>(dst),
+ assume_aligned<alignment>(src));
+ } else if constexpr (access == AssumeAccess::kUnknown) {
+ memcpy_inline<bytes>(dst, src);
+ } else {
+ static_assert(cpp::always_false<decltype(access)>, "Invalid AssumeAccess");
+ }
+ }
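// Illustrative sketch, not part of the patch (the function name below is
// hypothetical): how the `copy` helper above is meant to be instantiated.
// With `kAligned` the `assume_aligned` hint allows wider instructions such as
// load/store multiple (LDM, STM); with `kUnknown` no alignment is assumed.
[[maybe_unused]] LIBC_INLINE void copy16_example(Ptr dst, CPtr src) {
  copy<16, AssumeAccess::kAligned>(dst, src); // may lower to LDM/STM
  copy<16, AssumeAccess::kUnknown>(dst, src); // no alignment assumption
}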

- template <size_t bytes, Strategy strategy = AssumeUnaligned>
- LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
- if constexpr (strategy == AssumeUnaligned) {
- memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
- } else if constexpr (strategy == AssumeWordAligned) {
- static_assert(bytes >= kWordSize);
- memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
- assume_aligned<kWordSize>(src));
- } else if constexpr (strategy == ForceWordLdStChain) {
+ template <size_t bytes, BlockOp block_op = BlockOp::kFull,
+ AssumeAccess access = AssumeAccess::kUnknown>
+ LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
+ if constexpr (block_op == BlockOp::kFull) {
+ copy<bytes, access>(dst, src);
+ } else if constexpr (block_op == BlockOp::kByWord) {
// We restrict loads/stores to 4 bytes to prevent the use of load/store
- // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
- // fault (see notes below) and second, they use more registers which in turn
- // adds push/pop instructions in the hot path.
+ // multiple (LDM, STM) and load/store double (LDRD, STRD).
static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
LIBC_LOOP_UNROLL
- for (size_t i = 0; i < bytes / kWordSize; ++i) {
- const size_t offset = i * kWordSize;
- memcpy_inline<kWordSize>(dst + offset, src + offset);
+ for (size_t offset = 0; offset < bytes; offset += kWordSize) {
+ copy<kWordSize, access>(dst + offset, src + offset);
}
+ } else {
+ static_assert(cpp::always_false<decltype(block_op)>, "Invalid BlockOp");
}
// In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
// into the load/store instructions.
@@ -72,39 +66,27 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
src += bytes;
}

- LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
- const size_t size) {
+ template <size_t bytes, BlockOp block_op, AssumeAccess access>
+ LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) {
LIBC_LOOP_NOUNROLL
- for (size_t i = 0; i < size; ++i)
- *dst++ = *src++;
+ for (size_t i = 0; i < size / bytes; ++i)
+ copy_block_and_bump_pointers<bytes, block_op, access>(dst, src);
+ size %= bytes;
}
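// Illustrative example (not from the original source): a call such as
// `consume_by_block<16, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src, size)`
// with `size == 22` copies one 16-byte block and leaves `size == 6` for the
// smaller block sizes used further down.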

- template <size_t block_size, Strategy strategy>
- LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
- size_t &size) {
+ [[maybe_unused]] LIBC_INLINE void
+ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
LIBC_LOOP_NOUNROLL
- for (size_t i = 0; i < size / block_size; ++i)
- copy_and_bump_pointers<block_size, strategy>(dst, src);
- // Update `size` once at the end instead of once per iteration.
- size %= block_size;
- }
-
- LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
- return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
- cpp::bit_cast<uintptr_t>(b));
- }
-
- LIBC_INLINE auto misaligned(CPtr a) {
- return distance_to_align_down<kWordSize>(a);
+ for (size_t i = 0; i < size; ++i)
+ *dst++ = *src++;
}

} // namespace

- // Implementation for Cortex-M0, M0+, M1.
- // Notes:
- // - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
- // that also needs to return the `dst` ptr.
- // - These cores do not allow for unaligned loads/stores.
+ // Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned
+ // loads/stores. It compiles down to 208 bytes when used through `memcpy` that
+ // also needs to return the `dst` ptr.
+ // Note:
// - When `src` and `dst` are coaligned, we start by aligning them and perform
// bulk copies. We let the compiler know the pointers are aligned so it can
// use load/store multiple (LDM, STM). This significantly increases throughput
@@ -121,13 +103,20 @@ LIBC_INLINE auto misaligned(CPtr a) {
copy_bytes_and_bump_pointers(dst, src, offset);
size -= offset;
}
+ constexpr AssumeAccess kAligned = AssumeAccess::kAligned;
const auto src_alignment = distance_to_align_down<kWordSize>(src);
if (src_alignment == 0)
LIBC_ATTR_LIKELY {
// Both `src` and `dst` are now word-aligned.
- copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
- copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
- copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
+ // We first copy by blocks of 64 bytes: the compiler will use 4 load/store
+ // multiple (LDM, STM), each of 4 words. This requires more registers, so
+ // additional push/pop are needed, but the speedup is worth it.
+ consume_by_block<64, BlockOp::kFull, kAligned>(dst, src, size);
+ // Then we use blocks of 4-word load/store.
+ consume_by_block<16, BlockOp::kByWord, kAligned>(dst, src, size);
+ // Then we use word-by-word copy.
+ consume_by_block<4, BlockOp::kByWord, kAligned>(dst, src, size);
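// Illustrative walk-through (not in the original source): with size == 150,
// the cascade above copies two 64-byte blocks (size becomes 22), then one
// 16-byte block (size becomes 6), then one word (size becomes 2), leaving a
// sub-word remainder for the byte-wise tail handling.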
}
else {
// `dst` is aligned but `src` is not.
@@ -138,7 +127,7 @@ LIBC_INLINE auto misaligned(CPtr a) {
src_alignment == 2
? load_aligned<uint32_t, uint16_t, uint16_t>(src)
: load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
- memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
+ copy<kWordSize, kAligned>(dst, &value);
dst += kWordSize;
src += kWordSize;
size -= kWordSize;
@@ -151,56 +140,68 @@ LIBC_INLINE auto misaligned(CPtr a) {
}

// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
- // support for unaligned loads and stores.
- // Notes:
- // - It compiles down to 266 bytes.
- // - `dst` and `src` are not `__restrict` to prevent the compiler from
- // reordering loads/stores.
- // - We keep state variables to a strict minimum to keep everything in the free
- // registers and prevent costly push / pop.
- // - If unaligned single loads/stores to normal memory are supported, unaligned
- // accesses for load/store multiple (LDM, STM) and load/store double (LDRD,
- // STRD) instructions are generally not supported and will still fault so we
- // make sure to restrict unrolling to word loads/stores.
+ // support for unaligned loads and stores. It compiles down to 272 bytes when
+ // used through `memcpy` that also needs to return the `dst` ptr.
[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
size_t size) {
if (misaligned(bitwise_or(src, dst)))
LIBC_ATTR_UNLIKELY {
if (size < 8)
LIBC_ATTR_UNLIKELY {
if (size & 1)
- copy_and_bump_pointers<1>(dst, src);
+ copy_block_and_bump_pointers<1>(dst, src);
if (size & 2)
- copy_and_bump_pointers<2>(dst, src);
+ copy_block_and_bump_pointers<2>(dst, src);
if (size & 4)
- copy_and_bump_pointers<4>(dst, src);
+ copy_block_and_bump_pointers<4>(dst, src);
return;
}
if (misaligned(src))
LIBC_ATTR_UNLIKELY {
const size_t offset = distance_to_align_up<kWordSize>(dst);
if (offset & 1)
- copy_and_bump_pointers<1>(dst, src);
+ copy_block_and_bump_pointers<1>(dst, src);
if (offset & 2)
- copy_and_bump_pointers<2>(dst, src);
+ copy_block_and_bump_pointers<2>(dst, src);
size -= offset;
}
}
- copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
- copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
- copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
+ // `dst` and `src` are not necessarily both aligned at this point, but this
+ // implementation assumes hardware support for unaligned loads and stores, so
+ // an unrolled word-by-word copy is still fast. Note that wider accesses
+ // through load/store multiple (LDM, STM) and load/store double (LDRD, STRD)
+ // instructions are generally not supported for unaligned addresses and can
+ // fault. By forcing the decomposition of the 64-byte copy into word-by-word
+ // copies, the compiler uses a load to prefetch the next cache line:
+ // ldr r3, [r1, #64]! <- prefetch next cache line
+ // str r3, [r0]
+ // ldr r3, [r1, #0x4]
+ // str r3, [r0, #0x4]
+ // ...
+ // ldr r3, [r1, #0x3c]
+ // str r3, [r0, #0x3c]
+ // This is slightly detrimental for sizes between 64 and 256 bytes (less than
+ // a 10% penalty) but the prefetch yields better throughput for larger copies.
+ constexpr AssumeAccess kUnknown = AssumeAccess::kUnknown;
+ consume_by_block<64, BlockOp::kByWord, kUnknown>(dst, src, size);
+ consume_by_block<16, BlockOp::kByWord, kUnknown>(dst, src, size);
+ consume_by_block<4, BlockOp::kByWord, kUnknown>(dst, src, size);
if (size & 1)
- copy_and_bump_pointers<1>(dst, src);
+ copy_block_and_bump_pointers<1>(dst, src);
if (size & 2)
- LIBC_ATTR_UNLIKELY
- copy_and_bump_pointers<2>(dst, src);
+ copy_block_and_bump_pointers<2>(dst, src);
}

- [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
- const void *__restrict src_,
+ [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src,
size_t size) {
- Ptr dst = cpp::bit_cast<Ptr>(dst_);
- CPtr src = cpp::bit_cast<CPtr>(src_);
+ // The compiler performs alias analysis and is able to prove that `dst` and
+ // `src` do not alias by propagating the `__restrict` keyword from the
+ // `memcpy` prototype. This allows the compiler to merge consecutive
+ // load/store (LDR, STR) instructions generated in
+ // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store
+ // double (LDRD, STRD) instructions, which is undesirable, so we prevent the
+ // compiler from inferring `__restrict` with the following line.
+ asm volatile("" : "+r"(dst), "+r"(src));
#ifdef __ARM_FEATURE_UNALIGNED
return inline_memcpy_arm_mid_end(dst, src, size);
#else
@@ -210,8 +211,4 @@ LIBC_INLINE auto misaligned(CPtr a) {

} // namespace LIBC_NAMESPACE_DECL

- // Cleanup local macros
- #undef LIBC_ATTR_LIKELY
- #undef LIBC_ATTR_UNLIKELY
-
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H