Skip to content

Commit 8139e16

Browse files
authored
Merge pull request #198 from mayawarrier/main
Add opt-in SIMD support for char16_t
2 parents 127a6c7 + b711947 commit 8139e16

File tree

6 files changed

+173
-61
lines changed

6 files changed

+173
-61
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@ Testing/*
33
.cache/
44
compile_commands.json
55

6-
# Visual Studio
6+
# Visual studio
77
.vs/
88
Debug/
99
Release/
10+
/out/
1011
*.sln
1112
*.vcxproj
1213
*.vcxproj.filters

CONTRIBUTORS

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ Neal Richardson
55
Tim Paine
66
Fabio Pellacini
77
Lénárd Szolnoki
8-
Jan Pharago
8+
Jan Pharago
9+
Maya Warrier

include/fast_float/ascii_number.h

Lines changed: 131 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,26 @@
55
#include <cstdint>
66
#include <cstring>
77
#include <iterator>
8+
#include <type_traits>
89

910
#include "float_common.h"
1011

12+
#ifdef FASTFLOAT_SSE2
13+
#include <emmintrin.h>
14+
#endif
15+
16+
1117
namespace fast_float {
1218

19+
template <typename UC>
20+
fastfloat_really_inline constexpr bool has_simd_opt() {
21+
#ifdef FASTFLOAT_HAS_SIMD
22+
return std::is_same<UC, char16_t>::value;
23+
#else
24+
return false;
25+
#endif
26+
}
27+
1328
// Next function can be micro-optimized, but compilers are entirely
1429
// able to optimize it well.
1530
template <typename UC>
@@ -28,12 +43,14 @@ fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
2843
| (val & 0x00000000000000FF) << 56;
2944
}
3045

46+
// Read 8 UC values into a u64, one byte per value. A UC wider than char is truncated to its low 8 bits.
47+
template <typename UC>
3148
fastfloat_really_inline FASTFLOAT_CONSTEXPR20
32-
uint64_t read_u64(const char *chars) {
33-
if (cpp20_and_in_constexpr()) {
49+
uint64_t read8_to_u64(const UC *chars) {
50+
if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
3451
uint64_t val = 0;
3552
for(int i = 0; i < 8; ++i) {
36-
val |= uint64_t(*chars) << (i*8);
53+
val |= uint64_t(uint8_t(*chars)) << (i*8);
3754
++chars;
3855
}
3956
return val;
@@ -47,6 +64,39 @@ uint64_t read_u64(const char *chars) {
4764
return val;
4865
}
4966

67+
#ifdef FASTFLOAT_SSE2
68+
69+
fastfloat_really_inline
70+
uint64_t simd_read8_to_u64(const __m128i data) {
71+
FASTFLOAT_SIMD_DISABLE_WARNINGS
72+
const __m128i packed = _mm_packus_epi16(data, data);
73+
#ifdef FASTFLOAT_64BIT
74+
return uint64_t(_mm_cvtsi128_si64(packed));
75+
#else
76+
uint64_t value;
77+
// Visual Studio + older versions of GCC don't support _mm_storeu_si64
78+
_mm_storel_epi64(reinterpret_cast<__m128i*>(&value), packed);
79+
return value;
80+
#endif
81+
FASTFLOAT_SIMD_RESTORE_WARNINGS
82+
}
83+
84+
fastfloat_really_inline
85+
uint64_t simd_read8_to_u64(const char16_t* chars) {
86+
FASTFLOAT_SIMD_DISABLE_WARNINGS
87+
return simd_read8_to_u64(_mm_loadu_si128(reinterpret_cast<const __m128i*>(chars)));
88+
FASTFLOAT_SIMD_RESTORE_WARNINGS
89+
}
90+
91+
#endif
92+
93+
// Dummy overload for UC types without a SIMD path; it exists only so that generic callers compile.
94+
template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
95+
uint64_t simd_read8_to_u64(UC const*) {
96+
return 0;
97+
}
98+
99+
50100
fastfloat_really_inline FASTFLOAT_CONSTEXPR20
51101
void write_u64(uint8_t *chars, uint64_t val) {
52102
if (cpp20_and_in_constexpr()) {
@@ -76,40 +126,80 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) {
76126
return uint32_t(val);
77127
}
78128

79-
fastfloat_really_inline constexpr
80-
uint32_t parse_eight_digits_unrolled(const char16_t *) noexcept {
81-
return 0;
82-
}
83-
84-
fastfloat_really_inline constexpr
85-
uint32_t parse_eight_digits_unrolled(const char32_t *) noexcept {
86-
return 0;
87-
}
88129

130+
// Call this if chars are definitely 8 digits.
131+
template <typename UC>
89132
fastfloat_really_inline FASTFLOAT_CONSTEXPR20
90-
uint32_t parse_eight_digits_unrolled(const char *chars) noexcept {
91-
return parse_eight_digits_unrolled(read_u64(chars));
133+
uint32_t parse_eight_digits_unrolled(UC const * chars) noexcept {
134+
if (cpp20_and_in_constexpr() || !has_simd_opt<UC>()) {
135+
return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay
136+
}
137+
return parse_eight_digits_unrolled(simd_read8_to_u64(chars));
92138
}
93139

140+
94141
// credit @aqrit
95-
fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept {
142+
fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept {
96143
return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
97144
0x8080808080808080));
98145
}
99146

100-
fastfloat_really_inline constexpr
101-
bool is_made_of_eight_digits_fast(const char16_t *) noexcept {
102-
return false;
147+
148+
#ifdef FASTFLOAT_HAS_SIMD
149+
150+
// Call this if chars might not be 8 digits.
151+
// Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled())
152+
// ensures we don't load SIMD registers twice.
153+
fastfloat_really_inline FASTFLOAT_CONSTEXPR20
154+
bool simd_parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept {
155+
if (cpp20_and_in_constexpr()) {
156+
return false;
157+
}
158+
#ifdef FASTFLOAT_SSE2
159+
FASTFLOAT_SIMD_DISABLE_WARNINGS
160+
const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(chars));
161+
162+
// (x - '0') <= 9
163+
// http://0x80.pl/articles/simd-parsing-int-sequences.html
164+
const __m128i t0 = _mm_add_epi16(data, _mm_set1_epi16(32720));
165+
const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-32759));
166+
167+
if (_mm_movemask_epi8(t1) == 0) {
168+
i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
169+
return true;
170+
}
171+
else return false;
172+
FASTFLOAT_SIMD_RESTORE_WARNINGS
173+
#endif
103174
}
104175

105-
fastfloat_really_inline constexpr
106-
bool is_made_of_eight_digits_fast(const char32_t *) noexcept {
107-
return false;
176+
#endif
177+
178+
// Dummy overload for UC types without a SIMD path; it exists only so that generic callers compile.
179+
template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
180+
uint64_t simd_parse_if_eight_digits_unrolled(UC const*, uint64_t&) {
181+
return 0;
182+
}
183+
184+
185+
template <typename UC, FASTFLOAT_ENABLE_IF(!std::is_same<UC, char>::value)>
186+
fastfloat_really_inline FASTFLOAT_CONSTEXPR20
187+
void loop_parse_if_eight_digits(const UC*& p, const UC* const pend, uint64_t& i) {
188+
if (!has_simd_opt<UC>()) {
189+
return;
190+
}
191+
while ((std::distance(p, pend) >= 8) && simd_parse_if_eight_digits_unrolled(p, i)) { // in rare cases, this will overflow, but that's ok
192+
p += 8;
193+
}
108194
}
109195

110196
fastfloat_really_inline FASTFLOAT_CONSTEXPR20
111-
bool is_made_of_eight_digits_fast(const char *chars) noexcept {
112-
return is_made_of_eight_digits_fast(read_u64(chars));
197+
void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t& i) {
198+
// Optimizes better than simd_parse_if_eight_digits_unrolled() for UC = char.
199+
while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(read8_to_u64(p))) {
200+
i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(p)); // in rare cases, this will overflow, but that's ok
201+
p += 8;
202+
}
113203
}
114204

115205
template <typename UC>
@@ -124,8 +214,10 @@ struct parsed_number_string_t {
124214
span<const UC> integer{}; // non-nullable
125215
span<const UC> fraction{}; // nullable
126216
};
127-
using byte_span = span<char>;
217+
218+
using byte_span = span<const char>;
128219
using parsed_number_string = parsed_number_string_t<char>;
220+
129221
// Assuming that you use no more than 19 digits, this will
130222
// parse an ASCII string.
131223
template <typename UC>
@@ -171,12 +263,8 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
171263
UC const * before = p;
172264
// can occur at most twice without overflowing, but let it occur more, since
173265
// for integers with many digits, digit parsing is the primary bottleneck.
174-
if (std::is_same<UC,char>::value) {
175-
while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
176-
i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
177-
p += 8;
178-
}
179-
}
266+
loop_parse_if_eight_digits(p, pend, i);
267+
180268
while ((p != pend) && is_integer(*p)) {
181269
uint8_t digit = uint8_t(*p - UC('0'));
182270
++p;
@@ -241,29 +329,31 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
241329
if(*start == UC('0')) { digit_count --; }
242330
start++;
243331
}
332+
244333
if (digit_count > 19) {
245334
answer.too_many_digits = true;
246335
// Let us start again, this time, avoiding overflows.
247336
// We don't need to check if is_integer, since we use the
248337
// pre-tokenized spans from above.
249338
i = 0;
250339
p = answer.integer.ptr;
251-
UC const * int_end = p + answer.integer.len();
252-
const uint64_t minimal_nineteen_digit_integer{1000000000000000000};
253-
while((i < minimal_nineteen_digit_integer) && (p != int_end)) {
340+
UC const* int_end = p + answer.integer.len();
341+
const uint64_t minimal_nineteen_digit_integer{ 1000000000000000000 };
342+
while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
254343
i = i * 10 + uint64_t(*p - UC('0'));
255344
++p;
256345
}
257346
if (i >= minimal_nineteen_digit_integer) { // We have a big integer
258347
exponent = end_of_integer_part - p + exp_number;
259-
} else { // We have a value with a fractional component.
260-
p = answer.fraction.ptr;
261-
UC const * frac_end = p + answer.fraction.len();
262-
while((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
263-
i = i * 10 + uint64_t(*p - UC('0'));
264-
++p;
265-
}
266-
exponent = answer.fraction.ptr - p + exp_number;
348+
}
349+
else { // We have a value with a fractional component.
350+
p = answer.fraction.ptr;
351+
UC const* frac_end = p + answer.fraction.len();
352+
while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
353+
i = i * 10 + uint64_t(*p - UC('0'));
354+
++p;
355+
}
356+
exponent = answer.fraction.ptr - p + exp_number;
267357
}
268358
// We have now corrected both exponent and i, to a truncated value
269359
}

include/fast_float/digit_comparison.h

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -201,18 +201,10 @@ bool is_truncated(span<const UC> s) noexcept {
201201
return is_truncated(s.ptr, s.ptr + s.len());
202202
}
203203

204-
fastfloat_really_inline FASTFLOAT_CONSTEXPR20
205-
void parse_eight_digits(const char16_t*& , limb& , size_t& , size_t& ) noexcept {
206-
// currently unused
207-
}
208-
209-
fastfloat_really_inline FASTFLOAT_CONSTEXPR20
210-
void parse_eight_digits(const char32_t*& , limb& , size_t& , size_t& ) noexcept {
211-
// currently unused
212-
}
213204

205+
template <typename UC>
214206
fastfloat_really_inline FASTFLOAT_CONSTEXPR20
215-
void parse_eight_digits(const char*& p, limb& value, size_t& counter, size_t& count) noexcept {
207+
void parse_eight_digits(const UC*& p, limb& value, size_t& counter, size_t& count) noexcept {
216208
value = value * 100000000 + parse_eight_digits_unrolled(p);
217209
p += 8;
218210
counter += 8;
@@ -264,10 +256,8 @@ void parse_mantissa(bigint& result, parsed_number_string_t<UC>& num, size_t max_
264256
skip_zeros(p, pend);
265257
// process all digits, in increments of step per loop
266258
while (p != pend) {
267-
if (std::is_same<UC,char>::value) {
268-
while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) {
269-
parse_eight_digits(p, value, counter, digits);
270-
}
259+
while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) {
260+
parse_eight_digits(p, value, counter, digits);
271261
}
272262
while (counter < step && p != pend && digits < max_digits) {
273263
parse_one_digit(p, value, counter, digits);
@@ -299,10 +289,8 @@ void parse_mantissa(bigint& result, parsed_number_string_t<UC>& num, size_t max_
299289
}
300290
// process all digits, in increments of step per loop
301291
while (p != pend) {
302-
if (std::is_same<UC,char>::value) {
303-
while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) {
304-
parse_eight_digits(p, value, counter, digits);
305-
}
292+
while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) {
293+
parse_eight_digits(p, value, counter, digits);
306294
}
307295
while (counter < step && p != pend && digits < max_digits) {
308296
parse_one_digit(p, value, counter, digits);

include/fast_float/float_common.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,34 @@ using parse_options = parse_options_t<char>;
115115
#endif
116116
#endif
117117

118+
#if defined(__SSE2__) || \
119+
(defined(FASTFLOAT_VISUAL_STUDIO) && \
120+
(defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)))
121+
#define FASTFLOAT_SSE2 1
122+
#endif
123+
124+
#ifdef FASTFLOAT_SSE2
125+
#define FASTFLOAT_HAS_SIMD 1
126+
#endif
127+
128+
#if defined(__GNUC__)
129+
// disable -Wcast-align=strict (GCC only)
130+
#define FASTFLOAT_SIMD_DISABLE_WARNINGS \
131+
_Pragma("GCC diagnostic push") \
132+
_Pragma("GCC diagnostic ignored \"-Wcast-align\"")
133+
#else
134+
#define FASTFLOAT_SIMD_DISABLE_WARNINGS
135+
#endif
136+
137+
#if defined(__GNUC__)
138+
#define FASTFLOAT_SIMD_RESTORE_WARNINGS \
139+
_Pragma("GCC diagnostic pop")
140+
#else
141+
#define FASTFLOAT_SIMD_RESTORE_WARNINGS
142+
#endif
143+
144+
145+
118146
#ifdef FASTFLOAT_VISUAL_STUDIO
119147
#define fastfloat_really_inline __forceinline
120148
#else
@@ -132,6 +160,9 @@ using parse_options = parse_options_t<char>;
132160
// rust style `try!()` macro, or `?` operator
133161
#define FASTFLOAT_TRY(x) { if (!(x)) return false; }
134162

163+
#define FASTFLOAT_ENABLE_IF(...) typename std::enable_if<(__VA_ARGS__), int>::type = 0
164+
165+
135166
namespace fast_float {
136167

137168
fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() {

include/fast_float/parse_number.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ from_chars_result_t<UC> from_chars_advanced(UC const * first, UC const * last,
166166
if (!pns.valid) {
167167
return detail::parse_infnan(first, last, value);
168168
}
169+
169170
answer.ec = std::errc(); // be optimistic
170171
answer.ptr = pns.lastmatch;
171172
// The implementation of Clinger's fast path is convoluted because

0 commit comments

Comments
 (0)