
Commit 4b8842c

degasus authored and serge-sans-paille committed
avx512f: implement slide_left and slide_right.
With a fast path for N = 4*i and a split version otherwise (inspired by avx512bw). As vpermd actually has lower latency than vpermw / vpermb on some Intel CPUs, it is also used instead of the avx512bw and avx512vbmi implementations where trivially possible. The order of the slow path is also reversed for lower latency. It used to be:

```
SLR -> PERM -> OR -> PERM
SLL ->
```

Now the latency is reduced to:

```
SLR -> PERM -> OR
SLL -> PERM ->
```

This should also generate better code on avx512bw for N = 63, with only one PERM (as already done for N = 1). For N = 16, 32, 48, vshufi32x4 is preferred over vpermd for lower latency on Zen 4 and reduced register usage.
1 parent 631294a · commit 4b8842c
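The split described in the commit message can be modeled with plain scalar code. The sketch below is an illustration only (a minimal C++17 model; `Reg`, `slide_left_aligned_u32`, and the loops are stand-ins, not xsimd code): it treats a 512-bit register as sixteen little-endian 32-bit lanes and shows how an unaligned byte slide decomposes into two lane-aligned slides of pre-shifted inputs, merged by a single OR, which is the shortened SLR/SLL dependency chain shown above.

```cpp
// Illustrative scalar model (C++17), not xsimd code.
#include <array>
#include <cstdint>
#include <cstdio>
#include <cstring>

using Reg = std::array<uint32_t, 16>; // stand-in for one 512-bit register

// Fast path: NN is a multiple of 4, so whole 32-bit lanes move up and the
// vacated low lanes are zeroed (the real code uses vpermd / vshufi32x4).
template <size_t NN>
Reg slide_left_aligned_u32(const Reg& x)
{
    Reg r {};
    for (size_t i = NN / 4; i < r.size(); ++i)
        r[i] = x[i - NN / 4];
    return r;
}

// General slide: SLL -> PERM and SLR -> PERM run independently, then one OR.
template <size_t N>
Reg slide_left(const Reg& x)
{
    constexpr size_t NN = N & ~size_t(3);
    if constexpr (N == NN)
    {
        return slide_left_aligned_u32<NN>(x);
    }
    else
    {
        Reg xl = x, xr = x;
        for (auto& v : xl)
            v <<= 8 * (N - NN); // bytes that stay inside their 32-bit lane
        for (auto& v : xr)
            v >>= 32 - 8 * (N - NN); // bytes that cross into the next lane
        Reg l = slide_left_aligned_u32<NN>(xl);
        Reg r = slide_left_aligned_u32<NN + 4>(xr);
        Reg out;
        for (size_t i = 0; i < out.size(); ++i)
            out[i] = l[i] | r[i];
        return out;
    }
}

int main()
{
    Reg x;
    for (uint32_t i = 0; i < x.size(); ++i)
        x[i] = 0x03020100u + 0x04040404u * i; // byte b of x holds the value b

    Reg y = slide_left<6>(x); // byte b of y should hold b - 6, zeros below
    uint8_t bytes[64];
    std::memcpy(bytes, y.data(), sizeof bytes);
    std::printf("%u %u %u\n", bytes[5], bytes[6], bytes[7]); // prints 0 0 1
}
```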

File tree

- include/xsimd/arch/xsimd_avx512bw.hpp
- include/xsimd/arch/xsimd_avx512f.hpp
- include/xsimd/arch/xsimd_avx512vbmi.hpp
- test/test_shuffle.cpp

4 files changed: +113 -147 lines changed

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 9 additions & 97 deletions
@@ -484,113 +484,25 @@ namespace xsimd
         }
 
         // slide_left
-        namespace detail
-        {
-            struct make_slide_perm_hi
-            {
-                static constexpr uint64_t get(size_t i, size_t)
-                {
-                    return i == 0 ? 8 : i - 1;
-                }
-            };
-
-            template <size_t N>
-            struct make_slide_left_pattern
-            {
-                static constexpr uint16_t get(size_t i, size_t)
-                {
-                    return i >= N ? i - N : 0;
-                }
-            };
-        }
-
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) == 2 && (N < 64)>::type>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            if (N == 0)
-            {
-                return x;
-            }
-            if (N >= 64)
-            {
-                return batch<T, A>(T(0));
-            }
-            batch<T, A> xx;
-            if (N & 1)
-            {
-                alignas(A::alignment()) uint64_t buffer[8];
-                _mm512_store_epi64(&buffer[0], x);
-                for (int i = 7; i > 0; --i)
-                    buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56);
-                buffer[0] = buffer[0] << 8;
-                xx = _mm512_load_epi64(&buffer[0]);
+            static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency.");
 
-                auto slide_perm = xsimd::make_batch_constant<uint64_t, detail::make_slide_perm_hi, A>();
-                __m512i xl = _mm512_slli_epi64(x, 8);
-                __m512i xr = _mm512_srli_epi64(x, 56);
-                xr = _mm512_permutex2var_epi64(xr, slide_perm.as_batch(), _mm512_setzero_si512());
-                xx = _mm512_or_si512(xr, xl);
-                if (N == 1)
-                    return xx;
-            }
-            else
-            {
-                xx = x;
-            }
             __mmask32 mask = 0xFFFFFFFFu << ((N / 2) & 31);
-            auto slide_pattern = xsimd::make_batch_constant<uint16_t, detail::make_slide_left_pattern<N / 2>, A>();
-            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), xx);
+            auto slide_pattern = make_batch_constant<uint16_t, detail::make_slide_left_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x);
         }
 
         // slide_right
-        namespace detail
-        {
-            struct make_slide_perm_low
-            {
-                static constexpr uint64_t get(size_t i, size_t)
-                {
-                    return i + 1;
-                }
-            };
-
-            template <size_t N>
-            struct make_slide_right_pattern
-            {
-                static constexpr uint16_t get(size_t i, size_t n)
-                {
-                    return i < (n - N) ? i + N : 0;
-                }
-            };
-        }
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) == 2 && (N < 64)>::type>
         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            if (N == 0)
-            {
-                return x;
-            }
-            if (N >= 64)
-            {
-                return batch<T, A>(T(0));
-            }
-            batch<T, A> xx;
-            if (N & 1)
-            {
-                auto slide_perm = xsimd::make_batch_constant<uint64_t, detail::make_slide_perm_low, A>();
-                __m512i xr = _mm512_srli_epi64(x, 8);
-                __m512i xl = _mm512_slli_epi64(x, 56);
-                xl = _mm512_permutex2var_epi64(xl, slide_perm.as_batch(), _mm512_setzero_si512());
-                xx = _mm512_or_si512(xr, xl);
-                if (N == 1)
-                    return xx;
-            }
-            else
-            {
-                xx = x;
-            }
+            static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency.");
+
             __mmask32 mask = 0xFFFFFFFFu >> ((N / 2) & 31);
-            auto slide_pattern = xsimd::make_batch_constant<uint16_t, detail::make_slide_right_pattern<N / 2>, A>();
-            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), xx);
+            auto slide_pattern = make_batch_constant<uint16_t, detail::make_slide_right_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x);
         }
 
         // ssub
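For the word-granular path that remains in avx512bw (N & 3 == 2), the byte count is halved and applied to the 32 16-bit words of the register: the low N / 2 words are zeroed by the mask and every other word is sourced N / 2 positions lower. Below is a minimal compile-time spot-check for N = 6, mirroring the mask and pattern values from the diff above (the helper name is ours, not xsimd's):

```cpp
#include <cstddef>
#include <cstdint>

// Same index rule as detail::make_slide_left_pattern<N / 2> in the diff above,
// written out for a quick compile-time check; not part of xsimd.
constexpr uint16_t slide_left_word_index(size_t i, size_t words_shift)
{
    return i >= words_shift ? uint16_t(i - words_shift) : 0;
}

// N = 6 bytes -> shift by 6 / 2 = 3 of the 32 16-bit words.
static_assert((0xFFFFFFFFu << ((6 / 2) & 31)) == 0xFFFFFFF8u,
              "words 0..2 are zeroed by the mask");
static_assert(slide_left_word_index(3, 6 / 2) == 0,
              "output word 3 is sourced from input word 0");
static_assert(slide_left_word_index(31, 6 / 2) == 28,
              "output word 31 is sourced from input word 28");
```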

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 97 additions & 6 deletions
@@ -1859,19 +1859,110 @@ namespace xsimd
         }
 
         // slide_left
+        namespace detail
+        {
+            template <size_t N>
+            struct make_slide_left_pattern
+            {
+                static constexpr size_t get(size_t i, size_t)
+                {
+                    return i >= N ? i - N : 0;
+                }
+            };
+
+            template <size_t N, class A, class T>
+            XSIMD_INLINE batch<T, A> slide_left_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+            {
+                static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+                if (N == 0)
+                {
+                    return x;
+                }
+                if (N >= 64)
+                {
+                    return batch<T, A>(T(0));
+                }
+
+                __mmask16 mask = uint16_t(0xFFFFu << (N / 4));
+
+                if ((N & 15) == 0)
+                {
+                    const uint8_t imm8 = uint8_t(0xe4 << (2 * (N / 16)));
+                    return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+                }
+
+                auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_left_pattern<N / 4>, A>();
+                return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+            }
+        }
+
         template <size_t N, class A, class T>
-        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
+        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512f>) noexcept
         {
-            static_assert(N == 0xDEAD, "not implemented yet");
-            return {};
+            constexpr size_t NN = N & ~3;
+            if (N == NN || NN >= 64)
+            {
+                // Call fast path
+                return detail::slide_left_aligned_u32<NN>(x, A {});
+            }
+
+            __m512i xl = detail::slide_left_aligned_u32<NN, A, T>(_mm512_slli_epi32(x, 8 * (N - NN)), A {});
+            __m512i xr = detail::slide_left_aligned_u32<NN + 4, A, T>(_mm512_srli_epi32(x, 32 - 8 * (N - NN)), A {});
+            return _mm512_or_epi32(xl, xr);
         }
 
         // slide_right
+        namespace detail
+        {
+            template <size_t N>
+            struct make_slide_right_pattern
+            {
+                static constexpr size_t get(size_t i, size_t n)
+                {
+                    return i < (n - N) ? i + N : 0;
+                }
+            };
+
+            template <size_t N, class A, class T>
+            XSIMD_INLINE batch<T, A> slide_right_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+            {
+                static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+                if (N == 0)
+                {
+                    return x;
+                }
+                if (N >= 64)
+                {
+                    return batch<T, A>(T(0));
+                }
+
+                __mmask16 mask = 0xFFFFu >> (N / 4);
+
+                if ((N & 15) == 0)
+                {
+                    const uint8_t imm8 = 0xe4 >> (2 * (N / 16));
+                    return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+                }
+
+                auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_right_pattern<N / 4>, A>();
+                return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+            }
+        }
         template <size_t N, class A, class T>
-        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
+        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512f>) noexcept
         {
-            static_assert(N == 0xDEAD, "not implemented yet");
-            return {};
+            constexpr size_t NN = N & ~3;
+            if (N == NN || NN >= 64)
+            {
+                // Call fast path
+                return detail::slide_right_aligned_u32<NN>(x, A {});
+            }
+
+            __m512i xl = detail::slide_right_aligned_u32<NN + 4, A, T>(_mm512_slli_epi32(x, 32 - 8 * (N - NN)), A {});
+            __m512i xr = detail::slide_right_aligned_u32<NN, A, T>(_mm512_srli_epi32(x, 8 * (N - NN)), A {});
+            return _mm512_or_epi32(xl, xr);
         }
 
         // sqrt
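For context, a hedged usage sketch of how these kernels are reached from user code; it assumes an AVX-512F build (e.g. compiled with -mavx512f) and the public xsimd::slide_left entry point, which slides the whole batch towards higher byte addresses and dispatches to the arch-specific implementations above.

```cpp
#include <cstdint>
#include <cstdio>
#include <xsimd/xsimd.hpp>

int main()
{
    using batch_t = xsimd::batch<uint32_t, xsimd::avx512f>; // 16 lanes of 32 bits

    alignas(64) uint32_t in[16];
    for (uint32_t i = 0; i < 16; ++i)
        in[i] = i + 1;
    batch_t x = batch_t::load_aligned(in);

    // N = 8 bytes is a multiple of 4: handled entirely by the aligned fast path.
    batch_t a = xsimd::slide_left<8>(x);
    // N = 6 bytes (N & 3 == 2): avx512bw's 16-bit permute if available,
    // otherwise the avx512f split path added in this commit.
    batch_t b = xsimd::slide_left<6>(x);

    alignas(64) uint32_t out[16];
    a.store_aligned(out);
    std::printf("a[2] = %u\n", out[2]); // 1: lane 2 now holds the old lane 0
    b.store_aligned(out);
    std::printf("b[2] = 0x%08x\n", out[2]); // 0x00020000: old bytes moved up by 6
}
```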

include/xsimd/arch/xsimd_avx512vbmi.hpp

Lines changed: 7 additions & 41 deletions
@@ -24,59 +24,25 @@ namespace xsimd
     {
         using namespace types;
 
-        namespace detail
-        {
-            template <size_t N>
-            struct make_slide_left_bytes_pattern
-            {
-                static constexpr uint8_t get(size_t i, size_t)
-                {
-                    return i >= N ? i - N : 0;
-                }
-            };
-
-            template <size_t N>
-            struct make_slide_right_bytes_pattern
-            {
-                static constexpr uint8_t get(size_t i, size_t n)
-                {
-                    return i < (n - N) ? i + N : 0;
-                }
-            };
-        }
-
         // slide_left
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && (N < 64)>::type>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
         {
-            if (N == 0)
-            {
-                return x;
-            }
-            if (N >= 64)
-            {
-                return batch<T, A>(T(0));
-            }
+            static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");
 
             __mmask64 mask = 0xFFFFFFFFFFFFFFFFull << (N & 63);
-            auto slide_pattern = xsimd::make_batch_constant<uint8_t, detail::make_slide_left_bytes_pattern<N>, A>();
+            auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_left_pattern<N>, A>();
            return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
         }
 
         // slide_right
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && (N < 64)>::type>
         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
         {
-            if (N == 0)
-            {
-                return x;
-            }
-            if (N >= 64)
-            {
-                return batch<T, A>(T(0));
-            }
+            static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");
+
             __mmask64 mask = 0xFFFFFFFFFFFFFFFFull >> (N & 63);
-            auto slide_pattern = xsimd::make_batch_constant<uint8_t, detail::make_slide_right_bytes_pattern<N>, A>();
+            auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_right_pattern<N>, A>();
             return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
         }
 
test/test_shuffle.cpp

Lines changed: 0 additions & 3 deletions
@@ -166,7 +166,6 @@ namespace
     };
 }
 
-#if !XSIMD_WITH_AVX512F || XSIMD_WITH_AVX512BW
 template <class B>
 struct slide_test : public init_slide_base<typename B::value_type, B::size>
 {
@@ -270,8 +269,6 @@ TEST_CASE_TEMPLATE("[slide]", B, BATCH_INT_TYPES)
     }
 }
 
-#endif
-
 template <class B>
 struct compress_test
 {
