
Commit b2d746f

Use generic SIMD intrinsics for AVX maskload and maskstore intrinsics

Parent: 05672d8
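
Note on the change: the AVX vmaskmov/vpmaskmov instructions consult only the most
significant bit of each mask element, whereas the generic simd_masked_load!/
simd_masked_store! intrinsics expect each mask lane to be all zeros or all ones.
Every rewritten function therefore first broadcasts the sign bit across its lane
with an arithmetic shift right by (lane width - 1): 63 for 64-bit lanes, 31 for
32-bit lanes. A minimal scalar sketch of that normalization step (illustrative
only, not the crate's internal API):

    // Arithmetic shift right by width-1 turns the sign bit into a
    // full-lane boolean mask: 0 if the MSB is clear, -1 (all ones) if set.
    fn broadcast_sign_bit(x: i64) -> i64 {
        x >> 63
    }

    fn main() {
        assert_eq!(broadcast_sign_bit(i64::MIN), -1);
        assert_eq!(broadcast_sign_bit(-5), -1);
        assert_eq!(broadcast_sign_bit(7), 0);
    }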

2 files changed, 32 insertions(+), 48 deletions(-)


crates/core_arch/src/x86/avx.rs

Lines changed: 16 additions & 24 deletions
@@ -1675,7 +1675,8 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
 #[cfg_attr(test, assert_instr(vmaskmovpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
-    maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
+    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
+    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_pd())
 }

 /// Stores packed double-precision (64-bit) floating-point elements from `a`
@@ -1687,7 +1688,8 @@ pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d
 #[cfg_attr(test, assert_instr(vmaskmovpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
-    maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
+    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
+    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
 }

 /// Loads packed double-precision (64-bit) floating-point elements from memory
@@ -1700,7 +1702,8 @@ pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d)
 #[cfg_attr(test, assert_instr(vmaskmovpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
-    maskloadpd(mem_addr as *const i8, mask.as_i64x2())
+    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
+    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd())
 }

 /// Stores packed double-precision (64-bit) floating-point elements from `a`
@@ -1712,7 +1715,8 @@ pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
 #[cfg_attr(test, assert_instr(vmaskmovpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
-    maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a);
+    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
+    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
 }

 /// Loads packed single-precision (32-bit) floating-point elements from memory
@@ -1725,7 +1729,8 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
 #[cfg_attr(test, assert_instr(vmaskmovps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
-    maskloadps256(mem_addr as *const i8, mask.as_i32x8())
+    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
+    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps())
 }

 /// Stores packed single-precision (32-bit) floating-point elements from `a`
@@ -1737,7 +1742,8 @@ pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256
 #[cfg_attr(test, assert_instr(vmaskmovps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
-    maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
+    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
+    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
 }

 /// Loads packed single-precision (32-bit) floating-point elements from memory
@@ -1750,7 +1756,8 @@ pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256)
 #[cfg_attr(test, assert_instr(vmaskmovps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
-    maskloadps(mem_addr as *const i8, mask.as_i32x4())
+    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
+    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps())
 }

 /// Stores packed single-precision (32-bit) floating-point elements from `a`
@@ -1762,7 +1769,8 @@ pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
 #[cfg_attr(test, assert_instr(vmaskmovps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
-    maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a);
+    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
+    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
 }

 /// Duplicate odd-indexed single-precision (32-bit) floating-point elements
@@ -3147,22 +3155,6 @@ unsafe extern "C" {
     fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
     #[link_name = "llvm.x86.avx.vpermilvar.pd"]
     fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
-    #[link_name = "llvm.x86.avx.maskload.pd.256"]
-    fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
-    #[link_name = "llvm.x86.avx.maskstore.pd.256"]
-    fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
-    #[link_name = "llvm.x86.avx.maskload.pd"]
-    fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
-    #[link_name = "llvm.x86.avx.maskstore.pd"]
-    fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
-    #[link_name = "llvm.x86.avx.maskload.ps.256"]
-    fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
-    #[link_name = "llvm.x86.avx.maskstore.ps.256"]
-    fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
-    #[link_name = "llvm.x86.avx.maskload.ps"]
-    fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
-    #[link_name = "llvm.x86.avx.maskstore.ps"]
-    fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
     #[link_name = "llvm.x86.avx.ldu.dq.256"]
     fn vlddqu(mem_addr: *const i8) -> i8x32;
     #[link_name = "llvm.x86.avx.rcp.ps.256"]

crates/core_arch/src/x86/avx2.rs

Lines changed: 16 additions & 24 deletions
@@ -1786,7 +1786,8 @@ pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
-    transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
+    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
+    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x4::ZERO).as_m128i()
 }

 /// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
@@ -1799,7 +1800,8 @@ pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
-    transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
+    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
+    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x8::ZERO).as_m256i()
 }

 /// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
@@ -1812,7 +1814,8 @@ pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m2
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
-    transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
+    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
+    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x2::ZERO).as_m128i()
 }

 /// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
@@ -1825,7 +1828,8 @@ pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
-    transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
+    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
+    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x4::ZERO).as_m256i()
 }

 /// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
@@ -1838,7 +1842,8 @@ pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m2
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
-    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
+    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
+    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4())
 }

 /// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
@@ -1851,7 +1856,8 @@ pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i)
 #[cfg_attr(test, assert_instr(vpmaskmovd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
-    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
+    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
+    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8())
 }

 /// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
@@ -1864,7 +1870,8 @@ pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m25
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
-    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
+    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
+    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2())
 }

 /// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
@@ -1877,7 +1884,8 @@ pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i)
 #[cfg_attr(test, assert_instr(vpmaskmovq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
-    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
+    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
+    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4())
 }

 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -3645,22 +3653,6 @@ unsafe extern "C" {
     fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
     #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
    fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
-    #[link_name = "llvm.x86.avx2.maskload.d"]
-    fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.maskload.d.256"]
-    fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx2.maskload.q"]
-    fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
-    #[link_name = "llvm.x86.avx2.maskload.q.256"]
-    fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
-    #[link_name = "llvm.x86.avx2.maskstore.d"]
-    fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
-    #[link_name = "llvm.x86.avx2.maskstore.d.256"]
-    fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
-    #[link_name = "llvm.x86.avx2.maskstore.q"]
-    fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
-    #[link_name = "llvm.x86.avx2.maskstore.q.256"]
-    fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
     #[link_name = "llvm.x86.avx2.mpsadbw"]
     fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
     #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
