/**
 * Write both 512-bit halves of this vector to memory using streaming
 * (non-temporal) stores: val1 goes to data[0..15], val2 to data[16..31].
 *
 * @param data destination for 32 floats; SHORTVEC_ASSERT_ALIGNED is
 *             invoked on it with an alignment of 64.
 */
inline void store_nt(float *data) const
{
    SHORTVEC_ASSERT_ALIGNED(data, 64);

    float *lower_half = data;
    float *upper_half = data + 16;

    _mm512_stream_ps(lower_half, val1);
    _mm512_stream_ps(upper_half, val2);
}
/**
 * Fill both 512-bit halves of this vector from memory with aligned loads:
 * val1 receives data[0..15], val2 receives data[16..31].
 *
 * @param data source of 32 floats; SHORTVEC_ASSERT_ALIGNED is invoked on
 *             it with an alignment of 64.
 */
inline void load_aligned(const float *data)
{
    SHORTVEC_ASSERT_ALIGNED(data, 64);

    const float *lower_half = data;
    const float *upper_half = data + 16;

    val1 = _mm512_load_ps(lower_half);
    val2 = _mm512_load_ps(upper_half);
}
/**
 * Scatter the 32 floats of this vector to ptr at the given offsets.
 * The first 16 offsets address the lanes of val1, the next 16 those of
 * val2. Offsets are scaled by 4 bytes, i.e. they count float elements.
 *
 * @param ptr     base address the offsets are relative to.
 * @param offsets 32 integer indices; SHORTVEC_ASSERT_ALIGNED is invoked
 *                on this array with an alignment of 64.
 */
inline void scatter(float *ptr, const int *offsets) const
{
    SHORTVEC_ASSERT_ALIGNED(offsets, 64);

    // Lower half: lanes 0..15 use offsets[0..15].
    const __m512i lower_idx = _mm512_load_epi32(offsets);
    _mm512_i32scatter_ps(ptr, lower_idx, val1, 4);

    // Upper half: lanes 16..31 use offsets[16..31]. These indices are
    // loaded only after the first scatter has been issued.
    const __m512i upper_idx = _mm512_load_epi32(offsets + 16);
    _mm512_i32scatter_ps(ptr, upper_idx, val2, 4);
}
/**
 * Gather 32 floats from ptr at the given offsets into this vector.
 * The first 16 offsets fill val1, the next 16 fill val2. Offsets are
 * scaled by 4 bytes, i.e. they count float elements.
 *
 * @param ptr     base address the offsets are relative to.
 * @param offsets 32 integer indices; SHORTVEC_ASSERT_ALIGNED is invoked
 *                on this array with an alignment of 64.
 */
inline void gather(const float *ptr, const int *offsets)
{
    SHORTVEC_ASSERT_ALIGNED(offsets, 64);

    // Lower half: lanes 0..15 come from offsets[0..15].
    const __m512i lower_idx = _mm512_load_epi32(offsets);
    val1 = _mm512_i32gather_ps(lower_idx, ptr, 4);

    // Upper half: lanes 16..31 come from offsets[16..31].
    const __m512i upper_idx = _mm512_load_epi32(offsets + 16);
    val2 = _mm512_i32gather_ps(upper_idx, ptr, 4);
}
/**
 * Write the 8 doubles held in val to memory using a streaming
 * (non-temporal) store.
 *
 * @param data destination for 8 doubles; SHORTVEC_ASSERT_ALIGNED is
 *             invoked on it with an alignment of 64.
 */
inline void store_nt(double *data) const
{
    SHORTVEC_ASSERT_ALIGNED(data, 64);

    double *destination = data;
    _mm512_stream_pd(destination, val);
}
/**
 * Fill val with 8 doubles read from memory using an aligned load.
 *
 * @param data source of 8 doubles; SHORTVEC_ASSERT_ALIGNED is invoked on
 *             it with an alignment of 64.
 */
inline void load_aligned(const double *data)
{
    SHORTVEC_ASSERT_ALIGNED(data, 64);

    const double *source = data;
    val = _mm512_load_pd(source);
}