/**
 * Write all 32 float lanes to memory using streaming (non-temporal)
 * stores: val1 goes to data[0..15], val2 to data[16..31].
 *
 * @param data destination; SHORTVEC_ASSERT_ALIGNED is invoked on it
 *             with an alignment of 64.
 */
inline void store_nt(float *data) const
 {
     SHORTVEC_ASSERT_ALIGNED(data, 64);

     float *const lower_half = data;
     float *const upper_half = data + 16;

     _mm512_stream_ps(lower_half, val1);
     _mm512_stream_ps(upper_half, val2);
 }
 /**
  * Fill both registers from aligned memory: val1 receives
  * data[0..15], val2 receives data[16..31].
  *
  * @param data source; SHORTVEC_ASSERT_ALIGNED is invoked on it
  *             with an alignment of 64.
  */
 inline void load_aligned(const float *data)
 {
     SHORTVEC_ASSERT_ALIGNED(data, 64);

     const float *const lower_half = &data[0];
     const float *const upper_half = &data[16];

     val1 = _mm512_load_ps(lower_half);
     val2 = _mm512_load_ps(upper_half);
 }
 /**
  * Scatter the 32 float lanes to ptr at the positions given by offsets.
  * Lane i of val1 uses offsets[i], lane i of val2 uses offsets[16 + i].
  * The scale passed to the scatter intrinsic is 4 bytes per index.
  *
  * @param ptr     base address the indices are applied to.
  * @param offsets 32 indices; SHORTVEC_ASSERT_ALIGNED is invoked on
  *                this pointer with an alignment of 64.
  */
 inline void scatter(float *ptr, const int *offsets) const
 {
     SHORTVEC_ASSERT_ALIGNED(offsets, 64);

     // Each index block is loaded right before its scatter, so the
     // second load still happens after the first scatter has been issued.
     const __m512i lower_idx = _mm512_load_epi32(offsets);
     _mm512_i32scatter_ps(ptr, lower_idx, val1, 4);

     const __m512i upper_idx = _mm512_load_epi32(&offsets[16]);
     _mm512_i32scatter_ps(ptr, upper_idx, val2, 4);
 }
 /**
  * Gather 32 floats from ptr at the positions given by offsets.
  * offsets[0..15] select the values for val1, offsets[16..31] those
  * for val2. The scale passed to the gather intrinsic is 4 bytes
  * per index.
  *
  * @param ptr     base address the indices are applied to.
  * @param offsets 32 indices; SHORTVEC_ASSERT_ALIGNED is invoked on
  *                this pointer with an alignment of 64.
  */
 inline void gather(const float *ptr, const int *offsets)
 {
     SHORTVEC_ASSERT_ALIGNED(offsets, 64);

     const __m512i lower_idx = _mm512_load_epi32(offsets);
     val1 = _mm512_i32gather_ps(lower_idx, ptr, 4);

     const __m512i upper_idx = _mm512_load_epi32(&offsets[16]);
     val2 = _mm512_i32gather_ps(upper_idx, ptr, 4);
 }
 /**
  * Write the register val to memory using a streaming (non-temporal)
  * store.
  *
  * @param data destination; SHORTVEC_ASSERT_ALIGNED is invoked on it
  *             with an alignment of 64.
  */
 inline void store_nt(double *data) const
 {
     SHORTVEC_ASSERT_ALIGNED(data, 64);

     double *const dest = data;
     _mm512_stream_pd(dest, val);
 }
 /**
  * Fill the register val from aligned memory.
  *
  * @param data source; SHORTVEC_ASSERT_ALIGNED is invoked on it
  *             with an alignment of 64.
  */
 inline void load_aligned(const double *data)
 {
     SHORTVEC_ASSERT_ALIGNED(data, 64);

     const __m512d loaded = _mm512_load_pd(data);
     val = loaded;
 }