// Sparse matrix-vector product y = A * x for a CSR-style matrix, using AVX2.
//
// A        packed nonzero values, rows stored back to back
// nIdx     nIdx[i] = number of nonzeros in row i
// indices  indices[i][j] = column index of the j-th nonzero of row i
// x        dense input vector (gathered through the column indices)
// n        number of rows
// y        output vector; y[i] receives the dot product of row i with x
//
// Relies on an externally defined sum8() helper that horizontally adds the
// eight lanes of a __m256.
void avx2_csr_spmv( float *A, int32_t *nIdx, int32_t **indices, float *x, int32_t n, float *y) {
    int32_t A_offset = 0;  // offset of row i's first nonzero within A
    for(int32_t i = 0; i < n; i++) {
        int32_t nElem = nIdx[i];
        float t = 0.0f;
        __m256 vT = _mm256_setzero_ps();
        // Largest multiple of 8 <= nElem: the vectorized portion of the row.
        int32_t smLen = nElem - (nElem & 7);
        for(int32_t j = 0; j < smLen; j+=8) {
            // FIX: indices[i] carries no 32-byte alignment guarantee, so the
            // aligned _mm256_load_si256 could fault; use the unaligned load,
            // matching the loadu already used for A below.
            __m256i vIdx = _mm256_loadu_si256((__m256i const*)&(indices[i][j]));
            __m256 vX = _mm256_i32gather_ps((float const*)x, vIdx, 4);
            __m256 vA = _mm256_loadu_ps(&A[A_offset + j]);
            vT = _mm256_add_ps(vT, _mm256_mul_ps(vX, vA));
        }
        t += sum8(vT);  // horizontal sum of the 8 accumulator lanes
        // Scalar tail: the remaining nElem % 8 nonzeros.
        for(int32_t j = smLen; j < nElem; j++) {
            int32_t idx = indices[i][j];
            t += x[idx] * A[A_offset + j];
        }
        y[i] = t;
        A_offset += nElem;
    }
}
// Clang CodeGen regression test (run through the FileCheck tool). The
// maskless gather intrinsic is implemented in the header with an all-ones
// mask; the directives below require that mask to be emitted as a native
// fcmp-oeq / sext / bitcast sequence feeding the gather builtin, and the
// final scale immediate to be 2 (matching the source-level scale argument).
// NOTE: the comment lines starting with CHECK are test directives — do not
// reword or reorder them.
__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
  // CHECK-LABEL: test_mm256_i32gather_ps
  // CHECK: [[CMP:%.*]] = fcmp oeq <8 x float>
  // CHECK-NEXT: [[SEXT:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32>
  // CHECK-NEXT: [[BC:%.*]] = bitcast <8 x i32> [[SEXT]] to <8 x float>
  // CHECK: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
  return _mm256_i32gather_ps(b, c, 2);
}
// Older variant of the same Clang CodeGen regression test: here the
// header's all-ones mask construction is expected to lower to the
// avx.cmp.ps.256 intrinsic (rather than a native fcmp sequence) before
// feeding the gather builtin with scale immediate 2.
// NOTE: the comment lines starting with CHECK are test directives — do not
// reword or reorder them.
__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
  // CHECK-LABEL: test_mm256_i32gather_ps
  // CHECK: call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 0)
  // CHECK: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
  return _mm256_i32gather_ps(b, c, 2);
}
// Applies the lookup table to one plane of float-typed pixels, 8 pixels at a
// time, using AVX2 gathers plus linear interpolation between adjacent LUT
// entries. TD (destination sample type) and M (index-finding policy) are
// template parameters of the enclosing class, not visible in this chunk.
//
// dst_ptr / src_ptr      base pointers of the destination / source plane
// stride_dst / stride_src row strides in bytes (may be 0 only when h == 1)
// w, h                    plane width in pixels and height in rows
//
// NOTE(review): the inner loop steps x by 8 without a tail case, so when w is
// not a multiple of 8 it reads and writes up to 7 elements past w — presumably
// the planes are padded to guarantee this is safe; confirm against callers.
void TransLut::process_plane_flt_any_avx2 (uint8_t *dst_ptr, const uint8_t *src_ptr, int stride_dst, int stride_src, int w, int h)
{
	assert (dst_ptr != 0);
	assert (src_ptr != 0);
	assert (stride_dst != 0 || h == 1);
	assert (stride_src != 0 || h == 1);
	assert (w > 0);
	assert (h > 0);

	for (int y = 0; y < h; ++y)
	{
		const FloatIntMix * s_ptr = reinterpret_cast <const FloatIntMix *> (src_ptr);
		TD * d_ptr = reinterpret_cast < TD *> (dst_ptr);
		for (int x = 0; x < w; x += 8)
		{
			// index: LUT cell index per pixel, accessible both as a vector
			// (for the gather) and as scalars (for the _mm256_set_ps branch).
			union { __m256i _vect; uint32_t _scal [8]; } index;
			__m256 lerp;  // per-pixel fractional position within the LUT cell
			TransLut_FindIndexAvx2 <M>::find_index (s_ptr + x, index._vect, lerp);
#if 1 // Looks as fast as _mm256_set_ps
			// G++ complains about sizeof() as argument
			// Gather the LUT entry at each index (val) and its successor
			// (va2), the two endpoints of the interpolation.
			__m256 val = _mm256_i32gather_ps (
				&_lut.use <float> (0), index._vect, 4 // 4 == sizeof (float)
			);
			const __m256 va2 = _mm256_i32gather_ps (
				&_lut.use <float> (1), index._vect, 4 // 4 == sizeof (float)
			);
#else
			// Scalar fallback: build the same two vectors element by element.
			__m256 val = _mm256_set_ps (
				_lut.use <float> (index._scal [7] ),
				_lut.use <float> (index._scal [6] ),
				_lut.use <float> (index._scal [5] ),
				_lut.use <float> (index._scal [4] ),
				_lut.use <float> (index._scal [3] ),
				_lut.use <float> (index._scal [2] ),
				_lut.use <float> (index._scal [1] ),
				_lut.use <float> (index._scal [0] )
			);
			const __m256 va2 = _mm256_set_ps (
				_lut.use <float> (index._scal [7] + 1),
				_lut.use <float> (index._scal [6] + 1),
				_lut.use <float> (index._scal [5] + 1),
				_lut.use <float> (index._scal [4] + 1),
				_lut.use <float> (index._scal [3] + 1),
				_lut.use <float> (index._scal [2] + 1),
				_lut.use <float> (index._scal [1] + 1),
				_lut.use <float> (index._scal [0] + 1)
			);
#endif
			// Linear interpolation: val + (va2 - val) * lerp.
			const __m256 dif = _mm256_sub_ps (va2, val);
			val = _mm256_add_ps (val, _mm256_mul_ps (dif, lerp));
			TransLut_store_avx2 (&d_ptr [x], val);
		}
		src_ptr += stride_src;
		dst_ptr += stride_dst;
	}
	_mm256_zeroupper (); // Back to SSE state
}
// Minimal CodeGen regression test: only requires that the source-level
// gather lowers to the avx2.gather.d.ps.256 builtin, without pinning the
// mask construction or the operand details.
// NOTE: the comment line starting with CHECK is a test directive — do not
// reword it.
__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
  // CHECK: @llvm.x86.avx2.gather.d.ps.256
  return _mm256_i32gather_ps(b, c, 2);
}
// Compile-only exercise of _mm256_i32gather_ps: gathers through a broadcast
// index vector (all lanes = 1) with the minimum scale of 1 byte. The result
// is intentionally unused — presumably this is a compiler test that only
// checks the intrinsic compiles / emits the expected instruction; confirm
// against the surrounding test harness.
void test (const float *x)
{
  __m256i i = _mm256_set1_epi32 (1);
  __m256 d = _mm256_i32gather_ps (x, i, 1);
}
// Codegen test body (gcc-style target test, by the look of the "void extern"
// specifier order): x, base and idx are file-scope globals declared outside
// this chunk. A scale argument of 1 selects byte-granularity index scaling
// for the gather. NOTE(review): types of x/base/idx are not visible here —
// presumably __m256, float const * and __m256i respectively; confirm in the
// full test file.
void extern avx2_test (void)
{
  x = _mm256_i32gather_ps (base, idx, 1);
}