コード例 #1
0
ファイル: csr.cpp プロジェクト: dsheffie/Toys
/**
 * Sparse matrix-vector product using AVX2 gathers: for every row i in
 * [0, n), y[i] = sum over j of x[indices[i][j]] * A[rowStart(i) + j].
 *
 * @param A        Stored values of all rows, packed back to back: row i
 *                 occupies nIdx[i] consecutive floats, starting right after
 *                 the values of row i-1.
 * @param nIdx     nIdx[i] is the number of stored values in row i.
 * @param indices  indices[i][j] is the position in x that the j-th stored
 *                 value of row i is multiplied with. No alignment is
 *                 required for the rows.
 * @param x        Dense input vector (read through gathers).
 * @param n        Number of rows.
 * @param y        Output vector; y[i] is overwritten for each row.
 *
 * Relies on sum8(), declared elsewhere in this file -- presumably the
 * horizontal sum of the eight lanes of a __m256 (TODO confirm).
 */
void avx2_csr_spmv( float *A, int32_t *nIdx, int32_t **indices, float *x, int32_t n, float *y)
{
  int32_t A_offset = 0;
  for(int32_t i = 0; i < n; i++)
    {
      const int32_t nElem = nIdx[i];
      /* Largest multiple of 8 that is <= nElem: the part handled 8-wide. */
      const int32_t smLen = nElem - (nElem & 7);
      float t = 0.0f;

      __m256 vT = _mm256_setzero_ps();

      for(int32_t j = 0; j < smLen; j+=8)
	{
	  /* Unaligned load: a row of indices is not guaranteed to start on a
	     32-byte boundary, and the aligned form would fault in that case. */
	  const __m256i vIdx =
	    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&indices[i][j]));
	  /* Scale 4 == sizeof(float): the indices are element offsets into x. */
	  const __m256 vX = _mm256_i32gather_ps(x, vIdx, 4);
	  const __m256 vA = _mm256_loadu_ps(&A[A_offset + j]);
	  vT = _mm256_add_ps(vT, _mm256_mul_ps(vX,vA));
	}

      t += sum8(vT);

      /* Scalar tail for the remaining (nElem & 7) entries of the row. */
      for(int32_t j = smLen; j < nElem; j++)
	{
	  const int32_t idx = indices[i][j];
	  t += x[idx]*A[A_offset + j];
	}

      y[i] = t;
      A_offset += nElem;
    }
}
コード例 #2
0
ファイル: avx2-builtins.c プロジェクト: CSI-LLVM/clang
// Codegen test for _mm256_i32gather_ps called with 2 as its third (scale)
// argument. The FileCheck directives inside the body expect an
// fcmp oeq / sext / bitcast sequence on 8-lane vectors, followed by a call to
// @llvm.x86.avx2.gather.d.ps.256 whose first operand is undef and whose last
// operand is i8 2.
__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
  // CHECK-LABEL: test_mm256_i32gather_ps
  // CHECK:         [[CMP:%.*]] = fcmp oeq <8 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x i32> [[SEXT]] to <8 x float>
  // CHECK: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
  return _mm256_i32gather_ps(b, c, 2);
}
コード例 #3
0
// Codegen test for _mm256_i32gather_ps called with 2 as its third (scale)
// argument. The FileCheck directives inside the body expect a call to
// @llvm.x86.avx.cmp.ps.256 with a final operand of i8 0, and then a call to
// @llvm.x86.avx2.gather.d.ps.256 whose first operand is undef and whose last
// operand is i8 2.
__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
  // CHECK-LABEL: test_mm256_i32gather_ps
  // CHECK: call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 0)
  // CHECK: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
  return _mm256_i32gather_ps(b, c, 2);
}
コード例 #4
0
void	TransLut::process_plane_flt_any_avx2 (uint8_t *dst_ptr, const uint8_t *src_ptr, int stride_dst, int stride_src, int w, int h)
{
	assert (dst_ptr != 0);
	assert (src_ptr != 0);
	assert (stride_dst != 0 || h == 1);
	assert (stride_src != 0 || h == 1);
	assert (w > 0);
	assert (h > 0);

	for (int y = 0; y < h; ++y)
	{
		const FloatIntMix *  s_ptr =
			reinterpret_cast <const FloatIntMix *> (src_ptr);
		TD *                 d_ptr =
			reinterpret_cast <               TD *> (dst_ptr);

		for (int x = 0; x < w; x += 8)
		{
			union
			{
				__m256i            _vect;
				uint32_t           _scal [8];
			}                  index;
			__m256             lerp;
			TransLut_FindIndexAvx2 <M>::find_index (s_ptr + x, index._vect, lerp);
#if 1	// Looks as fast as _mm256_set_ps
			// G++ complains about sizeof() as argument
			__m256             val = _mm256_i32gather_ps (
				&_lut.use <float> (0), index._vect, 4  // 4 == sizeof (float)
			);
			const __m256       va2 = _mm256_i32gather_ps (
				&_lut.use <float> (1), index._vect, 4  // 4 == sizeof (float)
			);
#else
			__m256             val = _mm256_set_ps (
				_lut.use <float> (index._scal [7]    ),
				_lut.use <float> (index._scal [6]    ),
				_lut.use <float> (index._scal [5]    ),
				_lut.use <float> (index._scal [4]    ),
				_lut.use <float> (index._scal [3]    ),
				_lut.use <float> (index._scal [2]    ),
				_lut.use <float> (index._scal [1]    ),
				_lut.use <float> (index._scal [0]    )
			);
			const __m256       va2 = _mm256_set_ps (
				_lut.use <float> (index._scal [7] + 1),
				_lut.use <float> (index._scal [6] + 1),
				_lut.use <float> (index._scal [5] + 1),
				_lut.use <float> (index._scal [4] + 1),
				_lut.use <float> (index._scal [3] + 1),
				_lut.use <float> (index._scal [2] + 1),
				_lut.use <float> (index._scal [1] + 1),
				_lut.use <float> (index._scal [0] + 1)
			);
#endif
			const __m256       dif = _mm256_sub_ps (va2, val);
			val = _mm256_add_ps (val, _mm256_mul_ps (dif, lerp));
			TransLut_store_avx2 (&d_ptr [x], val);
		}

		src_ptr += stride_src;
		dst_ptr += stride_dst;
	}

	_mm256_zeroupper ();	// Back to SSE state
}
コード例 #5
0
ファイル: avx2-builtins.c プロジェクト: Bigcheese/clang
// Codegen test: calls _mm256_i32gather_ps with 2 as its third (scale)
// argument; the FileCheck directive in the body only requires that
// @llvm.x86.avx2.gather.d.ps.256 appears in the output.
__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
  // CHECK: @llvm.x86.avx2.gather.d.ps.256
  return _mm256_i32gather_ps(b, c, 2);
}
コード例 #6
0
ファイル: pr59839.c プロジェクト: 0day-ci/gcc
/* Builds an index vector with every 32-bit lane set to 1 and performs an
   8-lane float gather from x with a scale argument of 1.  The result d is
   never read.
   NOTE(review): presumably a compile-only regression test (the file is named
   pr59839.c) -- confirm before changing the shape of this code.  */
void
test (const float *x)
{
  __m256i i = _mm256_set1_epi32 (1);
  __m256 d = _mm256_i32gather_ps (x, i, 1);
}
コード例 #7
0
/* Stores the result of an 8-lane float gather (scale argument 1) into x.
   x, base and idx are not declared in this block; they are expected to be
   defined elsewhere in the file.  */
void extern
avx2_test (void)
{
  x = _mm256_i32gather_ps (base, idx, 1);
}