Example #1
void NBodyAlgorithmCPU::calculateAcceleration(const float3 (&posI)[8], const float massJ, const float3 posJ, float *accI) {
    // _mm256_set_ps takes its arguments from the highest lane down, so lane i holds posI[i].
    __m256 pix = _mm256_set_ps(posI[7].x, posI[6].x, posI[5].x, posI[4].x, posI[3].x, posI[2].x, posI[1].x, posI[0].x);
    __m256 piy = _mm256_set_ps(posI[7].y, posI[6].y, posI[5].y, posI[4].y, posI[3].y, posI[2].y, posI[1].y, posI[0].y);
    __m256 piz = _mm256_set_ps(posI[7].z, posI[6].z, posI[5].z, posI[4].z, posI[3].z, posI[2].z, posI[1].z, posI[0].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs)); // massJ / r^3, softened by EPS2

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    // accI must point to 24 floats with 32-byte alignment (aligned stores: x lanes, then y, then z).
    _mm256_store_ps(accI, aix);
    _mm256_store_ps(accI + 8, aiy);
    _mm256_store_ps(accI + 16, aiz);

}
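
A note on lane order: _mm256_set_ps takes its eight arguments from the highest lane down to lane 0, which is why Example #1 lists posI[7]..posI[0] while Example #2 lists posI[0]..posI[7] and then compensates with a reversed store index. A minimal standalone sketch of the ordering (an illustration, not project code; compile with -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    float out[8];
    __m256 a = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);  /* first argument lands in the highest lane */
    _mm256_storeu_ps(out, a);
    for (int i = 0; i < 8; i++) printf("%.0f ", out[i]); /* prints: 0 1 2 3 4 5 6 7 */
    printf("\n");
    __m256 b = _mm256_setr_ps(7, 6, 5, 4, 3, 2, 1, 0);  /* setr variant: first argument lands in lane 0 */
    _mm256_storeu_ps(out, b);
    for (int i = 0; i < 8; i++) printf("%.0f ", out[i]); /* prints: 7 6 5 4 3 2 1 0 */
    printf("\n");
    return 0;
}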
Example #2
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8]) {
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    // .m256_f32 is MSVC-specific lane access; posI was passed in 0..7 argument order,
    // so lane i holds body 7 - i, hence the reversed index on the store.
    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }

}
Example #3
double compute_pi_leibniz_avx_opt_single(size_t n)
{
	double pi = 0.0;
	register __m256 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
	register __m256 ymm9, ymm10, ymm11, ymm12, ymm13;

	ymm0 = _mm256_set_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
	ymm1 = _mm256_set_ps(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
	ymm2 = _mm256_set_ps(17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, 31.0);
	ymm3 = _mm256_set_ps(33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0);
	ymm4 = _mm256_set_ps(49.0, 51.0, 53.0, 55.0, 57.0, 59.0, 61.0, 63.0);
	ymm13 = _mm256_set1_ps(64.0);

	ymm5 = _mm256_setzero_ps();
	ymm6 = _mm256_setzero_ps();
	ymm7 = _mm256_setzero_ps();
	ymm8 = _mm256_setzero_ps();
	
	for (size_t i = 0; i + 32 <= n; i += 32) { /* 32 series terms per iteration; guard avoids size_t underflow when n < 32 */
		ymm9 = _mm256_div_ps(ymm0, ymm1);
		ymm1 = _mm256_add_ps(ymm1, ymm13);
		ymm10 = _mm256_div_ps(ymm0, ymm2);
		ymm2 = _mm256_add_ps(ymm2, ymm13);
		ymm11 = _mm256_div_ps(ymm0, ymm3);
		ymm3 = _mm256_add_ps(ymm3, ymm13);
		ymm12 = _mm256_div_ps(ymm0, ymm4);
		ymm4 = _mm256_add_ps(ymm4, ymm13);

		ymm5 = _mm256_add_ps(ymm5, ymm9);
		ymm6 = _mm256_add_ps(ymm6, ymm10);
		ymm7 = _mm256_add_ps(ymm7, ymm11);
		ymm8 = _mm256_add_ps(ymm8, ymm12);
	}
	float tmp[8] __attribute__((aligned(32)));
	_mm256_store_ps(tmp, ymm5);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \
	      tmp[4] + tmp[5] + tmp[6] + tmp[7];
	_mm256_store_ps(tmp, ymm6);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \
	      tmp[4] + tmp[5] + tmp[6] + tmp[7];
	_mm256_store_ps(tmp, ymm7);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \
	      tmp[4] + tmp[5] + tmp[6] + tmp[7];
	_mm256_store_ps(tmp, ymm8);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \
	      tmp[4] + tmp[5] + tmp[6] + tmp[7];

	return pi * 4.0;
}
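
For reference, a scalar version of the series being vectorized above (a sketch, not from the original source; it sums term-by-term, so its rounding differs from the four float accumulators):

#include <stddef.h>

/* pi/4 = 1 - 1/3 + 1/5 - 1/7 + ...  (Leibniz series, n terms) */
double compute_pi_leibniz_scalar(size_t n)
{
	double pi = 0.0;
	for (size_t k = 0; k < n; k++)
		pi += ((k % 2 == 0) ? 1.0 : -1.0) / (2.0 * k + 1.0);
	return pi * 4.0;
}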
Example #4
static void quantize_block(const float *in_data, float *out_data, float *quant_tbl)
{
	int zigzag;

	__m256 result, dct_values, quant_values;
	__m256 factor = _mm256_set1_ps(0.25f);

	for (zigzag = 0; zigzag < 64; zigzag += 8)
	{
		// Set the dct_values for the current iteration
		dct_values = _mm256_set_ps(in_data[UV_indexes[zigzag + 7]], in_data[UV_indexes[zigzag + 6]],
				in_data[UV_indexes[zigzag + 5]], in_data[UV_indexes[zigzag + 4]],
				in_data[UV_indexes[zigzag + 3]], in_data[UV_indexes[zigzag + 2]],
				in_data[UV_indexes[zigzag + 1]], in_data[UV_indexes[zigzag]]);

		// Multiply with 0.25 to divide by 4.0
		result = _mm256_mul_ps(dct_values, factor);

		// Load quant-values and multiply with previous product
		quant_values = _mm256_load_ps(quant_tbl + zigzag);
		result = _mm256_div_ps(result, quant_values);

		// Round off values and store in out_data buffer
		result = c63_mm256_roundhalfawayfromzero_ps(result);
		_mm256_store_ps(out_data + zigzag, result);
	}
}
Example #5
void
mandel_avx(unsigned char *image, const struct spec *s)
{
    __m256 xmin = _mm256_set1_ps(s->xlim[0]);
    __m256 ymin = _mm256_set1_ps(s->ylim[0]);
    __m256 xscale = _mm256_set1_ps((s->xlim[1] - s->xlim[0]) / s->width);
    __m256 yscale = _mm256_set1_ps((s->ylim[1] - s->ylim[0]) / s->height);
    __m256 threshold = _mm256_set1_ps(4);
    __m256 one = _mm256_set1_ps(1);
    __m256 iter_scale = _mm256_set1_ps(1.0f / s->iterations);
    __m256 depth_scale = _mm256_set1_ps(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 8) {
            __m256 mx = _mm256_set_ps(x + 7, x + 6, x + 5, x + 4,
                                      x + 3, x + 2, x + 1, x + 0);
            __m256 my = _mm256_set1_ps(y);
            __m256 cr = _mm256_add_ps(_mm256_mul_ps(mx, xscale), xmin);
            __m256 ci = _mm256_add_ps(_mm256_mul_ps(my, yscale), ymin);
            __m256 zr = cr;
            __m256 zi = ci;
            int k = 1;
            __m256 mk = _mm256_set1_ps(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m256 zr2 = _mm256_mul_ps(zr, zr);
                __m256 zi2 = _mm256_mul_ps(zi, zi);
                __m256 zrzi = _mm256_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm256_add_ps(_mm256_sub_ps(zr2, zi2), cr);
                zi = _mm256_add_ps(_mm256_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm256_mul_ps(zr, zr);
                zi2 = _mm256_mul_ps(zi, zi);
                __m256 mag2 = _mm256_add_ps(zr2, zi2);
                __m256 mask = _mm256_cmp_ps(mag2, threshold, _CMP_LT_OS);
                mk = _mm256_add_ps(_mm256_and_ps(mask, one), mk);

                /* Early bailout? */
                if (_mm256_testz_ps(mask, _mm256_set1_ps(-1)))
                    break;
            }
            mk = _mm256_mul_ps(mk, iter_scale);
            mk = _mm256_sqrt_ps(mk);
            mk = _mm256_mul_ps(mk, depth_scale);
            __m256i pixels = _mm256_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 8; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
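
The per-lane iteration count is maintained with a compare mask: _mm256_cmp_ps produces all-ones lanes where |z|^2 is still below the threshold, so ANDing the mask with 1.0f adds exactly one iteration to the lanes that have not yet escaped. A scalar sketch of that bookkeeping (illustrative only, not project code):

/* Scalar equivalent of mk = (mask & 1.0f) + mk for one loop iteration. */
static void count_active_lanes(const float zr[8], const float zi[8], float mk[8])
{
    for (int lane = 0; lane < 8; lane++) {
        float mag2 = zr[lane] * zr[lane] + zi[lane] * zi[lane];
        if (mag2 < 4.0f)      /* all-ones mask lane -> contributes 1.0f */
            mk[lane] += 1.0f; /* escaped lanes contribute 0.0f */
    }
}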
Example #6
void static
avx_test (void)
{
  int i;
  union256 u, s1, s2;
  float e[8];

  s1.x = _mm256_set_ps (24.43, 68.346, 43.35, 546.46, 46.79, 82.78, 82.7, 9.4);
  s2.x = _mm256_set_ps (1.17, 2.16, 3.15, 4.14, 5.13, 6.12, 7.11, 8.9);
  u.x = _mm256_div_ps (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    e[i] = s1.a[i] / s2.a[i];

  if (check_union256 (u, e))
    abort ();
}
Example #7
void NBodyAlgorithmCPU::calculateAccelerationWithColor(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8], unsigned int(&numNeighbours)[8]) {
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 cmpDistance = _mm256_set1_ps(float(mp_properties->positionScale));
    __m256 close = _mm256_cmp_ps(rabs, cmpDistance, _CMP_LE_OS); // predicate 2 == _CMP_LE_OS (less-or-equal, ordered, signaling)

    for (int i = 0; i < 8; i++) {
        // A false compare lane is all-zero bits (reads back as 0.0f); .m256_f32 is MSVC-specific.
        if (close.m256_f32[i] == 0) {
            numNeighbours[7 - i] = 0;
        }
    }

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }

}
Example #8
/* AVX implementation for Minimum Mean Squared Error (MMSE) solver */
inline void srslte_mat_2x2_mmse_avx(__m256 y0, __m256 y1, __m256 h00, __m256 h01, __m256 h10, __m256 h11,
                                        __m256 *x0, __m256 *x1, float noise_estimate, float norm) {
  __m256 _noise_estimate = _mm256_set_ps(0.0f, noise_estimate, 0.0f, noise_estimate,
                                         0.0f, noise_estimate, 0.0f, noise_estimate);
  __m256 _norm = _mm256_set1_ps(norm);

  /* Create conjugated matrix */
  __m256 _h00 = _MM256_CONJ_PS(h00);
  __m256 _h01 = _MM256_CONJ_PS(h01);
  __m256 _h10 = _MM256_CONJ_PS(h10);
  __m256 _h11 = _MM256_CONJ_PS(h11);

  /* 1. A = H' x H + No*/
#ifdef LV_HAVE_FMA
  __m256 a00 = _MM256_SQMOD_ADD_PS(h00, h10, _noise_estimate);
  __m256 a01 = _MM256_PROD_ADD_PS(_h00, h01, _MM256_PROD_PS(_h10, h11));
  __m256 a10 = _MM256_PROD_ADD_PS(_h01, h00, _MM256_PROD_PS(_h11, h10));
  __m256 a11 = _MM256_SQMOD_ADD_PS(h01, h11, _noise_estimate);
#else
  __m256 a00 = _mm256_add_ps(_MM256_SQMOD_PS(h00, h10), _noise_estimate);
  __m256 a01 = _mm256_add_ps(_MM256_PROD_PS(_h00, h01), _MM256_PROD_PS(_h10, h11));
  __m256 a10 = _mm256_add_ps(_MM256_PROD_PS(_h01, h00), _MM256_PROD_PS(_h11, h10));
  __m256 a11 = _mm256_add_ps(_MM256_SQMOD_PS(h01, h11), _noise_estimate);
#endif /* LV_HAVE_FMA */

  /* 2. B = inv(H' x H + No) = inv(A) */
  __m256 b00 = a11;
  __m256 b01 = _mm256_xor_ps(a01, _mm256_set1_ps(-0.0f));
  __m256 b10 = _mm256_xor_ps(a10, _mm256_set1_ps(-0.0f));
  __m256 b11 = a00;
  _norm = _mm256_mul_ps(_norm, srslte_mat_cf_recip_avx(srslte_mat_2x2_det_avx(a00, a01, a10, a11)));


  /* 3. W = inv(H' x H + No) x H' = B x H' */
#ifdef LV_HAVE_FMA
  __m256 w00 = _MM256_PROD_ADD_PS(b00, _h00, _MM256_PROD_PS(b01, _h01));
  __m256 w01 = _MM256_PROD_ADD_PS(b00, _h10, _MM256_PROD_PS(b01, _h11));
  __m256 w10 = _MM256_PROD_ADD_PS(b10, _h00, _MM256_PROD_PS(b11, _h01));
  __m256 w11 = _MM256_PROD_ADD_PS(b10, _h10, _MM256_PROD_PS(b11, _h11));
#else
  __m256 w00 = _mm256_add_ps(_MM256_PROD_PS(b00, _h00), _MM256_PROD_PS(b01, _h01));
  __m256 w01 = _mm256_add_ps(_MM256_PROD_PS(b00, _h10), _MM256_PROD_PS(b01, _h11));
  __m256 w10 = _mm256_add_ps(_MM256_PROD_PS(b10, _h00), _MM256_PROD_PS(b11, _h01));
  __m256 w11 = _mm256_add_ps(_MM256_PROD_PS(b10, _h10), _MM256_PROD_PS(b11, _h11));
#endif /* LV_HAVE_FMA */

  /* 4. X = W x Y */
#ifdef LV_HAVE_FMA
  *x0 = _MM256_PROD_PS(_MM256_PROD_ADD_PS(y0, w00, _MM256_PROD_PS(y1, w01)), _norm);
  *x1 = _MM256_PROD_PS(_MM256_PROD_ADD_PS(y0, w10, _MM256_PROD_PS(y1, w11)), _norm);
#else
  *x0 = _MM256_PROD_PS(_mm256_add_ps(_MM256_PROD_PS(y0, w00), _MM256_PROD_PS(y1, w01)), _norm);
  *x1 = _MM256_PROD_PS(_mm256_add_ps(_MM256_PROD_PS(y0, w10), _MM256_PROD_PS(y1, w11)), _norm);
#endif /* LV_HAVE_FMA */
}
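
For reference, the four numbered steps are the textbook 2x2 MMSE equalizer (the notation below is assumed, not taken from srsLTE): with channel matrix H and noise estimate N_0,

\[
A = H^{\mathsf H}H + N_0 I,\qquad
A^{-1} = \frac{1}{\det A}\begin{pmatrix} a_{11} & -a_{01}\\ -a_{10} & a_{00}\end{pmatrix},\qquad
\hat{x} = \mathrm{norm}\cdot A^{-1}H^{\mathsf H}\,y .
\]

Step 2 builds the adjugate (b00..b11) and folds the reciprocal of det(A) into _norm, so step 4 needs only multiplications.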
Example #9
File: csr.cpp Project: dsheffie/Toys
void avx2_masked_csr_spmv(float *A, int32_t *nIdx, int32_t **indices, 
			  float *x, int32_t n, float *y)
{
  int32_t A_offset = 0;
  __m256 bitMasks[9];
  unsigned u = 0x80000000; /* sign bit set: the gather honors a lane iff its mask sign bit is 1 */
  float v = *((float*)&u); /* type-pun; memcpy would avoid the strict-aliasing issue */

  bitMasks[0] = _mm256_set_ps(0,0,0,0,0,0,0,0); 
  bitMasks[1] = _mm256_set_ps(0,0,0,0,0,0,0,v); 
  bitMasks[2] = _mm256_set_ps(0,0,0,0,0,0,v,v); 
  bitMasks[3] = _mm256_set_ps(0,0,0,0,0,v,v,v); 
  bitMasks[4] = _mm256_set_ps(0,0,0,0,v,v,v,v); 
  bitMasks[5] = _mm256_set_ps(0,0,0,v,v,v,v,v); 
  bitMasks[6] = _mm256_set_ps(0,0,v,v,v,v,v,v); 
  bitMasks[7] = _mm256_set_ps(0,v,v,v,v,v,v,v); 
  bitMasks[8] = _mm256_set_ps(v,v,v,v,v,v,v,v); 
  
  const __m256 vZeros = _mm256_setzero_ps();

  for(int32_t i = 0; i < n; i++)
    {
      int32_t nElem = nIdx[i]; 
      float t = 0.0f;
      __m256 vT = _mm256_setzero_ps();

      int32_t k = 0; 
      while(k < nElem)
	{
	  int vl = ((k+8) < nElem) ? 8 : (nElem - k);
	  __m256 mask = bitMasks[vl];
	  /* this is padded out */
	  __m256i vIdx = _mm256_load_si256((__m256i*)&(indices[i][k]));
	  __m256 vX = _mm256_mask_i32gather_ps(vZeros,(float const*)x,vIdx,mask,4);

	  __m256 vA = _mm256_loadu_ps(&A[A_offset + k]);
	  vT = _mm256_add_ps(vT, _mm256_mul_ps(vX,vA));
	  k += vl;
	}
      t += sum8(vT);
      y[i] = t;
      A_offset += nElem;
    }
} 
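
sum8 (the horizontal sum of vT) is not shown in this example; a common implementation it could correspond to looks like this (an assumption, not code from dsheffie/Toys):

#include <immintrin.h>

/* Hypothetical sum8: add all 8 float lanes of an __m256. */
static inline float sum8(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);        /* lanes 0..3 */
    __m128 hi = _mm256_extractf128_ps(v, 1);      /* lanes 4..7 */
    __m128 s  = _mm_add_ps(lo, hi);               /* four partial sums */
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));       /* two partial sums */
    s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x1)); /* final sum in lane 0 */
    return _mm_cvtss_f32(s);
}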
Example #10
void static
avx_test (void)
{
  union256 u;
  float e [8] __attribute__ ((aligned (32))) = {0.0};

  u.x = _mm256_set_ps (1.17, 24567.16, 3.15, 4567.14, 5.13, 65467.12, 788.11, 8.9);

  test (e, u.x);

  if (check_union256 (u, e))
    abort ();
}
Example #11
void static
avx_test (void)
{
  union256 u, s1, s2;
  float e [8];

  s1.x = _mm256_set_ps (1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8);
  s2.x = _mm256_set_ps (2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8);
  u.x = _mm256_shuffle_ps (s1.x, s2.x, MASK);


  e[0] = select4(s1.a,   (MASK >> 0) & 0x3);
  e[1] = select4(s1.a,   (MASK >> 2) & 0x3);
  e[2] = select4(s2.a,   (MASK >> 4) & 0x3);
  e[3] = select4(s2.a,   (MASK >> 6) & 0x3);
  e[4] = select4(s1.a+4, (MASK >> 0) & 0x3);
  e[5] = select4(s1.a+4, (MASK >> 2) & 0x3);
  e[6] = select4(s2.a+4, (MASK >> 4) & 0x3);
  e[7] = select4(s2.a+4, (MASK >> 6) & 0x3);

  if (check_union256 (u, e))
    abort ();
}
Example #12
File: AVX.c Project: smfreegard/rspamd
void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
  __m256 YMM0, YMM1, YMM2, YMM3;
  for (i=0; i<=((n)-8); i+=8) {
    YMM0 = _mm256_loadu_ps(y+i);
    YMM1 = _mm256_loadu_ps(x+i);
    YMM2 = _mm256_mul_ps(YMM0, YMM15);
    YMM3 = _mm256_add_ps(YMM1, YMM2);
    _mm256_storeu_ps(z+i, YMM3);
  }
  for (; i<(n); i++) {
    z[i] = x[i] + y[i] * c;
  }
}
Example #13
File: AVX.c Project: smfreegard/rspamd
void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) {
  ptrdiff_t i;
  ptrdiff_t off;
  __m256 YMM0 = _mm256_set_ps(c, c, c, c, c, c, c, c);
  for (i=0; i<=((n)-32); i+=32) {
    _mm256_storeu_ps((x)+i  , YMM0);
    _mm256_storeu_ps((x)+i+8, YMM0);
    _mm256_storeu_ps((x)+i+16, YMM0);
    _mm256_storeu_ps((x)+i+24, YMM0);
  }
  off = (n) - ((n)%32);
  for (i=0; i<((n)%32); i++) {
    x[off+i] = c;
  }
}
Example #14
File: AVX.c Project: smfreegard/rspamd
void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
  __m256 YMM0, YMM1;
  for (i=0; i<=((n)-16); i+=16) {
    YMM0 = _mm256_loadu_ps(x+i);
    YMM1 = _mm256_loadu_ps(x+i+8);
    YMM0 = _mm256_add_ps(YMM0, YMM15);
    YMM1 = _mm256_add_ps(YMM1, YMM15);
    _mm256_storeu_ps(y+i, YMM0);
    _mm256_storeu_ps(y+i+8, YMM1);
  }
  for (; i<(n); i++) {
    y[i] = x[i] + c;
  }
}
Example #15
void static
avx_test (void)
{
  int i;
  union256 u, s1;
  float e[8];

  s1.x = _mm256_set_ps (134.3, 1234.54, 45.335, 646.456, 43.54, 473.34, 78, 89.54);
  u.x = _mm256_movehdup_ps (s1.x);

  for (i = 0; i < 4; i++)
    e[2*i] = e[2*i+1] = s1.a[2*i+1];

  if (check_union256 (u, e))
    abort ();
}
Example #16
void static
avx_test (void)
{
  int i;
  union256 s1;
  union256i_d u;
  int e [8];

  s1.x = _mm256_set_ps (45.64, 4564.56, 2.3, 5.5, 57.57, 89.34, 54.12, 954.67);
  u.x = _mm256_cvttps_epi32 (s1.x);

  for (i = 0; i < 8; i++)
    e[i] = (int)s1.a[i];

  if (check_union256i_d (u, e))
    abort ();
}
Example #17
File: main.cpp Project: sclc/DPP
//-------------------------------------------------------------------
// effect
static void
effect(float *pBlu, float *pGrn, float *pRed, const size_t height, const size_t width)
{
    __m256 zeroPs = _mm256_set_ps(0, 0, 0, 0, 0, 0, 0, 0);
    float *b = pBlu;
    float *r = pRed;

    for (size_t y = 0; y < height; y++)
    {
        for (size_t x = 0; x < width / 8; x++, b += 8, r += 8)
        {
            _mm256_store_ps(b, zeroPs); // B
                                        // G, skip
            _mm256_store_ps(r, zeroPs); // R
        }
    }
}
Example #18
void warmup(float *x, float *y, int size, float alpha)
{
    int i;

    __m256 m = _mm256_set_ps(1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0);
    #pragma ivdep
    #pragma vector aligned
    for (i=0; i<size; i+=4)
    {
        __m256 t = _mm256_load_ps(x+2*i);
        __m256 l = _mm256_mul_ps(t, m); // premultiply: even lanes * 2, odd lanes / alpha
        __m256 r = _mm256_permute2f128_ps( l , l , 1); // swap lower and higher 128 bits
        __m256 res = _mm256_hadd_ps(l, r); // low half now holds the four pairwise sums
        __m128 s = _mm256_extractf128_ps (res, 0);
        _mm_store_ps(y+i,s); // y[i+j] = 2*x[2(i+j)] + x[2(i+j)+1]/alpha
    }
}
Example #19
File: main.cpp Project: CCJY/coliru
matrix_t matrixMulFMA(const matrix_t & matrixA, const matrix_t & matrixB) {
	auto dimension = matrixA.size();

	assert(matrixA.size() == dimension);
	assert(matrixA[0].size() == dimension);
	assert(matrixB.size() == dimension);
	assert(matrixB[0].size() == dimension);

	matrix_t matrixC(dimension, typename matrix_t::value_type(dimension, 0));//0ed matrix

	const int vec = 8;
	int start{0};

	if(dimension > vec) {
		start = dimension - dimension % vec;
		for(int x{0}; x < dimension; ++x)
		for(int i{0}; i < dimension; ++i) {
			const __m256 a = _mm256_set_ps(matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i]);// broadcast of one element (equivalent to _mm256_set1_ps)
			for(int y{0}; y + vec <= dimension; y += vec) {// was `y < dimension - vec`, which skipped the last full block of columns
				//__m256 c = _mm256_set_ps(matrixC[x][y+7], matrixC[x][y+6], matrixC[x][y+5], matrixC[x][y+4], matrixC[x][y+3], matrixC[x][y+2], matrixC[x][y+1], matrixC[x][y+0]);
				//const __m256 b = _mm256_set_ps(matrixB[i][y+7], matrixB[i][y+6], matrixB[i][y+5], matrixB[i][y+4], matrixB[i][y+3], matrixB[i][y+2], matrixB[i][y+1], matrixB[i][y+0]);

				__m256 c = *reinterpret_cast<__m256*>(&matrixC[x][y]);// aligned read (NB: std::vector storage is not guaranteed to be 32-byte aligned)
				const __m256 & b = *reinterpret_cast<const __m256*>(&matrixB[i][y]);// aligned read

				c = _mm256_fmadd_ps(a, b, c);//c = a * b + c;
				//_mm256_store_ps(&matrixC[x][y], c);//aligned
				//_mm256_storeu_ps(&matrixC[x][y], c);//unaligned

				/*
				float c[8];
				c[0] = matrixC[i][y+0];
				c[1] = matrixC[i][y+1];
				c[2] = matrixC[i][y+2];
				c[3] = matrixC[i][y+3];
				c[4] = matrixC[i][y+4];
				c[5] = matrixC[i][y+5];
				c[6] = matrixC[i][y+6];
				c[7] = matrixC[i][y+7];

				c[0] += matrixA[x][i] * matrixB[i][y+0];
				c[1] += matrixA[x][i] * matrixB[i][y+1];
				c[2] += matrixA[x][i] * matrixB[i][y+2];
				c[3] += matrixA[x][i] * matrixB[i][y+3];
				c[4] += matrixA[x][i] * matrixB[i][y+4];
				c[5] += matrixA[x][i] * matrixB[i][y+5];
				c[6] += matrixA[x][i] * matrixB[i][y+6];
				c[7] += matrixA[x][i] * matrixB[i][y+7];
				//*/
				//*
				matrixC[x][y+0] = c[0];
				matrixC[x][y+1] = c[1];
				matrixC[x][y+2] = c[2];
				matrixC[x][y+3] = c[3];
				matrixC[x][y+4] = c[4];
				matrixC[x][y+5] = c[5];
				matrixC[x][y+6] = c[6];
				matrixC[x][y+7] = c[7];
				//*/

				/*is doing this
				matrixC[x][y+0] += matrixA[x][i] * matrixB[i][y+0];
				matrixC[x][y+1] += matrixA[x][i] * matrixB[i][y+1];
				matrixC[x][y+2] += matrixA[x][i] * matrixB[i][y+2];
				matrixC[x][y+3] += matrixA[x][i] * matrixB[i][y+3];
				matrixC[x][y+4] += matrixA[x][i] * matrixB[i][y+4];
				matrixC[x][y+5] += matrixA[x][i] * matrixB[i][y+5];
				matrixC[x][y+6] += matrixA[x][i] * matrixB[i][y+6];
				matrixC[x][y+7] += matrixA[x][i] * matrixB[i][y+7];
				//*/
			}
		}
	}

	//calculate remaining columns
	for(int x{0}; x < dimension; ++x)
	for(int i{0}; i < dimension; ++i)
	for(int y{start}; y < dimension; ++y)
		matrixC[x][y] += matrixA[x][i] * matrixB[i][y];

	return matrixC;//move semantics ftw
}
Example #20
void	TransLut::process_plane_flt_any_avx2 (uint8_t *dst_ptr, const uint8_t *src_ptr, int stride_dst, int stride_src, int w, int h)
{
	assert (dst_ptr != 0);
	assert (src_ptr != 0);
	assert (stride_dst != 0 || h == 1);
	assert (stride_src != 0 || h == 1);
	assert (w > 0);
	assert (h > 0);

	for (int y = 0; y < h; ++y)
	{
		const FloatIntMix *  s_ptr =
			reinterpret_cast <const FloatIntMix *> (src_ptr);
		TD *                 d_ptr =
			reinterpret_cast <               TD *> (dst_ptr);

		for (int x = 0; x < w; x += 8)
		{
			union
			{
				__m256i            _vect;
				uint32_t           _scal [8];
			}                  index;
			__m256             lerp;
			TransLut_FindIndexAvx2 <M>::find_index (s_ptr + x, index._vect, lerp);
#if 1	// Looks as fast as _mm256_set_ps
			// G++ complains about sizeof() as argument
			__m256             val = _mm256_i32gather_ps (
				&_lut.use <float> (0), index._vect, 4  // 4 == sizeof (float)
			);
			const __m256       va2 = _mm256_i32gather_ps (
				&_lut.use <float> (1), index._vect, 4  // 4 == sizeof (float)
			);
#else
			__m256             val = _mm256_set_ps (
				_lut.use <float> (index._scal [7]    ),
				_lut.use <float> (index._scal [6]    ),
				_lut.use <float> (index._scal [5]    ),
				_lut.use <float> (index._scal [4]    ),
				_lut.use <float> (index._scal [3]    ),
				_lut.use <float> (index._scal [2]    ),
				_lut.use <float> (index._scal [1]    ),
				_lut.use <float> (index._scal [0]    )
			);
			const __m256       va2 = _mm256_set_ps (
				_lut.use <float> (index._scal [7] + 1),
				_lut.use <float> (index._scal [6] + 1),
				_lut.use <float> (index._scal [5] + 1),
				_lut.use <float> (index._scal [4] + 1),
				_lut.use <float> (index._scal [3] + 1),
				_lut.use <float> (index._scal [2] + 1),
				_lut.use <float> (index._scal [1] + 1),
				_lut.use <float> (index._scal [0] + 1)
			);
#endif
			const __m256       dif = _mm256_sub_ps (va2, val);
			val = _mm256_add_ps (val, _mm256_mul_ps (dif, lerp));
			TransLut_store_avx2 (&d_ptr [x], val);
		}

		src_ptr += stride_src;
		dst_ptr += stride_dst;
	}

	_mm256_zeroupper ();	// Back to SSE state
}
Example #21
__m256
foo (float *v)
{
  return _mm256_set_ps (v[7], v[6], v[5], v[4],
			v[3], v[2], v[1], v[0]);
}
Example #22
__m256
foo (float x)
{
  return _mm256_set_ps (x, x, x, x, x, x, x, x);
}
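
Both of these one-liners map to simpler idioms that compilers typically emit anyway: eight consecutive floats passed in v[7]..v[0] order reproduce an unaligned load, and eight copies of one scalar are a broadcast. The equivalences as a sketch (the assumption here is the mapping, not these exact test files):

#include <immintrin.h>

__m256 load_eight(const float *v) { return _mm256_loadu_ps(v); } /* same lanes as _mm256_set_ps(v[7], ..., v[0]) */
__m256 broadcast_one(float x)     { return _mm256_set1_ps(x); }  /* same lanes as _mm256_set_ps(x, ..., x) */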
Example #23
void sLine_onMessage(HvBase *_c, SignalLine *o, int letIn,
  const HvMessage * const m, void *sendMessage) {
  if (msg_isFloat(m,0)) {
    if (msg_isFloat(m,1)) {
      // new ramp
      int n = ctx_millisecondsToSamples(_c, msg_getFloat(m,1));
#if HV_SIMD_AVX
      float x = (o->n[1] > 0) ? (o->x[7] + (o->m[7]/8.0f)) : o->t[7]; // current output value
      float s = (msg_getFloat(m,0) - x) / ((float) n); // slope per sample
      o->n = _mm_set_epi32(n-3, n-2, n-1, n);
      o->x = _mm256_set_ps(x+7.0f*s, x+6.0f*s, x+5.0f*s, x+4.0f*s, x+3.0f*s, x+2.0f*s, x+s, x);
      o->m = _mm256_set1_ps(8.0f*s);
      o->t = _mm256_set1_ps(msg_getFloat(m,0));
#elif HV_SIMD_SSE
      float x = (o->n[1] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3];
      float s = (msg_getFloat(m,0) - x) / ((float) n); // slope per sample
      o->n = _mm_set_epi32(n-3, n-2, n-1, n);
      o->x = _mm_set_ps(x+3.0f*s, x+2.0f*s, x+s, x);
      o->m = _mm_set1_ps(4.0f*s);
      o->t = _mm_set1_ps(msg_getFloat(m,0));
#elif HV_SIMD_NEON
      float x = (o->n[3] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3];
      float s = (msg_getFloat(m,0) - x) / ((float) n);
      o->n = (int32x4_t) {n, n-1, n-2, n-3};
      o->x = (float32x4_t) {x, x+s, x+2.0f*s, x+3.0f*s};
      o->m = vdupq_n_f32(4.0f*s);
      o->t = vdupq_n_f32(msg_getFloat(m,0));
#else // HV_SIMD_NONE
      o->x = (o->n > 0) ? (o->x + o->m) : o->t; // new current value
      o->n = n; // new distance to target
      o->m = (msg_getFloat(m,0) - o->x) / ((float) n); // slope per sample
      o->t = msg_getFloat(m,0);
#endif
    } else {
      // Jump to value
#if HV_SIMD_AVX
      o->n = _mm_setzero_si128();
      o->x = _mm256_set1_ps(msg_getFloat(m,0));
      o->m = _mm256_setzero_ps();
      o->t = _mm256_set1_ps(msg_getFloat(m,0));
#elif HV_SIMD_SSE
      o->n = _mm_setzero_si128();
      o->x = _mm_set1_ps(msg_getFloat(m,0));
      o->m = _mm_setzero_ps();
      o->t = _mm_set1_ps(msg_getFloat(m,0));
#elif HV_SIMD_NEON
      o->n = vdupq_n_s32(0);
      o->x = vdupq_n_f32(msg_getFloat(m,0)); // jump to the target value, matching the AVX/SSE branches (was 0.0f)
      o->m = vdupq_n_f32(0.0f);
      o->t = vdupq_n_f32(msg_getFloat(m,0));
#else // HV_SIMD_NONE
      o->n = 0;
      o->x = msg_getFloat(m,0);
      o->m = 0.0f;
      o->t = msg_getFloat(m,0);
#endif
    }
  } else if (msg_compareSymbol(m,0,"stop")) {
    // Stop line at current position
#if HV_SIMD_AVX
    // note o->n[1] is a 64-bit integer; two packed 32-bit ints. We only want to know if the high int is positive,
    // which can be done simply by testing the long int for positiveness.
    float x = (o->n[1] > 0) ? (o->x[7] + (o->m[7]/8.0f)) : o->t[7];
    o->n = _mm_setzero_si128();
    o->x = _mm256_set1_ps(x);
    o->m = _mm256_setzero_ps();
    o->t = _mm256_set1_ps(x);
#elif HV_SIMD_SSE
    float x = (o->n[1] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3];
    o->n = _mm_setzero_si128();
    o->x = _mm_set1_ps(x);
    o->m = _mm_setzero_ps();
    o->t = _mm_set1_ps(x);
#elif HV_SIMD_NEON
    float x = (o->n[3] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3];
    o->n = vdupq_n_s32(0);
    o->x = vdupq_n_f32(x);
    o->m = vdupq_n_f32(0.0f);
    o->t = vdupq_n_f32(x);
#else // HV_SIMD_NONE
    o->n = 0;
    o->x += o->m;
    o->m = 0.0f;
    o->t = o->x;
#endif
  }
}
Example #24
/* Note: the opening #if of this block (a Windows build workaround that passes the
   SIMD arguments by pointer) precedes this excerpt; only the #else/#endif survived. */
void __hv_biquad_f_win32(SignalBiquad *o, hv_bInf_t *_bIn, hv_bInf_t *_bX0, hv_bInf_t *_bX1, hv_bInf_t *_bX2, hv_bInf_t *_bY1, hv_bInf_t *_bY2, hv_bOutf_t bOut) {
  hv_bInf_t bIn = *_bIn;
  hv_bInf_t bX0 = *_bX0;
  hv_bInf_t bX1 = *_bX1;
  hv_bInf_t bX2 = *_bX2;
  hv_bInf_t bY1 = *_bY1;
  hv_bInf_t bY2 = *_bY2;
#else
void __hv_biquad_f(SignalBiquad *o, hv_bInf_t bIn, hv_bInf_t bX0, hv_bInf_t bX1, hv_bInf_t bX2, hv_bInf_t bY1, hv_bInf_t bY2, hv_bOutf_t bOut) {
#endif
#if HV_SIMD_AVX
  __m256 a = _mm256_mul_ps(bIn, bX0);
  __m256 b = _mm256_mul_ps(o->xm1, bX1);
  __m256 c = _mm256_mul_ps(o->xm2, bX2);
  __m256 d = _mm256_add_ps(a, b);
  __m256 e = _mm256_add_ps(c, d); // bIn*bX0 + o->x1*bX1 + o->x2*bX2
  float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0];
  float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1];
  float y2 = e[2] - y1*bY1[2] - y0*bY2[2];
  float y3 = e[3] - y2*bY1[3] - y1*bY2[3];
  float y4 = e[4] - y3*bY1[4] - y2*bY2[4];
  float y5 = e[5] - y4*bY1[5] - y3*bY2[5];
  float y6 = e[6] - y5*bY1[6] - y4*bY2[6];
  float y7 = e[7] - y6*bY1[7] - y5*bY2[7];

  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym1 = y7;
  o->ym2 = y6;

  *bOut = _mm256_set_ps(y7, y6, y5, y4, y3, y2, y1, y0);
#elif HV_SIMD_SSE
  __m128 a = _mm_mul_ps(bIn, bX0);
  __m128 b = _mm_mul_ps(o->xm1, bX1);
  __m128 c = _mm_mul_ps(o->xm2, bX2);
  __m128 d = _mm_add_ps(a, b);
  __m128 e = _mm_add_ps(c, d);

  const float *const bbe = (float *) &e;
  const float *const bbY1 = (float *) &bY1;
  const float *const bbY2 = (float *) &bY2;

  float y0 = bbe[0] - o->ym1*bbY1[0] - o->ym2*bbY2[0];
  float y1 = bbe[1] - y0*bbY1[1] - o->ym1*bbY2[1];
  float y2 = bbe[2] - y1*bbY1[2] - y0*bbY2[2];
  float y3 = bbe[3] - y2*bbY1[3] - y1*bbY2[3];

  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym1 = y3;
  o->ym2 = y2;

  *bOut = _mm_set_ps(y3, y2, y1, y0);
#elif HV_SIMD_NEON
  float32x4_t a = vmulq_f32(bIn, bX0);
  float32x4_t b = vmulq_f32(o->xm1, bX1);
  float32x4_t c = vmulq_f32(o->xm2, bX2);
  float32x4_t d = vaddq_f32(a, b);
  float32x4_t e = vaddq_f32(c, d);
  float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0];
  float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1];
  float y2 = e[2] - y1*bY1[2] - y0*bY2[2];
  float y3 = e[3] - y2*bY1[3] - y1*bY2[3];

  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym1 = y3;
  o->ym2 = y2;

  *bOut = (float32x4_t) {y0, y1, y2, y3};
#else
  const float y = bIn*bX0 + o->xm1*bX1 + o->xm2*bX2 - o->ym1*bY1 - o->ym2*bY2;
  o->xm2 = o->xm1; o->xm1 = bIn;
  o->ym2 = o->ym1; o->ym1 = y;
  *bOut = y;
#endif
}
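
All four branches evaluate the same Direct Form I biquad recurrence (written here with the argument names as the per-sample coefficient vectors):

\[
y[n] = bX0\,x[n] + bX1\,x[n-1] + bX2\,x[n-2] - bY1\,y[n-1] - bY2\,y[n-2] .
\]

The feedforward half vectorizes cleanly, but each y[n] depends on y[n-1] and y[n-2], so the feedback half is resolved as the scalar chain y0..y7 (or y0..y3) before the result is repacked with _mm256_set_ps.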
Example #25
void neuralNet::feedForward_layer(layerIterator_t nLayer)
{
	constFloatIterator_t pActivations, cWeight, endWeight;
	__m256 vTotal, vSub0, vSub1;
	__m256 *vWeight, *vAct, *vEndWeight;

	// summate each neuron's contribution
	for (neuronIterator_t cNeuron = nLayer->begin(), end = nLayer->end(); 
		cNeuron != end; 
		++cNeuron)
	{
		// foreach [previous neuron, current weight], up to endWeight
		pActivations = activations.begin() + (nLayer - 1)->front().iNeuronIndex;
		cWeight = cNeuron->weightsBegin(*this);
		endWeight = cNeuron->weightsEnd(*this);

		// (first 15 neurons) (TODO: redesign preamble and remove assertions for multiple of 16 size widths in neuralNet.h!)

		// summate all neurons of previous layer: (remaining batches of 8 neurons)
		vWeight = (__m256*)&cWeight[0];
		vAct = (__m256*)&pActivations[0];

		vEndWeight = (__m256*)&endWeight[0];

		// initialize the activation of this neuron to its bias weight. The bias weight's neuron is always on:
		vTotal = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, *endWeight); // can this be made with an aligned load?

		do // Take advantage of SIMD instructions by doing 16 multiplies per iteration
		{
			/* 
			 * each neuron's contribution is:
			 * input[j] += weight[i,j] * activation[i]
			 */
			// multiply:
			vSub0 = _mm256_mul_ps(vWeight[0], vAct[0]);
			vSub1 = _mm256_mul_ps(vWeight[1], vAct[1]);

			// prefetch next values: (these don't appear to help, are the networks too small for this to matter?)
			//_mm_prefetch((char*)(vWeight0+4), _MM_HINT_T0);
			//_mm_prefetch((char*)(vAct0+4), _MM_HINT_T0);

			// add to accumulator:
			vTotal = _mm256_add_ps(vTotal, vSub0);
			vTotal = _mm256_add_ps(vTotal, vSub1);

			// increment pointers:
			vWeight += 2;
			vAct += 2;
		}
		while (vWeight != vEndWeight);

		//finalize: (horizontal sum of the accumulator's 8 lanes)
		{
			vTotal = _mm256_hadd_ps(vTotal, vTotal);
			vTotal = _mm256_hadd_ps(vTotal, vTotal);
			__m128 vUpperTotal = _mm256_extractf128_ps(vTotal, 1);
			vUpperTotal = _mm_add_ps(vUpperTotal, _mm256_castps256_ps128(vTotal));

			// store the lowest float into cInput:
			_mm_store_ss(&activations[cNeuron->iNeuronIndex], vUpperTotal);
		}
	}

	// activate all neurons in this layer:
	float* cActivation = (&activations.front() + nLayer->front().iNeuronIndex);
	float* lActivation = (&activations.front() + nLayer->back().iNeuronIndex + 1);
	float* lVectorActivation = lActivation - ((lActivation - cActivation)&(ALIGN_SIZE-1)); // equivalent to mod ALIGN_SIZE

	// aligned activations:
	while (cActivation != lVectorActivation)
	{
		activation_approx_avx(cActivation, cActivation);
		cActivation += ALIGN_SIZE;
	};

	// postscript: (unaligned activations):
	{
		size_t dActivation = (lActivation - cActivation);
		switch(dActivation)
		{
		case 7:
			activation_approx(cActivation+6,cActivation+6);
		case 6:
			activation_approx(cActivation+5,cActivation+5);
		case 5:
			activation_approx(cActivation+4,cActivation+4);
		case 4:
			activation_approx_sse(cActivation+0,cActivation+0);
			break;
		case 3:
			activation_approx(cActivation+2, cActivation+2);
		case 2:
			activation_approx(cActivation+1, cActivation+1);
		case 1:
			activation_approx(cActivation+0, cActivation+0);
		case 0:
			break;
		}
	}
}; // endOf feedForward_layer
void sgdUpdateAvx(float learningRate,
                  float momentum,
                  float decayRate,
                  size_t size,
                  float* value,
                  const float* _grad,
                  float* momentumVec) {
#ifdef __AVX__
  float* grad = const_cast<float*>(_grad);  // the gradient is not modified,
                                            // but the SIMD casts below
                                            // require a non-const pointer.
  size_t gradientAlign = 0;
  size_t gradientAlignHeader = (size_t)grad % sizeof(__m256);
  CHECK_EQ(gradientAlignHeader, (size_t)momentumVec % sizeof(__m256))
      << "Gradient buffer didn't align with momentum buffer";
  CHECK_EQ(gradientAlignHeader, (size_t)value % sizeof(__m256))
      << "Gradient buffer didn't align with value buffer";
  if (0 != gradientAlignHeader) {
    gradientAlignHeader = sizeof(__m256) - gradientAlignHeader;
    gradientAlign = gradientAlignHeader / sizeof(real);

    // handle the unaligned prefix
    for (size_t i = 0; i < gradientAlign; i++) {
      momentumVec[i] = momentum * momentumVec[i] - (learningRate * grad[i]) -
                       (decayRate * learningRate * value[i]);
      value[i] += momentumVec[i];
    }
    grad += gradientAlign;
    momentumVec += gradientAlign;
    value += gradientAlign;
  }

  constexpr size_t kParallelNum = 8;
  constexpr size_t nStepSize = (sizeof(__m256) / sizeof(real)) * kParallelNum;
  size_t cntLoop = (size - gradientAlign) / nStepSize;
  size_t cntRem = (size - gradientAlign) % nStepSize;
  __m256 gradientTmp[kParallelNum];
  __m256 valueTmp[kParallelNum];
  __m256 lr, mom, dr;
  std::function<void(void)> loopFun;

  learningRate *= -1;
  lr = _mm256_set_ps(learningRate,
                     learningRate,
                     learningRate,
                     learningRate,
                     learningRate,
                     learningRate,
                     learningRate,
                     learningRate);

  if (0 != momentum) {
    mom = _mm256_set_ps(momentum,
                        momentum,
                        momentum,
                        momentum,
                        momentum,
                        momentum,
                        momentum,
                        momentum);
  }

  decayRate *= learningRate;
  if (0 != decayRate) {
    dr = _mm256_set_ps(decayRate,
                       decayRate,
                       decayRate,
                       decayRate,
                       decayRate,
                       decayRate,
                       decayRate,
                       decayRate);
  }

  auto gradMulFun = [&](void) {
    gradientTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad), lr);
    gradientTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 8), lr);
    gradientTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 16), lr);
    gradientTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 24), lr);
    gradientTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 32), lr);
    gradientTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 40), lr);
    gradientTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 48), lr);
    gradientTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 56), lr);
  };

  auto valueMulFun = [&](void) {
    valueTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value), dr);
    valueTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 8), dr);
    valueTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 16), dr);
    valueTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 24), dr);
    valueTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 32), dr);
    valueTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 40), dr);
    valueTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 48), dr);
    valueTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 56), dr);
  };

  auto momentumMulFun = [&](void) {
    *reinterpret_cast<__m256*>(momentumVec) =
        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec), mom);
    *reinterpret_cast<__m256*>(momentumVec + 8) =
        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 8), mom);
    *reinterpret_cast<__m256*>(momentumVec + 16) =
        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 16), mom);
    *reinterpret_cast<__m256*>(momentumVec + 24) =
        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 24), mom);
    *reinterpret_cast<__m256*>(momentumVec + 32) =
        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 32), mom);
    *reinterpret_cast<__m256*>(momentumVec + 40) =
        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 40), mom);
    *reinterpret_cast<__m256*>(momentumVec + 48) =
        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 48), mom);
    *reinterpret_cast<__m256*>(momentumVec + 56) =
        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 56), mom);
  };

  auto momentumAddGradFun = [&](void) {
    *reinterpret_cast<__m256*>(momentumVec) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), gradientTmp[0]);
    *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 8), gradientTmp[1]);
    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 16), gradientTmp[2]);
    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 24), gradientTmp[3]);
    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 32), gradientTmp[4]);
    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 40), gradientTmp[5]);
    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 48), gradientTmp[6]);
    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 56), gradientTmp[7]);
  };

  auto momentumZeroFun = [&](void) {
    *reinterpret_cast<__m256*>(momentumVec) = gradientTmp[0];
    *reinterpret_cast<__m256*>(momentumVec + 8) = gradientTmp[1];
    *reinterpret_cast<__m256*>(momentumVec + 16) = gradientTmp[2];
    *reinterpret_cast<__m256*>(momentumVec + 24) = gradientTmp[3];
    *reinterpret_cast<__m256*>(momentumVec + 32) = gradientTmp[4];
    *reinterpret_cast<__m256*>(momentumVec + 40) = gradientTmp[5];
    *reinterpret_cast<__m256*>(momentumVec + 48) = gradientTmp[6];
    *reinterpret_cast<__m256*>(momentumVec + 56) = gradientTmp[7];
  };

  auto momentumAddValueFun = [&](void) {
    *reinterpret_cast<__m256*>(momentumVec) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), valueTmp[0]);
    *reinterpret_cast<__m256*>(momentumVec + 8) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec + 8), valueTmp[1]);
    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 16), valueTmp[2]);
    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 24), valueTmp[3]);
    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 32), valueTmp[4]);
    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 40), valueTmp[5]);
    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 48), valueTmp[6]);
    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
        *reinterpret_cast<__m256*>(momentumVec + 56), valueTmp[7]);
  };

  auto valueAddMomentumFun = [&](void) {
    *reinterpret_cast<__m256*>(value) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(value),
                      *reinterpret_cast<__m256*>(momentumVec));
    *reinterpret_cast<__m256*>(value + 8) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 8),
                      *reinterpret_cast<__m256*>(momentumVec + 8));
    *reinterpret_cast<__m256*>(value + 16) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 16),
                      *reinterpret_cast<__m256*>(momentumVec + 16));
    *reinterpret_cast<__m256*>(value + 24) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 24),
                      *reinterpret_cast<__m256*>(momentumVec + 24));
    *reinterpret_cast<__m256*>(value + 32) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 32),
                      *reinterpret_cast<__m256*>(momentumVec + 32));
    *reinterpret_cast<__m256*>(value + 40) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 40),
                      *reinterpret_cast<__m256*>(momentumVec + 40));
    *reinterpret_cast<__m256*>(value + 48) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 48),
                      *reinterpret_cast<__m256*>(momentumVec + 48));
    *reinterpret_cast<__m256*>(value + 56) =
        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 56),
                      *reinterpret_cast<__m256*>(momentumVec + 56));
  };

  if (0 == decayRate && 0 == momentum) {
    loopFun = [&](void) {
      gradMulFun();
      momentumZeroFun();
      valueAddMomentumFun();
    };
  } else if (0 == decayRate && 0 != momentum) {
    loopFun = [&](void) {
      gradMulFun();
      momentumMulFun();
      momentumAddGradFun();
      valueAddMomentumFun();
    };
  } else if (0 != decayRate && 0 == momentum) {
    loopFun = [&](void) {
      gradMulFun();
      valueMulFun();
      momentumZeroFun();
      momentumAddValueFun();
      valueAddMomentumFun();
    };
  } else if (0 != decayRate && 0 != momentum) {
    loopFun = [&](void) {
      gradMulFun();
      valueMulFun();
      momentumMulFun();
      momentumAddGradFun();
      momentumAddValueFun();
      valueAddMomentumFun();
    };
  }

  for (size_t i = 0; i < cntLoop; i++) {
    loopFun();
    grad += nStepSize;
    momentumVec += nStepSize;
    value += nStepSize;
  }

  for (size_t i = 0; i < cntRem; i++) {
    momentumVec[i] = momentum * momentumVec[i] + (learningRate * grad[i]) +
                     (decayRate * value[i]);
    value[i] += momentumVec[i];
  }
#endif
}
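
The lambda compositions implement the usual SGD update with momentum mu, learning rate eta, and weight decay lambda; the code pre-negates eta and pre-multiplies the decay by it, which is why the vectorized path consists only of multiplies and adds. Symbolically (notation assumed, with g the gradient, v the momentum vector, and w the value):

\[
v \leftarrow \mu v - \eta\,g - \eta\lambda\,w, \qquad w \leftarrow w + v .
\]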