void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float *accI) { __m256 pix = _mm256_set_ps(posI[7].x, posI[6].x, posI[5].x, posI[4].x, posI[3].x, posI[2].x, posI[1].x, posI[0].x); __m256 piy = _mm256_set_ps(posI[7].y, posI[6].y, posI[5].y, posI[4].y, posI[3].y, posI[2].y, posI[1].y, posI[0].y); __m256 piz = _mm256_set_ps(posI[7].z, posI[6].z, posI[5].z, posI[4].z, posI[3].z, posI[2].z, posI[1].z, posI[0].z); __m256 pjx = _mm256_set1_ps(posJ.x); __m256 pjy = _mm256_set1_ps(posJ.y); __m256 pjz = _mm256_set1_ps(posJ.z); __m256 rx = _mm256_sub_ps(pjx, pix); __m256 ry = _mm256_sub_ps(pjy, piy); __m256 rz = _mm256_sub_ps(pjz, piz); __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2); __m256 rx2 = _mm256_mul_ps(rx, rx); __m256 ry2 = _mm256_mul_ps(ry, ry); __m256 rz2 = _mm256_mul_ps(rz, rz); __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2))); __m256 m = _mm256_set1_ps(massJ); __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs)); __m256 aix = _mm256_mul_ps(rx, rabsInv); __m256 aiy = _mm256_mul_ps(ry, rabsInv); __m256 aiz = _mm256_mul_ps(rz, rabsInv); _mm256_store_ps(accI, aix); _mm256_store_ps(accI + 8, aiy); _mm256_store_ps(accI + 16, aiz); }
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8]) { __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x); __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y); __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z); __m256 pjx = _mm256_set1_ps(posJ.x); __m256 pjy = _mm256_set1_ps(posJ.y); __m256 pjz = _mm256_set1_ps(posJ.z); __m256 rx = _mm256_sub_ps(pjx, pix); __m256 ry = _mm256_sub_ps(pjy, piy); __m256 rz = _mm256_sub_ps(pjz, piz); __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2); __m256 rx2 = _mm256_mul_ps(rx, rx); __m256 ry2 = _mm256_mul_ps(ry, ry); __m256 rz2 = _mm256_mul_ps(rz, rz); __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2))); __m256 m = _mm256_set1_ps(massJ); __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs)); __m256 aix = _mm256_mul_ps(rx, rabsInv); __m256 aiy = _mm256_mul_ps(ry, rabsInv); __m256 aiz = _mm256_mul_ps(rz, rabsInv); for (int i = 0; i < 8; i++) { accI[7 - i].x = aix.m256_f32[i]; accI[7 - i].y = aiy.m256_f32[i]; accI[7 - i].z = aiz.m256_f32[i]; } }
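Both overloads above evaluate the same softened-gravity formula per lane: r = posJ - posI, |r| = sqrt(r.r + EPS2), acc = massJ * r / |r|^3, where EPS2 is the softening constant read from mp_properties. A scalar reference for a single i-body, shown only as a sketch for checking the lane bookkeeping (not part of the original class):

#include <math.h>

/* Scalar reference: the same softened-gravity update the AVX lanes compute. */
static void accel_scalar(const float pi[3], float massJ, const float pj[3],
                         float eps2, float acc[3]) {
    float rx = pj[0] - pi[0];
    float ry = pj[1] - pi[1];
    float rz = pj[2] - pi[2];
    float rabs = sqrtf(rx * rx + ry * ry + rz * rz + eps2);
    float s = massJ / (rabs * rabs * rabs);   /* massJ / |r|^3 */
    acc[0] = rx * s;
    acc[1] = ry * s;
    acc[2] = rz * s;
}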
double compute_pi_leibniz_avx_opt_single(size_t n) { double pi = 0.0; register __m256 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8; register __m256 ymm9, ymm10, ymm11, ymm12, ymm13; ymm0 = _mm256_set_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); ymm1 = _mm256_set_ps(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0); ymm2 = _mm256_set_ps(17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0, 31.0); ymm3 = _mm256_set_ps(33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0); ymm4 = _mm256_set_ps(49.0, 51.0, 53.0, 55.0, 57.0, 59.0, 61.0, 63.0); ymm13 = _mm256_set1_ps(64.0); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); for (int i = 0; i <= n - 32; i += 32) { ymm9 = _mm256_div_ps(ymm0, ymm1); ymm1 = _mm256_add_ps(ymm1, ymm13); ymm10 = _mm256_div_ps(ymm0, ymm2); ymm2 = _mm256_add_ps(ymm2, ymm13); ymm11 = _mm256_div_ps(ymm0, ymm3); ymm3 = _mm256_add_ps(ymm3, ymm13); ymm12 = _mm256_div_ps(ymm0, ymm4); ymm4 = _mm256_add_ps(ymm4, ymm13); ymm5 = _mm256_add_ps(ymm5, ymm9); ymm6 = _mm256_add_ps(ymm6, ymm10); ymm7 = _mm256_add_ps(ymm7, ymm11); ymm8 = _mm256_add_ps(ymm8, ymm12); } float tmp[8] __attribute__((aligned(32))); _mm256_store_ps(tmp, ymm5); pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \ tmp[4] + tmp[5] + tmp[6] + tmp[7]; _mm256_store_ps(tmp, ymm6); pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \ tmp[4] + tmp[5] + tmp[6] + tmp[7]; _mm256_store_ps(tmp, ymm7); pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \ tmp[4] + tmp[5] + tmp[6] + tmp[7]; _mm256_store_ps(tmp, ymm8); pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + \ tmp[4] + tmp[5] + tmp[6] + tmp[7]; return pi * 4.0; }
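The routine above evaluates the Leibniz series pi/4 = 1 - 1/3 + 1/5 - 1/7 + ..., consuming 32 terms per loop iteration across four independent accumulators (ymm5..ymm8) before a final horizontal reduction. A plain scalar version of the same series, as a reference for what the vector code computes:

#include <stddef.h>

/* Scalar Leibniz reference: sum n terms of (-1)^k / (2k + 1), times 4. */
double compute_pi_leibniz_scalar(size_t n) {
    double pi = 0.0;
    double sign = 1.0;
    for (size_t k = 0; k < n; k++) {
        pi += sign / (2.0 * (double)k + 1.0);
        sign = -sign;
    }
    return pi * 4.0;
}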
static void quantize_block(const float *in_data, float *out_data, float *quant_tbl) { int zigzag; __m256 result, dct_values, quant_values; __m256 factor = _mm256_set1_ps(0.25f); for (zigzag = 0; zigzag < 64; zigzag += 8) { // Set the dct_values for the current iteration dct_values = _mm256_set_ps(in_data[UV_indexes[zigzag + 7]], in_data[UV_indexes[zigzag + 6]], in_data[UV_indexes[zigzag + 5]], in_data[UV_indexes[zigzag + 4]], in_data[UV_indexes[zigzag + 3]], in_data[UV_indexes[zigzag + 2]], in_data[UV_indexes[zigzag + 1]], in_data[UV_indexes[zigzag]]); // Multiply with 0.25 to divide by 4.0 result = _mm256_mul_ps(dct_values, factor); // Load quant-values and divide the previous product by them quant_values = _mm256_load_ps(quant_tbl + zigzag); result = _mm256_div_ps(result, quant_values); // Round off values and store in out_data buffer result = c63_mm256_roundhalfawayfromzero_ps(result); _mm256_store_ps(out_data + zigzag, result); } }
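quantize_block calls c63_mm256_roundhalfawayfromzero_ps, whose definition is not part of this snippet. A plausible stand-in (an assumption, not the codec's actual helper) adds 0.5 with the sign copied from the input and then truncates toward zero, which is exactly round-half-away-from-zero:

#include <immintrin.h>

/* Hypothetical stand-in for the rounding helper used above. */
static inline __m256 c63_mm256_roundhalfawayfromzero_ps(__m256 v) {
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    /* copysign(0.5f, v) per lane */
    __m256 half = _mm256_or_ps(_mm256_set1_ps(0.5f), _mm256_and_ps(v, sign_mask));
    /* add the biased half, then round toward zero */
    return _mm256_round_ps(_mm256_add_ps(v, half),
                           _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}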
void mandel_avx(unsigned char *image, const struct spec *s) { __m256 xmin = _mm256_set1_ps(s->xlim[0]); __m256 ymin = _mm256_set1_ps(s->ylim[0]); __m256 xscale = _mm256_set1_ps((s->xlim[1] - s->xlim[0]) / s->width); __m256 yscale = _mm256_set1_ps((s->ylim[1] - s->ylim[0]) / s->height); __m256 threshold = _mm256_set1_ps(4); __m256 one = _mm256_set1_ps(1); __m256 iter_scale = _mm256_set1_ps(1.0f / s->iterations); __m256 depth_scale = _mm256_set1_ps(s->depth - 1); #pragma omp parallel for schedule(dynamic, 1) for (int y = 0; y < s->height; y++) { for (int x = 0; x < s->width; x += 8) { __m256 mx = _mm256_set_ps(x + 7, x + 6, x + 5, x + 4, x + 3, x + 2, x + 1, x + 0); __m256 my = _mm256_set1_ps(y); __m256 cr = _mm256_add_ps(_mm256_mul_ps(mx, xscale), xmin); __m256 ci = _mm256_add_ps(_mm256_mul_ps(my, yscale), ymin); __m256 zr = cr; __m256 zi = ci; int k = 1; __m256 mk = _mm256_set1_ps(k); while (++k < s->iterations) { /* Compute z1 from z0 */ __m256 zr2 = _mm256_mul_ps(zr, zr); __m256 zi2 = _mm256_mul_ps(zi, zi); __m256 zrzi = _mm256_mul_ps(zr, zi); /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */ /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */ zr = _mm256_add_ps(_mm256_sub_ps(zr2, zi2), cr); zi = _mm256_add_ps(_mm256_add_ps(zrzi, zrzi), ci); /* Increment k */ zr2 = _mm256_mul_ps(zr, zr); zi2 = _mm256_mul_ps(zi, zi); __m256 mag2 = _mm256_add_ps(zr2, zi2); __m256 mask = _mm256_cmp_ps(mag2, threshold, _CMP_LT_OS); mk = _mm256_add_ps(_mm256_and_ps(mask, one), mk); /* Early bailout? */ if (_mm256_testz_ps(mask, _mm256_set1_ps(-1))) break; } mk = _mm256_mul_ps(mk, iter_scale); mk = _mm256_sqrt_ps(mk); mk = _mm256_mul_ps(mk, depth_scale); __m256i pixels = _mm256_cvtps_epi32(mk); unsigned char *dst = image + y * s->width * 3 + x * 3; unsigned char *src = (unsigned char *)&pixels; for (int i = 0; i < 8; i++) { dst[i * 3 + 0] = src[i * 4]; dst[i * 3 + 1] = src[i * 4]; dst[i * 3 + 2] = src[i * 4]; } } } }
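mandel_avx only touches a handful of fields of struct spec. A minimal layout consistent with that usage (hypothetical; the original header is not shown here) would be:

/* Hypothetical definition inferred from the fields mandel_avx reads. */
struct spec {
    float xlim[2];   /* real-axis range */
    float ylim[2];   /* imaginary-axis range */
    int width;       /* image width in pixels (a multiple of 8 here) */
    int height;      /* image height in pixels */
    int iterations;  /* maximum iteration count */
    int depth;       /* output colour depth */
};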
void static avx_test (void) { int i; union256 u, s1, s2; float e[8]; s1.x = _mm256_set_ps (24.43, 68.346, 43.35, 546.46, 46.79, 82.78, 82.7, 9.4); s2.x = _mm256_set_ps (1.17, 2.16, 3.15, 4.14, 5.13, 6.12, 7.11, 8.9); u.x = _mm256_div_ps (s1.x, s2.x); for (i = 0; i < 8; i++) e[i] = s1.a[i] / s2.a[i]; if (check_union256 (u, e)) abort (); }
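This and the later avx_test snippets rely on the gcc-testsuite harness types union256 / union256i_d and the check_union256* comparison helpers, none of which are shown here. Minimal stand-ins for the float case, assuming an exact lane-by-lane comparison (the integer variants follow the same pattern):

#include <immintrin.h>

/* Stand-ins for the test-harness helpers; not the testsuite's exact code. */
typedef union {
    __m256 x;
    float  a[8];
} union256;

static int check_union256(union256 u, const float *e) {
    for (int i = 0; i < 8; i++)
        if (u.a[i] != e[i])
            return 1;   /* nonzero on mismatch, so the caller can abort() */
    return 0;
}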
void NBodyAlgorithmCPU::calculateAccelerationWithColor(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8], unsigned int(&numNeighbours)[8]) { __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x); __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y); __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z); __m256 pjx = _mm256_set1_ps(posJ.x); __m256 pjy = _mm256_set1_ps(posJ.y); __m256 pjz = _mm256_set1_ps(posJ.z); __m256 rx = _mm256_sub_ps(pjx, pix); __m256 ry = _mm256_sub_ps(pjy, piy); __m256 rz = _mm256_sub_ps(pjz, piz); __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2); __m256 rx2 = _mm256_mul_ps(rx, rx); __m256 ry2 = _mm256_mul_ps(ry, ry); __m256 rz2 = _mm256_mul_ps(rz, rz); __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2))); __m256 cmpDistance = _mm256_set1_ps(float(mp_properties->positionScale)); __m256 close = _mm256_cmp_ps(rabs, cmpDistance, 2); for (int i = 0; i < 8; i++) { if (close.m256_f32[i] == 0) { numNeighbours[7 - i] = 0; } } __m256 m = _mm256_set1_ps(massJ); __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs)); __m256 aix = _mm256_mul_ps(rx, rabsInv); __m256 aiy = _mm256_mul_ps(ry, rabsInv); __m256 aiz = _mm256_mul_ps(rz, rabsInv); for (int i = 0; i < 8; i++) { accI[7 - i].x = aix.m256_f32[i]; accI[7 - i].y = aiy.m256_f32[i]; accI[7 - i].z = aiz.m256_f32[i]; } }
/* AVX implementation for Minimum Mean Squared Error (MMSE) solver */ inline void srslte_mat_2x2_mmse_avx(__m256 y0, __m256 y1, __m256 h00, __m256 h01, __m256 h10, __m256 h11, __m256 *x0, __m256 *x1, float noise_estimate, float norm) { __m256 _noise_estimate = _mm256_set_ps(0.0f, noise_estimate, 0.0f, noise_estimate, 0.0f, noise_estimate, 0.0f, noise_estimate); __m256 _norm = _mm256_set1_ps(norm); /* Create conjugated matrix */ __m256 _h00 = _MM256_CONJ_PS(h00); __m256 _h01 = _MM256_CONJ_PS(h01); __m256 _h10 = _MM256_CONJ_PS(h10); __m256 _h11 = _MM256_CONJ_PS(h11); /* 1. A = H' x H + No*/ #ifdef LV_HAVE_FMA __m256 a00 = _MM256_SQMOD_ADD_PS(h00, h10, _noise_estimate); __m256 a01 = _MM256_PROD_ADD_PS(_h00, h01, _MM256_PROD_PS(_h10, h11)); __m256 a10 = _MM256_PROD_ADD_PS(_h01, h00, _MM256_PROD_PS(_h11, h10)); __m256 a11 = _MM256_SQMOD_ADD_PS(h01, h11, _noise_estimate); #else __m256 a00 = _mm256_add_ps(_MM256_SQMOD_PS(h00, h10), _noise_estimate); __m256 a01 = _mm256_add_ps(_MM256_PROD_PS(_h00, h01), _MM256_PROD_PS(_h10, h11)); __m256 a10 = _mm256_add_ps(_MM256_PROD_PS(_h01, h00), _MM256_PROD_PS(_h11, h10)); __m256 a11 = _mm256_add_ps(_MM256_SQMOD_PS(h01, h11), _noise_estimate); #endif /* LV_HAVE_FMA */ /* 2. B = inv(H' x H + No) = inv(A) */ __m256 b00 = a11; __m256 b01 = _mm256_xor_ps(a01, _mm256_set1_ps(-0.0f)); __m256 b10 = _mm256_xor_ps(a10, _mm256_set1_ps(-0.0f)); __m256 b11 = a00; _norm = _mm256_mul_ps(_norm, srslte_mat_cf_recip_avx(srslte_mat_2x2_det_avx(a00, a01, a10, a11))); /* 3. W = inv(H' x H + No) x H' = B x H' */ #ifdef LV_HAVE_FMA __m256 w00 = _MM256_PROD_ADD_PS(b00, _h00, _MM256_PROD_PS(b01, _h01)); __m256 w01 = _MM256_PROD_ADD_PS(b00, _h10, _MM256_PROD_PS(b01, _h11)); __m256 w10 = _MM256_PROD_ADD_PS(b10, _h00, _MM256_PROD_PS(b11, _h01)); __m256 w11 = _MM256_PROD_ADD_PS(b10, _h10, _MM256_PROD_PS(b11, _h11)); #else __m256 w00 = _mm256_add_ps(_MM256_PROD_PS(b00, _h00), _MM256_PROD_PS(b01, _h01)); __m256 w01 = _mm256_add_ps(_MM256_PROD_PS(b00, _h10), _MM256_PROD_PS(b01, _h11)); __m256 w10 = _mm256_add_ps(_MM256_PROD_PS(b10, _h00), _MM256_PROD_PS(b11, _h01)); __m256 w11 = _mm256_add_ps(_MM256_PROD_PS(b10, _h10), _MM256_PROD_PS(b11, _h11)); #endif /* LV_HAVE_FMA */ /* 4. X = W x Y */ #ifdef LV_HAVE_FMA *x0 = _MM256_PROD_PS(_MM256_PROD_ADD_PS(y0, w00, _MM256_PROD_PS(y1, w01)), _norm); *x1 = _MM256_PROD_PS(_MM256_PROD_ADD_PS(y0, w10, _MM256_PROD_PS(y1, w11)), _norm); #else *x0 = _MM256_PROD_PS(_mm256_add_ps(_MM256_PROD_PS(y0, w00), _MM256_PROD_PS(y1, w01)), _norm); *x1 = _MM256_PROD_PS(_mm256_add_ps(_MM256_PROD_PS(y0, w10), _MM256_PROD_PS(y1, w11)), _norm); #endif /* LV_HAVE_FMA */ }
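The MMSE solver above works on vectors of four interleaved (re, im) pairs and leans on complex-arithmetic helper macros (_MM256_CONJ_PS, _MM256_PROD_PS, _MM256_SQMOD_PS, ...). Plausible definitions for the two simplest ones are sketched below; they match the lane layout implied by the noise vector (real parts in even lanes) but are not necessarily srsLTE's exact macros:

#include <immintrin.h>

/* Conjugate: flip the sign of the imaginary (odd) lanes. */
#define _MM256_CONJ_PS(a) \
    _mm256_xor_ps(a, _mm256_set_ps(-0.0f, 0.0f, -0.0f, 0.0f, \
                                   -0.0f, 0.0f, -0.0f, 0.0f))

/* Element-wise complex product: (ar*br - ai*bi) + j(ar*bi + ai*br). */
#define _MM256_PROD_PS(a, b)                                        \
    _mm256_addsub_ps(_mm256_mul_ps(a, _mm256_moveldup_ps(b)),       \
                     _mm256_mul_ps(_mm256_shuffle_ps(a, a, 0xB1),   \
                                   _mm256_movehdup_ps(b)))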
void avx2_masked_csr_spmv(float *A, int32_t *nIdx, int32_t **indices, float *x, int32_t n, float *y) { int32_t A_offset = 0; __m256 bitMasks[9]; unsigned u = 0x80000000; float v = *((float*)&u); bitMasks[0] = _mm256_set_ps(0,0,0,0,0,0,0,0); bitMasks[1] = _mm256_set_ps(0,0,0,0,0,0,0,v); bitMasks[2] = _mm256_set_ps(0,0,0,0,0,0,v,v); bitMasks[3] = _mm256_set_ps(0,0,0,0,0,v,v,v); bitMasks[4] = _mm256_set_ps(0,0,0,0,v,v,v,v); bitMasks[5] = _mm256_set_ps(0,0,0,v,v,v,v,v); bitMasks[6] = _mm256_set_ps(0,0,v,v,v,v,v,v); bitMasks[7] = _mm256_set_ps(0,v,v,v,v,v,v,v); bitMasks[8] = _mm256_set_ps(v,v,v,v,v,v,v,v); const __m256 vZeros = _mm256_setzero_ps(); for(int32_t i = 0; i < n; i++) { int32_t nElem = nIdx[i]; float t = 0.0f; __m256 vT = _mm256_setzero_ps(); int32_t k = 0; while(k < nElem) { int vl = ((k+8) < nElem) ? 8 : (nElem - k); __m256 mask = bitMasks[vl]; /* this is padded out */ __m256i vIdx = _mm256_load_si256((__m256i*)&(indices[i][k])); __m256 vX = _mm256_mask_i32gather_ps(vZeros,(float const*)x,vIdx,mask,4); __m256 vA = _mm256_loadu_ps(&A[A_offset + k]); vT = _mm256_add_ps(vT, _mm256_mul_ps(vX,vA)); k += vl; } t += sum8(vT); y[i] = t; A_offset += nElem; } }
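The SpMV kernel above finishes each row with sum8(), which is not shown. A minimal horizontal-sum sketch, assuming sum8 simply reduces the eight lanes of an __m256 to a single float:

#include <immintrin.h>

/* Reduce the eight float lanes of v to one scalar sum. */
static inline float sum8(__m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);     /* lanes 0..3 */
    __m128 hi = _mm256_extractf128_ps(v, 1);   /* lanes 4..7 */
    __m128 s  = _mm_add_ps(lo, hi);
    s = _mm_hadd_ps(s, s);                     /* pairwise sums */
    s = _mm_hadd_ps(s, s);                     /* total in lane 0 */
    return _mm_cvtss_f32(s);
}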
void static avx_test (void) { union256 u; float e [8] __attribute__ ((aligned (32))) = {0.0}; u.x = _mm256_set_ps (1.17, 24567.16, 3.15, 4567.14, 5.13, 65467.12, 788.11, 8.9); test (e, u.x); if (check_union256 (u, e)) abort (); }
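The test() helper this snippet exercises is defined elsewhere in the testsuite file; since the check afterwards expects e to equal the lanes of u, it presumably just stores the vector into e. A hypothetical stand-in (the real test may use an aligned store instead):

/* Hypothetical stand-in: store the vector into e. */
static void test(float *e, __m256 a) {
    _mm256_storeu_ps(e, a);
}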
void static avx_test (void) { union256 u, s1, s2; float e [8]; s1.x = _mm256_set_ps (1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8); s2.x = _mm256_set_ps (2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8); u.x = _mm256_shuffle_ps (s1.x, s2.x, MASK); e[0] = select4(s1.a, (MASK >> 0) & 0x3); e[1] = select4(s1.a, (MASK >> 2) & 0x3); e[2] = select4(s2.a, (MASK >> 4) & 0x3); e[3] = select4(s2.a, (MASK >> 6) & 0x3); e[4] = select4(s1.a+4, (MASK >> 0) & 0x3); e[5] = select4(s1.a+4, (MASK >> 2) & 0x3); e[6] = select4(s2.a+4, (MASK >> 4) & 0x3); e[7] = select4(s2.a+4, (MASK >> 6) & 0x3); if (check_union256 (u, e)) abort (); }
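select4 is another small testsuite helper: it picks one of four consecutive floats according to a two-bit field of MASK, mirroring the per-element behaviour of vshufps. A minimal equivalent:

/* Pick src[control & 3]; mirrors one 2-bit field of the shuffle immediate. */
static float select4(const float *src, unsigned int control) {
    return src[control & 0x3];
}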
void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-8); i+=8) { YMM0 = _mm256_loadu_ps(y+i); YMM1 = _mm256_loadu_ps(x+i); YMM2 = _mm256_mul_ps(YMM0, YMM15); YMM3 = _mm256_add_ps(YMM1, YMM2); _mm256_storeu_ps(z+i, YMM3); } for (; i<(n); i++) { z[i] = x[i] + y[i] * c; } }
void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) { ptrdiff_t i; ptrdiff_t off; __m256 YMM0 = _mm256_set_ps(c, c, c, c, c, c, c, c); for (i=0; i<=((n)-32); i+=32) { _mm256_storeu_ps((x)+i , YMM0); _mm256_storeu_ps((x)+i+8, YMM0); _mm256_storeu_ps((x)+i+16, YMM0); _mm256_storeu_ps((x)+i+24, YMM0); } off = (n) - ((n)%32); for (i=0; i<((n)%32); i++) { x[off+i] = c; } }
void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1; for (i=0; i<=((n)-16); i+=16) { YMM0 = _mm256_loadu_ps(x+i); YMM1 = _mm256_loadu_ps(x+i+8); YMM0 = _mm256_add_ps(YMM0, YMM15); YMM1 = _mm256_add_ps(YMM1, YMM15); _mm256_storeu_ps(y+i, YMM0); _mm256_storeu_ps(y+i+8, YMM1); } for (; i<(n); i++) { y[i] = x[i] + c; } }
void static avx_test (void) { int i; union256 u, s1; float e[8]; s1.x = _mm256_set_ps (134.3, 1234.54, 45.335, 646.456, 43.54, 473.34, 78, 89.54); u.x = _mm256_movehdup_ps (s1.x); for (i = 0; i < 4; i++) e[2*i] = e[2*i+1] = s1.a[2*i+1]; if (check_union256 (u, e)) abort (); }
void static avx_test (void) { int i; union256 s1; union256i_d u; int e [8]; s1.x = _mm256_set_ps (45.64, 4564.56, 2.3, 5.5, 57.57, 89.34, 54.12, 954.67); u.x = _mm256_cvttps_epi32 (s1.x); for (i = 0; i < 8; i++) e[i] = (int)s1.a[i]; if (check_union256i_d (u, e)) abort (); }
//------------------------------------------------------------------- // effect static void effect(float *pBlu, float *pGrn, float *pRed, const size_t height, const size_t width) { __m256 zeroPs = _mm256_set_ps(0, 0, 0, 0, 0, 0, 0, 0); float *b = pBlu; float *r = pRed; for (size_t y = 0; y < height; y++) { for (size_t x = 0; x < width / 8; x++, b += 8, r += 8) { _mm256_store_ps(b, zeroPs); // B // G, skip _mm256_store_ps(r, zeroPs); // R } } }
void warmup(float *x, float *y, int size, float alpha) { #pragma ivdep int i; __m256 m = _mm256_set_ps(1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0); #pragma vector aligned for (i=0; i<size; i+=4) { __m256 t = _mm256_load_ps(x+2*i); __m256 l = _mm256_mul_ps(t, m); // premultiply __m256 r = _mm256_permute2f128_ps( l , l , 1); // swap lower and higher 128 bits __m256 res = _mm256_hadd_ps(l, r); __m128 s = _mm256_extractf128_ps (res, 0); _mm_store_ps(y+i,s); // store it } }
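After the premultiply, 128-bit swap, and horizontal add above, each output lane ends up as 2*x[2i] + x[2i+1]/alpha. A scalar equivalent, assuming the same interleaved layout of x and that size is a multiple of 4, as the vector loop requires:

/* Scalar equivalent of warmup(): y[i] = 2*x[2i] + x[2i+1]/alpha. */
void warmup_scalar(float *x, float *y, int size, float alpha) {
    for (int i = 0; i < size; i++) {
        y[i] = 2.0f * x[2 * i] + x[2 * i + 1] / alpha;
    }
}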
matrix_t matrixMulFMA(const matrix_t & matrixA, const matrix_t & matrixB) { auto dimension = matrixA.size(); assert(matrixA.size() == dimension); assert(matrixA[0].size() == dimension); assert(matrixB.size() == dimension); assert(matrixB[0].size() == dimension); matrix_t matrixC(dimension, typename matrix_t::value_type(dimension, 0));//0ed matrix const int vec = 8; int start{0}; if(dimension > vec) { start = dimension - dimension % vec; for(int x{0}; x < dimension; ++x) for(int i{0}; i < dimension; ++i) { const __m256 a = _mm256_set_ps(matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i], matrixA[x][i]);// unaligned read for(int y{0}; y < dimension - vec; y += vec) { //__m256 c = _mm256_set_ps(matrixC[x][y+7], matrixC[x][y+6], matrixC[x][y+5], matrixC[x][y+4], matrixC[x][y+3], matrixC[x][y+2], matrixC[x][y+1], matrixC[x][y+0]); //const __m256 b = _mm256_set_ps(matrixB[i][y+7], matrixB[i][y+6], matrixB[i][y+5], matrixB[i][y+4], matrixB[i][y+3], matrixB[i][y+2], matrixB[i][y+1], matrixB[i][y+0]); __m256 c = *reinterpret_cast<__m256*>(&matrixC[x][y]);// aligned read const __m256 & b = *reinterpret_cast<const __m256*>(&matrixB[i][y]);// aligned read c = _mm256_fmadd_ps(a, b, c);//c = a * b + c; //_mm256_store_ps(&matrixC[x][y], c);//aligned //_mm256_storeu_ps(&matrixC[x][y], c);//unaligned /* float c[8]; c[0] = matrixC[i][y+0]; c[1] = matrixC[i][y+1]; c[2] = matrixC[i][y+2]; c[3] = matrixC[i][y+3]; c[4] = matrixC[i][y+4]; c[5] = matrixC[i][y+5]; c[6] = matrixC[i][y+6]; c[7] = matrixC[i][y+7]; c[0] += matrixA[x][i] * matrixB[i][y+0]; c[1] += matrixA[x][i] * matrixB[i][y+1]; c[2] += matrixA[x][i] * matrixB[i][y+2]; c[3] += matrixA[x][i] * matrixB[i][y+3]; c[4] += matrixA[x][i] * matrixB[i][y+4]; c[5] += matrixA[x][i] * matrixB[i][y+5]; c[6] += matrixA[x][i] * matrixB[i][y+6]; c[7] += matrixA[x][i] * matrixB[i][y+7]; //*/ //* matrixC[x][y+0] = c[0]; matrixC[x][y+1] = c[1]; matrixC[x][y+2] = c[2]; matrixC[x][y+3] = c[3]; matrixC[x][y+4] = c[4]; matrixC[x][y+5] = c[5]; matrixC[x][y+6] = c[6]; matrixC[x][y+7] = c[7]; //*/ /*is doing this matrixC[x][y+0] += matrixA[x][i] * matrixB[i][y+0]; matrixC[x][y+1] += matrixA[x][i] * matrixB[i][y+1]; matrixC[x][y+2] += matrixA[x][i] * matrixB[i][y+2]; matrixC[x][y+3] += matrixA[x][i] * matrixB[i][y+3]; matrixC[x][y+4] += matrixA[x][i] * matrixB[i][y+4]; matrixC[x][y+5] += matrixA[x][i] * matrixB[i][y+5]; matrixC[x][y+6] += matrixA[x][i] * matrixB[i][y+6]; matrixC[x][y+7] += matrixA[x][i] * matrixB[i][y+7]; //*/ } } } //calculate remaining columns for(int x{0}; x < dimension; ++x) for(int i{0}; i < dimension; ++i) for(int y{start}; y < dimension; ++y) matrixC[x][y] += matrixA[x][i] * matrixB[i][y]; return matrixC;//move semantics ftw }
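Stripped of the matrix_t wrappers and the commented-out variants, the vectorized inner loop above is a broadcast-scalar FMA over one row: C[x][y..y+7] += A[x][i] * B[i][y..y+7]. A minimal sketch of that kernel on flat rows (hypothetical names; assumes 32-byte-aligned rows and n a multiple of 8):

#include <immintrin.h>

/* c_row[y..y+7] += a_xi * b_row[y..y+7], eight columns at a time. */
static void fma_row_update(float *c_row, const float *b_row, float a_xi, int n) {
    __m256 a = _mm256_set1_ps(a_xi);            /* broadcast A[x][i] */
    for (int y = 0; y < n; y += 8) {
        __m256 c = _mm256_load_ps(c_row + y);
        __m256 b = _mm256_load_ps(b_row + y);
        c = _mm256_fmadd_ps(a, b, c);           /* c += a * b */
        _mm256_store_ps(c_row + y, c);
    }
}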
void TransLut::process_plane_flt_any_avx2 (uint8_t *dst_ptr, const uint8_t *src_ptr, int stride_dst, int stride_src, int w, int h) { assert (dst_ptr != 0); assert (src_ptr != 0); assert (stride_dst != 0 || h == 1); assert (stride_src != 0 || h == 1); assert (w > 0); assert (h > 0); for (int y = 0; y < h; ++y) { const FloatIntMix * s_ptr = reinterpret_cast <const FloatIntMix *> (src_ptr); TD * d_ptr = reinterpret_cast < TD *> (dst_ptr); for (int x = 0; x < w; x += 8) { union { __m256i _vect; uint32_t _scal [8]; } index; __m256 lerp; TransLut_FindIndexAvx2 <M>::find_index (s_ptr + x, index._vect, lerp); #if 1 // Looks as fast as _mm256_set_ps // G++ complains about sizeof() as argument __m256 val = _mm256_i32gather_ps ( &_lut.use <float> (0), index._vect, 4 // 4 == sizeof (float) ); const __m256 va2 = _mm256_i32gather_ps ( &_lut.use <float> (1), index._vect, 4 // 4 == sizeof (float) ); #else __m256 val = _mm256_set_ps ( _lut.use <float> (index._scal [7] ), _lut.use <float> (index._scal [6] ), _lut.use <float> (index._scal [5] ), _lut.use <float> (index._scal [4] ), _lut.use <float> (index._scal [3] ), _lut.use <float> (index._scal [2] ), _lut.use <float> (index._scal [1] ), _lut.use <float> (index._scal [0] ) ); const __m256 va2 = _mm256_set_ps ( _lut.use <float> (index._scal [7] + 1), _lut.use <float> (index._scal [6] + 1), _lut.use <float> (index._scal [5] + 1), _lut.use <float> (index._scal [4] + 1), _lut.use <float> (index._scal [3] + 1), _lut.use <float> (index._scal [2] + 1), _lut.use <float> (index._scal [1] + 1), _lut.use <float> (index._scal [0] + 1) ); #endif const __m256 dif = _mm256_sub_ps (va2, val); val = _mm256_add_ps (val, _mm256_mul_ps (dif, lerp)); TransLut_store_avx2 (&d_ptr [x], val); } src_ptr += stride_src; dst_ptr += stride_dst; } _mm256_zeroupper (); // Back to SSE state }
__m256 foo (float *v) { return _mm256_set_ps (v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]); }
__m256 foo (float x) { return _mm256_set_ps (x, x, x, x, x, x, x, x); }
void sLine_onMessage(HvBase *_c, SignalLine *o, int letIn, const HvMessage * const m, void *sendMessage) { if (msg_isFloat(m,0)) { if (msg_isFloat(m,1)) { // new ramp int n = ctx_millisecondsToSamples(_c, msg_getFloat(m,1)); #if HV_SIMD_AVX float x = (o->n[1] > 0) ? (o->x[7] + (o->m[7]/8.0f)) : o->t[7]; // current output value float s = (msg_getFloat(m,0) - x) / ((float) n); // slope per sample o->n = _mm_set_epi32(n-3, n-2, n-1, n); o->x = _mm256_set_ps(x+7.0f*s, x+6.0f*s, x+5.0f*s, x+4.0f*s, x+3.0f*s, x+2.0f*s, x+s, x); o->m = _mm256_set1_ps(8.0f*s); o->t = _mm256_set1_ps(msg_getFloat(m,0)); #elif HV_SIMD_SSE float x = (o->n[1] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3]; float s = (msg_getFloat(m,0) - x) / ((float) n); // slope per sample o->n = _mm_set_epi32(n-3, n-2, n-1, n); o->x = _mm_set_ps(x+3.0f*s, x+2.0f*s, x+s, x); o->m = _mm_set1_ps(4.0f*s); o->t = _mm_set1_ps(msg_getFloat(m,0)); #elif HV_SIMD_NEON float x = (o->n[3] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3]; float s = (msg_getFloat(m,0) - x) / ((float) n); o->n = (int32x4_t) {n, n-1, n-2, n-3}; o->x = (float32x4_t) {x, x+s, x+2.0f*s, x+3.0f*s}; o->m = vdupq_n_f32(4.0f*s); o->t = vdupq_n_f32(msg_getFloat(m,0)); #else // HV_SIMD_NONE o->x = (o->n > 0) ? (o->x + o->m) : o->t; // new current value o->n = n; // new distance to target o->m = (msg_getFloat(m,0) - o->x) / ((float) n); // slope per sample o->t = msg_getFloat(m,0); #endif } else { // Jump to value #if HV_SIMD_AVX o->n = _mm_setzero_si128(); o->x = _mm256_set1_ps(msg_getFloat(m,0)); o->m = _mm256_setzero_ps(); o->t = _mm256_set1_ps(msg_getFloat(m,0)); #elif HV_SIMD_SSE o->n = _mm_setzero_si128(); o->x = _mm_set1_ps(msg_getFloat(m,0)); o->m = _mm_setzero_ps(); o->t = _mm_set1_ps(msg_getFloat(m,0)); #elif HV_SIMD_NEON o->n = vdupq_n_s32(0); o->x = vdupq_n_f32(0.0f); o->m = vdupq_n_f32(0.0f); o->t = vdupq_n_f32(0.0f); #else // HV_SIMD_NONE o->n = 0; o->x = msg_getFloat(m,0); o->m = 0.0f; o->t = msg_getFloat(m,0); #endif } } else if (msg_compareSymbol(m,0,"stop")) { // Stop line at current position #if HV_SIMD_AVX // note o->n[1] is a 64-bit integer; two packed 32-bit ints. We only want to know if the high int is positive, // which can be done simply by testing the long int for positiveness. float x = (o->n[1] > 0) ? (o->x[7] + (o->m[7]/8.0f)) : o->t[7]; o->n = _mm_setzero_si128(); o->x = _mm256_set1_ps(x); o->m = _mm256_setzero_ps(); o->t = _mm256_set1_ps(x); #elif HV_SIMD_SSE float x = (o->n[1] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3]; o->n = _mm_setzero_si128(); o->x = _mm_set1_ps(x); o->m = _mm_setzero_ps(); o->t = _mm_set1_ps(x); #elif HV_SIMD_NEON float x = (o->n[3] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3]; o->n = vdupq_n_s32(0); o->x = vdupq_n_f32(x); o->m = vdupq_n_f32(0.0f); o->t = vdupq_n_f32(x); #else // HV_SIMD_NONE o->n = 0; o->x += o->m; o->m = 0.0f; o->t = o->x; #endif } }
void __hv_biquad_f_win32(SignalBiquad *o, hv_bInf_t *_bIn, hv_bInf_t *_bX0, hv_bInf_t *_bX1, hv_bInf_t *_bX2, hv_bInf_t *_bY1, hv_bInf_t *_bY2, hv_bOutf_t bOut) { hv_bInf_t bIn = *_bIn; hv_bInf_t bX0 = *_bX0; hv_bInf_t bX1 = *_bX1; hv_bInf_t bX2 = *_bX2; hv_bInf_t bY1 = *_bY1; hv_bInf_t bY2 = *_bY2; #else void __hv_biquad_f(SignalBiquad *o, hv_bInf_t bIn, hv_bInf_t bX0, hv_bInf_t bX1, hv_bInf_t bX2, hv_bInf_t bY1, hv_bInf_t bY2, hv_bOutf_t bOut) { #endif #if HV_SIMD_AVX __m256 a = _mm256_mul_ps(bIn, bX0); __m256 b = _mm256_mul_ps(o->xm1, bX1); __m256 c = _mm256_mul_ps(o->xm2, bX2); __m256 d = _mm256_add_ps(a, b); __m256 e = _mm256_add_ps(c, d); // bIn*bX0 + o->x1*bX1 + o->x2*bX2 float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0]; float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1]; float y2 = e[2] - y1*bY1[2] - y0*bY2[2]; float y3 = e[3] - y2*bY1[3] - y1*bY2[3]; float y4 = e[4] - y3*bY1[4] - y2*bY2[4]; float y5 = e[5] - y4*bY1[5] - y3*bY2[5]; float y6 = e[6] - y5*bY1[6] - y4*bY2[6]; float y7 = e[7] - y6*bY1[7] - y5*bY2[7]; o->xm2 = o->xm1; o->xm1 = bIn; o->ym1 = y7; o->ym2 = y6; *bOut = _mm256_set_ps(y7, y6, y5, y4, y3, y2, y1, y0); #elif HV_SIMD_SSE __m128 a = _mm_mul_ps(bIn, bX0); __m128 b = _mm_mul_ps(o->xm1, bX1); __m128 c = _mm_mul_ps(o->xm2, bX2); __m128 d = _mm_add_ps(a, b); __m128 e = _mm_add_ps(c, d); const float *const bbe = (float *) &e; const float *const bbY1 = (float *) &bY1; const float *const bbY2 = (float *) &bY2; float y0 = bbe[0] - o->ym1*bbY1[0] - o->ym2*bbY2[0]; float y1 = bbe[1] - y0*bbY1[1] - o->ym1*bbY2[1]; float y2 = bbe[2] - y1*bbY1[2] - y0*bbY2[2]; float y3 = bbe[3] - y2*bbY1[3] - y1*bbY2[3]; o->xm2 = o->xm1; o->xm1 = bIn; o->ym1 = y3; o->ym2 = y2; *bOut = _mm_set_ps(y3, y2, y1, y0); #elif HV_SIMD_NEON float32x4_t a = vmulq_f32(bIn, bX0); float32x4_t b = vmulq_f32(o->xm1, bX1); float32x4_t c = vmulq_f32(o->xm2, bX2); float32x4_t d = vaddq_f32(a, b); float32x4_t e = vaddq_f32(c, d); float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0]; float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1]; float y2 = e[2] - y1*bY1[2] - y0*bY2[2]; float y3 = e[3] - y2*bY1[3] - y1*bY2[3]; o->xm2 = o->xm1; o->xm1 = bIn; o->ym1 = y3; o->ym2 = y2; *bOut = (float32x4_t) {y0, y1, y2, y3}; #else const float y = bIn*bX0 + o->xm1*bX1 + o->xm2*bX2 - o->ym1*bY1 - o->ym2*bY2; o->xm2 = o->xm1; o->xm1 = bIn; o->ym2 = o->ym1; o->ym1 = y; *bOut = y; #endif }
void neuralNet::feedForward_layer(layerIterator_t nLayer) { constFloatIterator_t pActivations, cWeight, endWeight; __m256 vTotal, vSub0, vSub1; __m256 *vWeight, *vAct, *vEndWeight; // summate each neuron's contribution for (neuronIterator_t cNeuron = nLayer->begin(), end = nLayer->end(); cNeuron != end; ++cNeuron) { // foreach [previous neuron, current weight], up to endWeight pActivations = activations.begin() + (nLayer - 1)->front().iNeuronIndex; cWeight = cNeuron->weightsBegin(*this); endWeight = cNeuron->weightsEnd(*this); // (first 15 neurons) (TODO: redesign preamble and remove assertions for multiple of 16 size widths in neuralNet.h!) // summate all neurons of previous layer: (remaining batches of 8 neurons) vWeight = (__m256*)&cWeight[0]; vAct = (__m256*)&pActivations[0]; vEndWeight = (__m256*)&endWeight[0]; // initialize the activation of this neuron to its bias weight. The bias weight's neuron is always on: vTotal = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, *endWeight); // can this be made with an aligned load? do // Take advantage of SIMD instructions by doing 16 multiplies per iteration { /* * each neuron's contribution is: * input[j] += weight[i,j] * activation[i] */ // multiply: vSub0 = _mm256_mul_ps(vWeight[0], vAct[0]); vSub1 = _mm256_mul_ps(vWeight[1], vAct[1]); // prefetch next values: (these don't appear to help, are the networks too small for this to matter?) //_mm_prefetch((char*)(vWeight0+4), _MM_HINT_T0); //_mm_prefetch((char*)(vAct0+4), _MM_HINT_T0); // add to accumulator: vTotal = _mm256_add_ps(vTotal, vSub0); vTotal = _mm256_add_ps(vTotal, vSub1); // increment pointers: vWeight += 2; vAct += 2; } while (vWeight != vEndWeight); //finalize: (combine all 4 accumulators) { vTotal = _mm256_hadd_ps(vTotal, vTotal); vTotal = _mm256_hadd_ps(vTotal, vTotal); __m128 vUpperTotal = _mm256_extractf128_ps(vTotal, 1); vUpperTotal = _mm_add_ps(vUpperTotal, _mm256_castps256_ps128(vTotal)); // store the lowest float into cInput: _mm_store_ss(&activations[cNeuron->iNeuronIndex], vUpperTotal); } } // activate all neurons in this layer: float* cActivation = (&activations.front() + nLayer->front().iNeuronIndex); float* lActivation = (&activations.front() + nLayer->back().iNeuronIndex + 1); float* lVectorActivation = lActivation - ((lActivation - cActivation)&(ALIGN_SIZE-1)); // equivalent to mod ALIGN_SIZE // aligned activations: while (cActivation != lVectorActivation) { activation_approx_avx(cActivation, cActivation); cActivation += ALIGN_SIZE; }; // postscript: (unaligned activations): { size_t dActivation = (lActivation - cActivation); switch(dActivation) { case 7: activation_approx(cActivation+6,cActivation+6); case 6: activation_approx(cActivation+5,cActivation+5); case 5: activation_approx(cActivation+4,cActivation+4); case 4: activation_approx_sse(cActivation+0,cActivation+0); break; case 3: activation_approx(cActivation+2, cActivation+2); case 2: activation_approx(cActivation+1, cActivation+1); case 1: activation_approx(cActivation+0, cActivation+0); case 0: break; } } }; // endOf feedForward_layer
void sgdUpdateAvx(float learningRate, float momentum, float decayRate, size_t size, float* value, const float* _grad, float* momentumVec) { #ifdef __AVX__ float* grad = const_cast<float*>(_grad); // the gradient is not modified, but invoking the SIMD functions below needs a non-const pointer. size_t gradientAlign = 0; size_t gradientAlignHeader = (size_t)grad % sizeof(__m256); CHECK_EQ(gradientAlignHeader, (size_t)momentumVec % sizeof(__m256)) << "Gradient buffer didn't align with momentum buffer"; CHECK_EQ(gradientAlignHeader, (size_t)value % sizeof(__m256)) << "Gradient buffer didn't align with value buffer"; if (0 != gradientAlignHeader) { gradientAlignHeader = sizeof(__m256) - gradientAlignHeader; gradientAlign = gradientAlignHeader / sizeof(real); // handle the unaligned buffer for (size_t i = 0; i < gradientAlign; i++) { momentumVec[i] = momentum * momentumVec[i] - (learningRate * grad[i]) - (decayRate * learningRate * value[i]); value[i] += momentumVec[i]; } grad += gradientAlign; momentumVec += gradientAlign; value += gradientAlign; } constexpr size_t kParallelNum = 8; constexpr size_t nStepSize = (sizeof(__m256) / sizeof(real)) * kParallelNum; size_t cntLoop = (size - gradientAlign) / nStepSize; size_t cntRem = (size - gradientAlign) % nStepSize; __m256 gradientTmp[kParallelNum]; __m256 valueTmp[kParallelNum]; __m256 lr, mom, dr; std::function<void(void)> loopFun; learningRate *= -1; lr = _mm256_set_ps(learningRate, learningRate, learningRate, learningRate, learningRate, learningRate, learningRate, learningRate); if (0 != momentum) { mom = _mm256_set_ps(momentum, momentum, momentum, momentum, momentum, momentum, momentum, momentum); } decayRate *= learningRate; if (0 != decayRate) { dr = _mm256_set_ps(decayRate, decayRate, decayRate, decayRate, decayRate, decayRate, decayRate, decayRate); } auto gradMulFun = [&](void) { gradientTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad), lr); gradientTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 8), lr); gradientTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 16), lr); gradientTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 24), lr); gradientTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 32), lr); gradientTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 40), lr); gradientTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 48), lr); gradientTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 56), lr); }; auto valueMulFun = [&](void) { valueTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value), dr); valueTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 8), dr); valueTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 16), dr); valueTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 24), dr); valueTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 32), dr); valueTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 40), dr); valueTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 48), dr); valueTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 56), dr); }; auto momentumMulFun = [&](void) { *reinterpret_cast<__m256*>(momentumVec) = _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec), mom); *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 8), mom); *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 16), mom); *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 24), mom); 
*reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 32), mom); *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 40), mom); *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 48), mom); *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 56), mom); }; auto momentumAddGradFun = [&](void) { *reinterpret_cast<__m256*>(momentumVec) = _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), gradientTmp[0]); *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 8), gradientTmp[1]); *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 16), gradientTmp[2]); *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 24), gradientTmp[3]); *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 32), gradientTmp[4]); *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 40), gradientTmp[5]); *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 48), gradientTmp[6]); *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 56), gradientTmp[7]); }; auto momentumZeroFun = [&](void) { *reinterpret_cast<__m256*>(momentumVec) = gradientTmp[0]; *reinterpret_cast<__m256*>(momentumVec + 8) = gradientTmp[1]; *reinterpret_cast<__m256*>(momentumVec + 16) = gradientTmp[2]; *reinterpret_cast<__m256*>(momentumVec + 24) = gradientTmp[3]; *reinterpret_cast<__m256*>(momentumVec + 32) = gradientTmp[4]; *reinterpret_cast<__m256*>(momentumVec + 40) = gradientTmp[5]; *reinterpret_cast<__m256*>(momentumVec + 48) = gradientTmp[6]; *reinterpret_cast<__m256*>(momentumVec + 56) = gradientTmp[7]; }; auto momentumAddValueFun = [&](void) { *reinterpret_cast<__m256*>(momentumVec) = _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), valueTmp[0]); *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec + 8), valueTmp[1]); *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 16), valueTmp[2]); *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 24), valueTmp[3]); *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 32), valueTmp[4]); *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 40), valueTmp[5]); *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 48), valueTmp[6]); *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps( *reinterpret_cast<__m256*>(momentumVec + 56), valueTmp[7]); }; auto valueAddMomentumFun = [&](void) { *reinterpret_cast<__m256*>(value) = _mm256_add_ps(*reinterpret_cast<__m256*>(value), *reinterpret_cast<__m256*>(momentumVec)); *reinterpret_cast<__m256*>(value + 8) = _mm256_add_ps(*reinterpret_cast<__m256*>(value + 8), *reinterpret_cast<__m256*>(momentumVec + 8)); *reinterpret_cast<__m256*>(value + 16) = _mm256_add_ps(*reinterpret_cast<__m256*>(value + 16), *reinterpret_cast<__m256*>(momentumVec + 16)); *reinterpret_cast<__m256*>(value + 24) = 
_mm256_add_ps(*reinterpret_cast<__m256*>(value + 24), *reinterpret_cast<__m256*>(momentumVec + 24)); *reinterpret_cast<__m256*>(value + 32) = _mm256_add_ps(*reinterpret_cast<__m256*>(value + 32), *reinterpret_cast<__m256*>(momentumVec + 32)); *reinterpret_cast<__m256*>(value + 40) = _mm256_add_ps(*reinterpret_cast<__m256*>(value + 40), *reinterpret_cast<__m256*>(momentumVec + 40)); *reinterpret_cast<__m256*>(value + 48) = _mm256_add_ps(*reinterpret_cast<__m256*>(value + 48), *reinterpret_cast<__m256*>(momentumVec + 48)); *reinterpret_cast<__m256*>(value + 56) = _mm256_add_ps(*reinterpret_cast<__m256*>(value + 56), *reinterpret_cast<__m256*>(momentumVec + 56)); }; if (0 == decayRate && 0 == momentum) { loopFun = [&](void) { gradMulFun(); momentumZeroFun(); valueAddMomentumFun(); }; } else if (0 == decayRate && 0 != momentum) { loopFun = [&](void) { gradMulFun(); momentumMulFun(); momentumAddGradFun(); valueAddMomentumFun(); }; } else if (0 != decayRate && 0 == momentum) { loopFun = [&](void) { gradMulFun(); valueMulFun(); momentumZeroFun(); momentumAddValueFun(); valueAddMomentumFun(); }; } else if (0 != decayRate && 0 != momentum) { loopFun = [&](void) { gradMulFun(); valueMulFun(); momentumMulFun(); momentumAddGradFun(); momentumAddValueFun(); valueAddMomentumFun(); }; } for (size_t i = 0; i < cntLoop; i++) { loopFun(); grad += nStepSize; momentumVec += nStepSize; value += nStepSize; } for (size_t i = 0; i < cntRem; i++) { momentumVec[i] = momentum * momentumVec[i] + (learningRate * grad[i]) + (decayRate * value[i]); value[i] += momentumVec[i]; } #endif }