/*
 * Vectorized elementwise exponential: y[i] = exp(a[i]) for i in [0, n).
 * Processes 32 floats per iteration with AVX (4 x 8 lanes) via the
 * exp256_ps helper, then handles the n % 32 tail with scalar expf().
 * Parameters b, z and other_params belong to the common VML kernel
 * signature and are unused by this kernel.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    VMLLONG i = 0;
    VMLLONG loop_count = (n) >> 5;      /* number of full 32-element chunks */
    VMLLONG remain_count = (n) & 0x1f;  /* leftover elements (n % 32)       */

    while (loop_count > 0) {
        __m256 av0 = _mm256_loadu_ps(a);
        __m256 av1 = _mm256_loadu_ps(a + 8);
        __m256 av2 = _mm256_loadu_ps(a + 16);
        __m256 av3 = _mm256_loadu_ps(a + 24);

        __m256 yv0 = exp256_ps(av0);
        __m256 yv1 = exp256_ps(av1);
        __m256 yv2 = exp256_ps(av2);
        __m256 yv3 = exp256_ps(av3);

        _mm256_storeu_ps(y, yv0);
        _mm256_storeu_ps(y + 8, yv1);
        _mm256_storeu_ps(y + 16, yv2);
        _mm256_storeu_ps(y + 24, yv3);

        a += 32;
        y += 32;
        loop_count--;
    }

    /* BUG FIX: the tail loop previously ran for i < n even though a and
     * y already point past the vectorized prefix, reading and writing up
     * to (n - n%32) elements out of bounds.  Only the remainder of n%32
     * elements must be processed here. */
    for (i = 0; i < remain_count; i++) {
        y[i] = expf(a[i]);
    }
}
/* Element-wise product d[i] = a[i] * b[i] over gNumFloats floats,
 * 8 lanes per step, using unaligned AVX loads and stores. */
void UnalignedAvxMult(float* d, float const* a, float const* b) {
    for (int idx = 0; idx < gNumFloats; idx += 8) {
        const __m256 lhs = _mm256_loadu_ps(a + idx);
        const __m256 rhs = _mm256_loadu_ps(b + idx);
        _mm256_storeu_ps(d + idx, _mm256_mul_ps(lhs, rhs));
    }
}
/*
 * Anti-symmetric column convolution pass (float32, AVX).
 * For each output x in [0, width): dst[x] = delta +
 *   sum_{k=1..ksize2} ky[k] * (src[k][x] - src[-k][x]),
 * i.e. the k = 0 tap is skipped (its weight is implicitly zero for an
 * anti-symmetric kernel).  src points at the center row so it is
 * indexed with negative offsets.  Returns the number of fully
 * vectorized outputs; the caller handles the remaining width - i
 * columns with scalar code.
 */
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    /* Main loop: 16 outputs per iteration in two YMM accumulators. */
    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        S = src[0] + i;
        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }
    /* SSE tail: 4 outputs per iteration.
     * NOTE(review): this loop uses the aligned _mm_load_ps on the
     * source rows — presumably the row pointers are 16-byte aligned in
     * the calling filter engine; confirm before reusing elsewhere. */
    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;
        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }
        _mm_storeu_ps(dst + i, s0);
    }
    /* Clear upper YMM state before returning to possibly-SSE callers. */
    _mm256_zeroupper();
    return i;
}
void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n) { ptrdiff_t i; ptrdiff_t off; for (i=0; i<=((n)-16); i+=16) { _mm256_storeu_ps(y+i, _mm256_loadu_ps(x+i)); _mm256_storeu_ps(y+i+8, _mm256_loadu_ps(x+i+8)); } off = (n) - ((n)%16); for (i=0; i<((n)%16); i++) { y[off+i] = x[off+i]; } }
/*
 * In-place update C -= A^T * B for row-major float matrices:
 * c[i][:] -= sum_k a[k][i] * b[k][:] (a is read transposed).
 * NOTE(review): each row is processed as exactly 4 x 8 = 32 columns
 * (offsets 0, 8, 16, 24), so despite taking n as a parameter this
 * kernel is only correct for n == 32, as the name suggests — confirm
 * with callers before reusing for other sizes.
 */
extern "C" void product32x32_avxf(float *a, float *b, float *c, int n)
{
    for(int i=0; i<n; i++) {
        /* Load the 32 accumulator columns of row i of C. */
        __m256 t1 = _mm256_loadu_ps(&c[i*n + 0]);
        __m256 t2 = _mm256_loadu_ps(&c[i*n + 8]);
        __m256 t3 = _mm256_loadu_ps(&c[i*n + 16]);
        __m256 t4 = _mm256_loadu_ps(&c[i*n + 24]);
        for(int k=0; k<n; k++) {
            /* Broadcast A(k,i) and subtract its product with row k of B. */
            __m256 a1 = _mm256_set1_ps(a[k*n+i]);
            __m256 b1 = _mm256_loadu_ps(&b[k*n+0]);
            t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1));
            __m256 b2 = _mm256_loadu_ps(&b[k*n+8]);
            t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2));
            __m256 b3 = _mm256_loadu_ps(&b[k*n+16]);
            t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3));
            __m256 b4 = _mm256_loadu_ps(&b[k*n+24]);
            t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4));
        }
        _mm256_storeu_ps(&c[i*n + 0], t1);
        _mm256_storeu_ps(&c[i*n + 8], t2);
        _mm256_storeu_ps(&c[i*n + 16], t3);
        _mm256_storeu_ps(&c[i*n + 24], t4);
    }
}
/*
 * Produce one stereo output frame of windowed-sinc resampling (AVX).
 * Convolves the left/right ring buffers at the current read position
 * with the sinc coefficient table selected by the integer subphase of
 * resamp->time; with SINC_COEFF_LERP the coefficients are linearly
 * interpolated between adjacent phase tables using the fractional
 * subphase.  Writes out_buffer[0] (left) and out_buffer[1] (right).
 * Assumes taps is a multiple of 8 (no scalar tail) and that the
 * phase/delta tables are 32-byte aligned (aligned loads are used).
 */
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned i;
   __m256 sum_l = _mm256_setzero_ps();
   __m256 sum_r = _mm256_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned taps = resamp->taps;
   /* Integer part of the fixed-point time selects the phase table. */
   unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   /* With lerp, each phase stores coefficients followed by deltas. */
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   /* Fractional subphase, broadcast for the interpolation below. */
   __m256 delta = _mm256_set1_ps((float) (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (i = 0; i < taps; i += 8)
   {
      __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
      __m256 buf_r = _mm256_loadu_ps(buffer_r + i);
#if SINC_COEFF_LERP
      __m256 deltas = _mm256_load_ps(delta_table + i);
      __m256 sinc = _mm256_add_ps(_mm256_load_ps(phase_table + i), _mm256_mul_ps(deltas, delta));
#else
      __m256 sinc = _mm256_load_ps(phase_table + i);
#endif
      sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
      sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
   }

   /* hadd on AVX is weird, and acts on low-lanes
    * and high-lanes separately. */
   __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
   __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
   res_l = _mm256_hadd_ps(res_l, res_l);
   res_r = _mm256_hadd_ps(res_r, res_r);
   /* Fold the high 128-bit lane onto the low lane to finish the
    * horizontal reduction. */
   res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
   res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

   /* This is optimized to mov %xmmN, [mem].
    * There doesn't seem to be any _mm256_store_ss intrinsic. */
   _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
   _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
/* Adjust MBR to fit all child MBRs */
/*
 * Recomputes nodes[cur].mbr as the bounding box of all of its
 * children's MBRs.  Children are stored contiguously starting at
 * nodes[node->pos], node->len of them.  Three implementations are
 * selected by compile-time flags: SSE (per-half min/max), AVX (whole
 * 8-float MBR at once), and a scalar fallback.
 */
inline void adjustMbrArraySTRNode(ArraySTRNode nodes[], ulong_t cur)
{
    ArraySTRNode *node, *child;
    ulong_t k;

    node = &nodes[cur];
    child = &nodes[node->pos];  /* first child; siblings are contiguous */

    /* enlarge mbr to include all childlen's mbr */
#ifdef ENABLE_SSE_ADJUST
    {
        /* Running min over lows and max over upps, 4 floats at a time.
         * NOTE(review): _mm_load_ps/_mm_store_ps require the low/upp
         * arrays to be 16-byte aligned — confirm the Mbr layout
         * guarantees this. */
        __m128 v_nlow = _mm_load_ps(child[0].mbr.low);
        __m128 v_nupp = _mm_load_ps(child[0].mbr.upp);
        for (k = 1; k < node->len; k++) {
            v_nlow = _mm_min_ps(v_nlow, _mm_load_ps(child[k].mbr.low));
            v_nupp = _mm_max_ps(v_nupp, _mm_load_ps(child[k].mbr.upp));
        }
        _mm_store_ps(node->mbr.low, v_nlow);
        _mm_store_ps(node->mbr.upp, v_nupp);
    }
#else
#ifdef ENABLE_AVX_TEST1
    {
        /* AVX path: treat the whole Mbr (8 floats) as one vector;
         * compute lane-wise min and max, then recombine halves with
         * permute2f128 imm 0x12 = { low half of v_max, high half of
         * v_min }.  NOTE(review): this is only correct if the Mbr half
         * that needs the max lies in the low 128 bits of the struct —
         * verify against the Mbr field order. */
        __m256 v_nmbr = _mm256_loadu_ps((float *)&child[0].mbr);
        for (k = 1; k < node->len; k++) {
            __m256 v_cmbr = _mm256_loadu_ps((float *)&child[k].mbr);
            __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
            __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
            v_nmbr = _mm256_permute2f128_ps(v_min, v_max, 0x12);
        }
        _mm256_storeu_ps((float *)&node->mbr, v_nmbr);
    }
#else
    /* Scalar fallback: component-wise min of lows, max of upps. */
    /* copy first child's mbr */
    node->mbr = child[0].mbr;
    for (k = 1; k < node->len; k++) {
        int i;
        for (i = 0; i < NDIMS; i++) {
            if (node->mbr.low[i] > child[k].mbr.low[i])
                node->mbr.low[i] = child[k].mbr.low[i];
            if (node->mbr.upp[i] < child[k].mbr.upp[i])
                node->mbr.upp[i] = child[k].mbr.upp[i];
        }
    }
#endif
#endif
}
void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-8); i+=8) { YMM0 = _mm256_loadu_ps(y+i); YMM1 = _mm256_loadu_ps(x+i); YMM2 = _mm256_mul_ps(YMM0, YMM15); YMM3 = _mm256_add_ps(YMM1, YMM2); _mm256_storeu_ps(z+i, YMM3); } for (; i<(n); i++) { z[i] = x[i] + y[i] * c; } }
/*
 * CSR sparse matrix-vector product y = A * x (AVX2 gather path).
 * A:       concatenated non-zero values, row after row.
 * nIdx:    number of non-zeros per row (n entries).
 * indices: per-row column index arrays (int32).
 * x:       dense input vector; y: dense output vector (n entries).
 * Each row is processed 8 non-zeros at a time with a gather, then the
 * remainder scalar; sum8() horizontally reduces the vector accumulator.
 */
void avx2_csr_spmv( float *A, int32_t *nIdx, int32_t **indices, float *x, int32_t n, float *y)
{
    int32_t A_offset = 0;
    for (int32_t i = 0; i < n; i++) {
        int32_t nElem = nIdx[i];
        float t = 0.0f;
        __m256 vT = _mm256_setzero_ps();
        int32_t smLen = nElem - (nElem & 7);  /* largest multiple of 8 <= nElem */
        for (int32_t j = 0; j < smLen; j += 8) {
            /* BUG FIX: the per-row index arrays are not guaranteed to
             * be 32-byte aligned, so the aligned _mm256_load_si256
             * used here previously could fault; use the unaligned
             * load instead. */
            __m256i vIdx = _mm256_loadu_si256((__m256i const*)&(indices[i][j]));
            __m256 vX = _mm256_i32gather_ps((float const*)x, vIdx, 4);
            __m256 vA = _mm256_loadu_ps(&A[A_offset + j]);
            vT = _mm256_add_ps(vT, _mm256_mul_ps(vX, vA));
        }
        t += sum8(vT);
        /* Scalar tail for the last nElem % 8 non-zeros of the row. */
        for (int32_t j = smLen; j < nElem; j++) {
            int32_t idx = indices[i][j];
            t += x[idx] * A[A_offset + j];
        }
        y[i] = t;
        A_offset += nElem;
    }
}
void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1; for (i=0; i<=((n)-16); i+=16) { YMM0 = _mm256_loadu_ps(x+i); YMM1 = _mm256_loadu_ps(x+i+8); YMM0 = _mm256_add_ps(YMM0, YMM15); YMM1 = _mm256_add_ps(YMM1, YMM15); _mm256_storeu_ps(y+i, YMM0); _mm256_storeu_ps(y+i+8, YMM1); } for (; i<(n); i++) { y[i] = x[i] + c; } }
__attribute__((noinline)) float dot256fma(float *x1, float *x2, size_t len) { assert(len % 8 == 0); __m256 sum = _mm256_setzero_ps(); if (len > 7) { size_t limit = len - 7; for (size_t i = 0; i < limit; i += 8) { __m256 v1 = _mm256_loadu_ps(x1 + i); __m256 v2 = _mm256_loadu_ps(x2 + i); sum = _mm256_fmadd_ps(v1, v2, sum); } } float buffer[8]; _mm256_storeu_ps(buffer, sum); return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7]; }
void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-16); i+=16) { YMM0 = _mm256_loadu_ps(x+i); YMM1 = _mm256_loadu_ps(x+i+8); YMM2 = _mm256_loadu_ps(y+i); YMM3 = _mm256_loadu_ps(y+i+8); YMM2 = _mm256_mul_ps(YMM0, YMM2); YMM3 = _mm256_mul_ps(YMM1, YMM3); _mm256_storeu_ps(z+i, YMM2); _mm256_storeu_ps(z+i+8, YMM3); } for (; i<n; i++) { z[i] = x[i] * y[i]; } }
//this method is untested as of right now.... inline void set_union(Set<bitset> *A_in, Set<bitset> *B_in){ if(A_in->number_of_bytes > 0 && B_in->number_of_bytes > 0){ const uint64_t *a_index = (uint64_t*) A_in->data; const uint64_t *b_index = (uint64_t*) B_in->data; uint64_t* A = (uint64_t*)(A_in->data+sizeof(uint64_t)); uint64_t* B = (uint64_t*)(B_in->data+sizeof(uint64_t)); const size_t s_a = ((A_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t)); const size_t s_b = ((B_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t)); const bool a_big = a_index[0] > b_index[0]; assert(a_index[0] <= b_index[0]); const uint64_t start_index = (a_big) ? a_index[0] : b_index[0]; const uint64_t a_start_index = (a_big) ? 0:(b_index[0]-a_index[0]); const uint64_t b_start_index = (a_big) ? (a_index[0]-b_index[0]):0; const uint64_t end_index = ((a_index[0]+s_a) > (b_index[0]+s_b)) ? (b_index[0]+s_b):(a_index[0]+s_a); const uint64_t total_size = (start_index > end_index) ? 0:(end_index-start_index); //16 uint16_ts //8 ints //4 longs size_t i = 0; A += a_start_index; B += b_start_index; #if VECTORIZE == 1 for(; (i+3) < total_size; i += 4, A += 4, B += 4){ const __m256 a1 = _mm256_loadu_ps((const float*)A); const __m256 a2 = _mm256_loadu_ps((const float*)B); const __m256 r = _mm256_or_ps(a2, a1); _mm256_storeu_ps((float*)A, r); } #endif for(; i < total_size; i++, A++, B++){ *A |= *B; } } }
/*
 * Harness test for _mm256_maskstore_ps: the intrinsic stores only the
 * lanes whose mask element has its sign bit set, leaving the other
 * destination lanes untouched.  mask_v, union256, checkVf and abort
 * come from the surrounding AVX test scaffolding.
 * NOTE(review): the reference loop keys on m[i] being nonzero while
 * the intrinsic keys on the sign bit — consistent only if mask_v
 * yields 0 or negative values; confirm against the harness macro.
 */
void static avx_test (void)
{
  int i;
  /* Per-lane mask values supplied by the harness. */
  int m[8] = {mask_v(0), mask_v(1), mask_v(2), mask_v(3), mask_v(4), mask_v(5), mask_v(6), mask_v(7)};
  float s[8] = {1,2,3,4,5,6,7,8};
  union256 src, mask;
  float e [8] = {0.0};
  float d [8] = {0.0};

  src.x = _mm256_loadu_ps (s);
  /* Reinterpret the int mask bit patterns as packed floats. */
  mask.x = _mm256_loadu_ps ((float *)m);
  _mm256_maskstore_ps (d, mask.x, src.x);

  /* Scalar reference result. */
  for (i = 0 ; i < 8; i++)
    e[i] = m[i] ? s[i] : 0;

  if (checkVf (d, e, 8))
    abort ();
}
/*
 * Row-by-row element-wise addition of two 2-D arrays into the output:
 * aaOutput[y][x] = aaInput1[y][x] + aaInput2[y][x], VectorWidth floats
 * per step.  Assumes ciWidth is a multiple of VectorWidth (no tail).
 * NOTE(review): inputs are loaded with _mm256_loadu_ps, which assumes
 * InputPixelType_1/_2 and OutputPixelType are float-compatible —
 * confirm with the instantiating code.  The identifier _Run
 * (underscore + uppercase) is reserved to the implementation in
 * C/C++; consider renaming.
 */
void _Run(OutputPixelType aaOutput[ciHeight][ciWidth], InputPixelType_1 aaInput1[ciHeight][ciWidth], InputPixelType_2 aaInput2[ciHeight][ciWidth])
{
    for (int iY = 0; iY < ciHeight; ++iY)
    {
        /* NOTE(review): zeroall at the top of every row clears all YMM
         * registers; together with zeroupper below this looks like
         * AVX/SSE transition hygiene, presumably for benchmarking —
         * per-row is likely more than needed. */
        _mm256_zeroall();

        OutputPixelType *pOutput = aaOutput[iY];
        InputPixelType_1 *pInput1 = aaInput1[iY];
        InputPixelType_2 *pInput2 = aaInput2[iY];

        for (int iX = 0; iX < ciWidth; iX += VectorWidth)
        {
            __m256 mmIn1 = _mm256_loadu_ps( pInput1 + iX );
            __m256 mmIn2 = _mm256_loadu_ps( pInput2 + iX );
            _mm256_storeu_ps( pOutput + iX, _mm256_add_ps(mmIn1, mmIn2) );
        }
        _mm256_zeroupper();
    }
}
/* Fused multiply-accumulate: result[i] += a[i] * b[i] for i in
 * [0, dim).  AVX path is Linux-only (as in the rest of this module);
 * on other platforms this function is a no-op. */
void elem_mul (float *result, float *a, float *b, int dim) {
#ifdef __linux
    const int vec_end = dim - (dim % SIMD_WIDTH);
    __m256 va, vb, vr;
    for (int i = 0; i < vec_end; i += SIMD_WIDTH) {
        va = _mm256_loadu_ps(a + i);
        vb = _mm256_loadu_ps(b + i);
        vr = _mm256_loadu_ps(result + i);
        va = _mm256_mul_ps(va, vb);
        _mm256_storeu_ps(result + i, _mm256_add_ps(vr, va));
    }
    for (int i = vec_end; i < dim; ++i) {
        result[i] += a[i] * b[i];
    }
#endif
}
/*
 * Harness test for _mm256_andnot_ps: computes (~s1) & s2 bitwise over
 * the packed 256-bit values.  Integer data is reinterpreted as floats
 * purely to exercise the ps intrinsic (the operation is bitwise, so
 * the reinterpretation is value-preserving); the reference result is
 * computed with scalar integer ops.  union256, checkVi and abort come
 * from the surrounding AVX test scaffolding.
 */
void static avx_test (void)
{
  int i;
  union256 u, s1, s2;
  int source1[8]={34545, 95567, 23443, 5675, 2323, 67, 2345, 45667};
  int source2[8]={674, 57897, 93459, 45624, 54674, 1237, 67436, 79608};
  int d[8];
  int e[8];

  /* Load the int bit patterns as packed floats. */
  s1.x = _mm256_loadu_ps ((float *)source1);
  s2.x = _mm256_loadu_ps ((float *)source2);
  u.x = _mm256_andnot_ps (s1.x, s2.x);
  _mm256_storeu_ps ((float *)d, u.x);

  /* Scalar reference: bitwise AND-NOT per element. */
  for (i = 0; i < 8; i++)
    e[i] = (~source1[i]) & source2[i];

  if (checkVi (d, e, 8))
    abort ();
}
/*
 * Returns the bounding box (MBR) enclosing all child MBRs of a node.
 * Starts from the first child's box and folds in each remaining child
 * with component-wise min (low corner) / max (upp corner).
 */
inline Mbr getMbrRTreeNode(RTreeNode *node)
{
    Mbr mbr;
    int k;

    mbr = node->mbrs[0];  /* seed with the first child's box */
    for (k = 1; k < node->nchilds; k++) {
#ifdef ENABLE_SSE_TEST1
        __m128 v_nlow = _mm_load_ps(mbr.low);
        __m128 v_nupp = _mm_load_ps(mbr.upp);
        __m128 v_clow = _mm_load_ps(node->mbrs[k].low);
        __m128 v_cupp = _mm_load_ps(node->mbrs[k].upp);
        /* BUG FIX: results were previously stored into node->mbr
         * instead of the local accumulator, so each iteration kept
         * comparing against mbrs[0] and the function returned an
         * un-merged box.  Store into the local mbr that is returned. */
        _mm_store_ps(mbr.low, _mm_min_ps(v_nlow, v_clow));
        _mm_store_ps(mbr.upp, _mm_max_ps(v_nupp, v_cupp));
#else
#ifdef ENABLE_AVX_TEST1
        /* Whole-MBR min/max, then recombine the 128-bit halves with
         * permute2f128 imm 0x12 = { low half of v_max, high half of
         * v_min }.  NOTE(review): correct only if the Mbr half needing
         * the max lies in the low 128 bits — verify the field order. */
        __m256 v_nmbr = _mm256_loadu_ps((float *)&mbr);
        __m256 v_cmbr = _mm256_loadu_ps((float *)&node->mbrs[k]);
        __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
        __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
        __m256 v_tmp;
        v_tmp = _mm256_permute2f128_ps(v_min, v_max, 0x12);
        _mm256_storeu_ps((float *)&mbr, v_tmp);
#else
        /* Scalar fallback: component-wise merge. */
        int i;
        for (i = 0; i < NDIMS; i++) {
            if (mbr.low[i] > node->mbrs[k].low[i])
                mbr.low[i] = node->mbrs[k].low[i];
            if (mbr.upp[i] < node->mbrs[k].upp[i])
                mbr.upp[i] = node->mbrs[k].upp[i];
        }
#endif
#endif
    }
    return mbr;
}
/* sum float vectors -----------------------------------------------------------
 * sum float vectors: out=data1.+data2
 * args   : float  *data1    I   input float array
 *          float  *data2    I   input float array
 *          int    n         I   number of input data
 *          float  *out      O   output float array
 * return : none
 * note   : AVX command is used if "AVX" is defined
 *-----------------------------------------------------------------------------*/
extern void sumvf(const float *data1, const float *data2, int n, float *out)
{
    int i;
#if !defined(AVX_ENABLE)
    for (i = 0; i < n; i++) out[i] = data1[i] + data2[i];
#else
    if (n < 8) {
        /* Too short to vectorize: plain scalar loop. */
        for (i = 0; i < n; i++) out[i] = data1[i] + data2[i];
    }
    else {
        const int vend = 8 * (n / 8);  /* end of the vectorizable prefix */
        __m256 va, vb;
        for (i = 0; i < vend; i += 8) {
            va = _mm256_loadu_ps(&data1[i]);
            vb = _mm256_loadu_ps(&data2[i]);
            _mm256_storeu_ps(&out[i], _mm256_add_ps(va, vb));
        }
        for (; i < n; i++) out[i] = data1[i] + data2[i];
    }
#endif
}
/*
 * Sums the _threads per-thread histograms stored in thist (each of
 * length _histdim, laid out back to back) into _hist, then normalizes
 * the result via normalizeHistogram_helper.
 */
void reduceHistogram_helper(float* thist, float* _hist, unsigned int _histx, unsigned int _histy, unsigned int _histz, unsigned int _histdim, unsigned int _threads)
{
    // Sum histograms
    // (Could be done using parallel reduction)
    unsigned int i = 0;
#ifdef USE_AVX1
    /* BUG FIX: the vector loop previously used (i < _histdim) as its
     * condition and then backed i up by 8 afterwards, so when _histdim
     * was not a multiple of 8 the last iteration read and wrote up to
     * 7 floats past the end of the buffers (and i underflowed for
     * _histdim == 0).  Only full 8-float chunks are processed here;
     * the scalar loop below handles the remainder. */
    for (; i + 8 <= _histdim; i += 8) {
        float *th = thist + i;
        __m256 h1 = _mm256_loadu_ps(th);
        for (unsigned int j = 1; j < _threads; ++j) {
            const __m256 h2 = _mm256_loadu_ps(th + j * _histdim);
            h1 = _mm256_add_ps(h1, h2);
        }
        _mm256_storeu_ps(_hist + i, h1);
    }
#endif
    // Do the rest (scalar tail, or the whole range without AVX)
    for (; i < _histdim; i += 1) {
        _hist[i] = thist[i];
        for (unsigned int j = 1; j < _threads; ++j) {
            _hist[i] += thist[j * _histdim + i];
        }
    }
    normalizeHistogram_helper(_hist, _histx, _histy, _histz);
}
/*
 * Vectorized elementwise sine: y[i] = sin(a[i]) for i in [0, n).
 * Eight floats per step via simd_sin8f, scalar sinf() for the n % 8
 * tail.  Parameters b, z and other_params belong to the common VML
 * kernel signature and are unused here.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    const unsigned int chunks = n >> 3;   /* full 8-float chunks  */
    const unsigned int tail = n & 7;      /* leftover elements    */
    const unsigned int base = n & (~7);   /* start of the tail    */
    unsigned int idx;

    for (idx = 0; idx < chunks; idx++) {
        v8sf src = _mm256_loadu_ps(a + 8 * idx);
        v8sf res = simd_sin8f(src);
        _mm256_storeu_ps(y + 8 * idx, res);
    }
    for (idx = 0; idx < tail; idx++) {
        y[idx + base] = sinf(a[idx + base]);
    }
}
/*
 * CSR sparse matrix-vector product y = A * x using AVX2 masked
 * gathers: each row is processed 8 non-zeros at a time, with a
 * partial mask for the final sub-8 chunk so no out-of-range x element
 * is gathered (the index/value arrays themselves are padded out to a
 * full vector, per the original comment).  sum8() horizontally
 * reduces the vector accumulator.
 */
void avx2_masked_csr_spmv(float *A, int32_t *nIdx, int32_t **indices, float *x, int32_t n, float *y)
{
    int32_t A_offset = 0;
    /* bitMasks[m] has the gather-enable sign bit set in the m lowest
     * lanes. BUG FIX: the sign-bit float was previously produced with
     * *(float*)&u, which violates strict aliasing; a union pun is the
     * well-defined C way to reinterpret the bit pattern. */
    __m256 bitMasks[9];
    union { unsigned u; float f; } pun;
    pun.u = 0x80000000;
    float v = pun.f;
    bitMasks[0] = _mm256_set_ps(0,0,0,0,0,0,0,0);
    bitMasks[1] = _mm256_set_ps(0,0,0,0,0,0,0,v);
    bitMasks[2] = _mm256_set_ps(0,0,0,0,0,0,v,v);
    bitMasks[3] = _mm256_set_ps(0,0,0,0,0,v,v,v);
    bitMasks[4] = _mm256_set_ps(0,0,0,0,v,v,v,v);
    bitMasks[5] = _mm256_set_ps(0,0,0,v,v,v,v,v);
    bitMasks[6] = _mm256_set_ps(0,0,v,v,v,v,v,v);
    bitMasks[7] = _mm256_set_ps(0,v,v,v,v,v,v,v);
    bitMasks[8] = _mm256_set_ps(v,v,v,v,v,v,v,v);
    const __m256 vZeros = _mm256_setzero_ps();
    for(int32_t i = 0; i < n; i++) {
        int32_t nElem = nIdx[i];
        float t = 0.0f;
        __m256 vT = _mm256_setzero_ps();
        int32_t k = 0;
        while(k < nElem) {
            int vl = ((k+8) < nElem) ? 8 : (nElem - k);  /* lanes in use */
            __m256 mask = bitMasks[vl];
            /* this is padded out */
            /* BUG FIX: the index rows are not guaranteed 32-byte
             * aligned, so use the unaligned load instead of the
             * aligned _mm256_load_si256. */
            __m256i vIdx = _mm256_loadu_si256((__m256i const*)&(indices[i][k]));
            __m256 vX = _mm256_mask_i32gather_ps(vZeros,(float const*)x,vIdx,mask,4);
            __m256 vA = _mm256_loadu_ps(&A[A_offset + k]);
            vT = _mm256_add_ps(vT, _mm256_mul_ps(vX,vA));
            k += vl;
        }
        t += sum8(vT);
        y[i] = t;
        A_offset += nElem;
    }
}
/* Derivative of tanh from its forward result:
 * deriv_res[i] = 1 - tanh_res[i]^2.  Scalar on macOS, AVX on Linux. */
void tanh_deriv (float *deriv_res, float *tanh_res, int dim) {
#ifdef __APPLE__
    for (int i = 0; i < dim; i++) {
        deriv_res[i] = 1 - tanh_res[i] * tanh_res[i];
    }
#elif __linux
    const int vec_end = dim - (dim % SIMD_WIDTH);
    const __m256 ones = _mm256_set1_ps(1.f);
    for (int i = 0; i < vec_end; i += SIMD_WIDTH) {
        const __m256 t = _mm256_loadu_ps(tanh_res + i);
        _mm256_storeu_ps(deriv_res + i, _mm256_sub_ps(ones, _mm256_mul_ps(t, t)));
    }
    for (int i = vec_end; i < dim; ++i) {
        deriv_res[i] = 1 - tanh_res[i] * tanh_res[i];
    }
#endif
}
IGL_INLINE void igl::svd3x3_avx( const Eigen::Matrix<T, 3*8, 3>& A, Eigen::Matrix<T, 3*8, 3> &U, Eigen::Matrix<T, 3*8, 1> &S, Eigen::Matrix<T, 3*8, 3>&V) { // this code assumes USE_AVX_IMPLEMENTATION is defined float Ashuffle[9][8], Ushuffle[9][8], Vshuffle[9][8], Sshuffle[3][8]; for (int i=0; i<3; i++) { for (int j=0; j<3; j++) { for (int k=0; k<8; k++) { Ashuffle[i + j*3][k] = A(i + 3*k, j); } } } #include "Singular_Value_Decomposition_Kernel_Declarations.hpp" ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_loadu_ps(Ashuffle[0]);)
int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize) { int i = 0, k; for (; i <= width - 8; i += 8) { const float* src = src0 + i; __m256 f, x0; __m256 s0 = _mm256_set1_ps(0.0f); for (k = 0; k < _ksize; k++, src += cn) { f = _mm256_set1_ps(_kx[k]); x0 = _mm256_loadu_ps(src); #if CV_FMA3 s0 = _mm256_fmadd_ps(x0, f, s0); #else s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); #endif } _mm256_storeu_ps(dst + i, s0); } _mm256_zeroupper(); return i; }
/* Derivative of the logistic sigmoid from its forward result:
 * deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]).
 * Scalar on macOS, AVX on Linux. */
void sigm_deriv (float *deriv_res, float *sigm_res, int dim) {
#ifdef __APPLE__
    for (int i = 0; i < dim; i++) {
        deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
    }
#elif __linux
    const int vec_end = dim - (dim % SIMD_WIDTH);
    const __m256 ones = _mm256_set1_ps(1.f);
    for (int i = 0; i < vec_end; i += SIMD_WIDTH) {
        const __m256 s = _mm256_loadu_ps(sigm_res + i);
        _mm256_storeu_ps(deriv_res + i, _mm256_mul_ps(s, _mm256_sub_ps(ones, s)));
    }
    for (int i = vec_end; i < dim; ++i) {
        deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
    }
#endif
}
__m256 atan2_256( const __m256& y, const __m256& x) { //! For convenience float a[8]; float b[8]; _mm256_storeu_ps(a, x); _mm256_storeu_ps(b, y); //! Compute the arc tangent a[0] = atan2(b[0], a[0]); a[1] = atan2(b[1], a[1]); a[2] = atan2(b[2], a[2]); a[3] = atan2(b[3], a[3]); a[4] = atan2(b[4], a[4]); a[5] = atan2(b[5], a[5]); a[6] = atan2(b[6], a[6]); a[7] = atan2(b[7], a[7]); //! Get the result return _mm256_loadu_ps(a); }
float Ashuffle[9][8], Ushuffle[9][8], Vshuffle[9][8], Sshuffle[3][8]; for (int i=0; i<3; i++) { for (int j=0; j<3; j++) { for (int k=0; k<8; k++) { Ashuffle[i + j*3][k] = A(i + 3*k, j); } } } #include "Singular_Value_Decomposition_Kernel_Declarations.hpp" ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_loadu_ps(Ashuffle[0]);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_loadu_ps(Ashuffle[1]);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_loadu_ps(Ashuffle[2]);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_loadu_ps(Ashuffle[3]);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_loadu_ps(Ashuffle[4]);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_loadu_ps(Ashuffle[5]);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_loadu_ps(Ashuffle[6]);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_loadu_ps(Ashuffle[7]);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_loadu_ps(Ashuffle[8]);) #include "Singular_Value_Decomposition_Main_Kernel_Body.hpp" ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[0],Vu11);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[1],Vu21);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[2],Vu31);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[3],Vu12);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[4],Vu22);)
/*
 * One unrolled block of the softmax numerator pass: computes
 * acc_i = e^(input_ptr[i * C_batch_size .. +8]) for i in
 * [0, T_SIZE), stores each result to the matching output slot, and
 * folds every accumulator into acc_sum (used later for the divide).
 * T_SIZE and C_batch_size are compile-time template/constant
 * parameters of the enclosing translation unit; T_SIZE may be at most
 * 15 here (acc15 is declared but never used).  input_ptr and
 * output_ptr are advanced past the processed block on return (they
 * are reference parameters).
 */
void softmax_compute_block(
    float* &input_ptr,
    float* &output_ptr,
    __m256 &acc_sum)
{
    // We are not using table of registers and unroll pragmas
    // due to compiler which have issues with register allocation
    // and needs special, obvious treatment. Template immediate
    // arguments matching will remove all conditions in this code.
    __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15;

    // Load inputs and perform e^x
    if (T_SIZE >= 1) acc0 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 0 * C_batch_size));
    if (T_SIZE >= 2) acc1 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 1 * C_batch_size));
    if (T_SIZE >= 3) acc2 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 2 * C_batch_size));
    if (T_SIZE >= 4) acc3 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 3 * C_batch_size));
    if (T_SIZE >= 5) acc4 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 4 * C_batch_size));
    if (T_SIZE >= 6) acc5 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 5 * C_batch_size));
    if (T_SIZE >= 7) acc6 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 6 * C_batch_size));
    if (T_SIZE >= 8) acc7 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 7 * C_batch_size));
    if (T_SIZE >= 9) acc8 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 8 * C_batch_size));
    if (T_SIZE >= 10) acc9 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 9 * C_batch_size));
    if (T_SIZE >= 11) acc10 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 10 * C_batch_size));
    if (T_SIZE >= 12) acc11 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 11 * C_batch_size));
    if (T_SIZE >= 13) acc12 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 12 * C_batch_size));
    if (T_SIZE >= 14) acc13 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 13 * C_batch_size));
    if (T_SIZE >= 15) acc14 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 14 * C_batch_size));

    // Store results.
    if (T_SIZE >= 1) _mm256_storeu_ps(output_ptr + 0 * C_batch_size, acc0);
    if (T_SIZE >= 2) _mm256_storeu_ps(output_ptr + 1 * C_batch_size, acc1);
    if (T_SIZE >= 3) _mm256_storeu_ps(output_ptr + 2 * C_batch_size, acc2);
    if (T_SIZE >= 4) _mm256_storeu_ps(output_ptr + 3 * C_batch_size, acc3);
    if (T_SIZE >= 5) _mm256_storeu_ps(output_ptr + 4 * C_batch_size, acc4);
    if (T_SIZE >= 6) _mm256_storeu_ps(output_ptr + 5 * C_batch_size, acc5);
    if (T_SIZE >= 7) _mm256_storeu_ps(output_ptr + 6 * C_batch_size, acc6);
    if (T_SIZE >= 8) _mm256_storeu_ps(output_ptr + 7 * C_batch_size, acc7);
    if (T_SIZE >= 9) _mm256_storeu_ps(output_ptr + 8 * C_batch_size, acc8);
    if (T_SIZE >= 10) _mm256_storeu_ps(output_ptr + 9 * C_batch_size, acc9);
    if (T_SIZE >= 11) _mm256_storeu_ps(output_ptr + 10 * C_batch_size, acc10);
    if (T_SIZE >= 12) _mm256_storeu_ps(output_ptr + 11 * C_batch_size, acc11);
    if (T_SIZE >= 13) _mm256_storeu_ps(output_ptr + 12 * C_batch_size, acc12);
    if (T_SIZE >= 14) _mm256_storeu_ps(output_ptr + 13 * C_batch_size, acc13);
    if (T_SIZE >= 15) _mm256_storeu_ps(output_ptr + 14 * C_batch_size, acc14);

    // Sum up accumulators.
    if (T_SIZE >= 1) acc_sum = _mm256_add_ps(acc0, acc_sum);
    if (T_SIZE >= 2) acc_sum = _mm256_add_ps(acc1, acc_sum);
    if (T_SIZE >= 3) acc_sum = _mm256_add_ps(acc2, acc_sum);
    if (T_SIZE >= 4) acc_sum = _mm256_add_ps(acc3, acc_sum);
    if (T_SIZE >= 5) acc_sum = _mm256_add_ps(acc4, acc_sum);
    if (T_SIZE >= 6) acc_sum = _mm256_add_ps(acc5, acc_sum);
    if (T_SIZE >= 7) acc_sum = _mm256_add_ps(acc6, acc_sum);
    if (T_SIZE >= 8) acc_sum = _mm256_add_ps(acc7, acc_sum);
    if (T_SIZE >= 9) acc_sum = _mm256_add_ps(acc8, acc_sum);
    if (T_SIZE >= 10) acc_sum = _mm256_add_ps(acc9, acc_sum);
    if (T_SIZE >= 11) acc_sum = _mm256_add_ps(acc10, acc_sum);
    if (T_SIZE >= 12) acc_sum = _mm256_add_ps(acc11, acc_sum);
    if (T_SIZE >= 13) acc_sum = _mm256_add_ps(acc12, acc_sum);
    if (T_SIZE >= 14) acc_sum = _mm256_add_ps(acc13, acc_sum);
    if (T_SIZE >= 15) acc_sum = _mm256_add_ps(acc14, acc_sum);

    /* Advance the (by-reference) cursors past the processed block. */
    input_ptr += C_batch_size*T_SIZE;
    output_ptr += C_batch_size*T_SIZE;
}
/*
 * Accumulate a partial-sum buffer into an accumulator:
 * accum[i] += partial[i] for i in [0, count).
 * partial may be arbitrarily aligned (unaligned loads / bounce buffer
 * are used for it); accum is brought to vector alignment by a scalar
 * peel loop so aligned loads/stores can be used on it.
 */
static void inline sse_sum_unaligned_F(int count, float *partial, float *accum) {
#if MANUAL_SSE
#if USE_AVX
    int i;
    __m256 v8in, v8acc;
    /* Peel scalar elements until accum is 32-byte aligned.
     * BUG FIX: the mask was previously 0x0f (16-byte alignment), but
     * _mm256_load_ps/_mm256_store_ps below require 32-byte alignment,
     * so an accum pointer that was 16- but not 32-byte aligned could
     * fault.  NOTE(review): the (long) cast assumes long is pointer
     * sized (true on LP64; not on Windows/LLP64). */
    for(i=0; ((long)&(accum[i]) & 0x1f) && (i<count);i++) {
        accum[i]+=partial[i];
    }
    PRAGMA_IVDEP
    for(;i<(count-7);i+=8) {
        v8in=_mm256_loadu_ps(&(partial[i]));
        v8acc=_mm256_load_ps(&(accum[i]));
        v8acc=_mm256_add_ps(v8acc, v8in);
        _mm256_store_ps(&(accum[i]), v8acc);
    }
    /* Scalar tail. */
    PRAGMA_IVDEP
    for(;i<count;i++) {
        accum[i]+=partial[i];
    }
#else
    int i;
    __m128 v4in, v4acc;
    /* Bounce buffer: partial is copied here so aligned SSE loads can
     * be used on it. */
    float *tmp=aligned_alloca(16*sizeof(*tmp));
    /* Peel until accum is 16-byte aligned (sufficient for SSE). */
    for(i=0; ((long)&(accum[i]) & 0x0f) && (i<count);i++) {
        accum[i]+=partial[i];
    }
    PRAGMA_IVDEP
    for(;i<(count-15);i+=16) {
        memcpy(tmp, &(partial[i]), 16*sizeof(*tmp));
        v4in=_mm_load_ps(tmp);
        v4acc=_mm_load_ps(&(accum[i]));
        v4acc=_mm_add_ps(v4acc, v4in);
        _mm_store_ps(&(accum[i]), v4acc);
        v4in=_mm_load_ps(&(tmp[4]));
        v4acc=_mm_load_ps(&(accum[i+4]));
        v4acc=_mm_add_ps(v4acc, v4in);
        _mm_store_ps(&(accum[i+4]), v4acc);
        v4in=_mm_load_ps(&(tmp[8]));
        v4acc=_mm_load_ps(&(accum[i+8]));
        v4acc=_mm_add_ps(v4acc, v4in);
        _mm_store_ps(&(accum[i+8]), v4acc);
        v4in=_mm_load_ps(&(tmp[12]));
        v4acc=_mm_load_ps(&(accum[i+12]));
        v4acc=_mm_add_ps(v4acc, v4in);
        _mm_store_ps(&(accum[i+12]), v4acc);
    }
    /* Scalar tail. */
    PRAGMA_IVDEP
    for(;i<count;i++) {
        accum[i]+=partial[i];
    }
#endif
#else
    fprintf(stderr, "**** MANUAL_SSE disabled in %s\n", __FUNCTION__);
    exit(-2);
#endif
}