extern "C" void product32x32_avxf(float *a, float *b, float *c, int n) { for(int i=0; i<n; i++) { __m256 t1 = _mm256_loadu_ps(&c[i*n + 0]); __m256 t2 = _mm256_loadu_ps(&c[i*n + 8]); __m256 t3 = _mm256_loadu_ps(&c[i*n + 16]); __m256 t4 = _mm256_loadu_ps(&c[i*n + 24]); for(int k=0; k<n; k++) { __m256 a1 = _mm256_set1_ps(a[k*n+i]); __m256 b1 = _mm256_loadu_ps(&b[k*n+0]); t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1)); __m256 b2 = _mm256_loadu_ps(&b[k*n+8]); t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2)); __m256 b3 = _mm256_loadu_ps(&b[k*n+16]); t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3)); __m256 b4 = _mm256_loadu_ps(&b[k*n+24]); t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4)); } _mm256_storeu_ps(&c[i*n + 0], t1); _mm256_storeu_ps(&c[i*n + 8], t2); _mm256_storeu_ps(&c[i*n + 16], t3); _mm256_storeu_ps(&c[i*n + 24], t4); } }
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params)
{
    VMLLONG i = 0;
    VMLLONG loop_count = (n) >> 5;
    VMLLONG remain_count = (n) & 0x1f;

    while (loop_count > 0) {
        __m256 av0 = _mm256_loadu_ps(a);
        __m256 av1 = _mm256_loadu_ps(a + 8);
        __m256 av2 = _mm256_loadu_ps(a + 16);
        __m256 av3 = _mm256_loadu_ps(a + 24);

        __m256 yv0 = exp256_ps(av0);
        __m256 yv1 = exp256_ps(av1);
        __m256 yv2 = exp256_ps(av2);
        __m256 yv3 = exp256_ps(av3);

        _mm256_storeu_ps(y, yv0);
        _mm256_storeu_ps(y + 8, yv1);
        _mm256_storeu_ps(y + 16, yv2);
        _mm256_storeu_ps(y + 24, yv3);

        a += 32;
        y += 32;
        loop_count--;
    }

    /* Scalar tail: a and y already point past the vectorized portion,
       so only the remaining (n & 0x1f) elements are processed here. */
    for (i = 0; i < remain_count; i++) {
        y[i] = expf(a[i]);
    }
}
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        S = src[0] + i;

        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;
        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }
        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}
void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n)
{
    ptrdiff_t i;
    ptrdiff_t off;
    for (i=0; i<=((n)-16); i+=16) {
        _mm256_storeu_ps(y+i, _mm256_loadu_ps(x+i));
        _mm256_storeu_ps(y+i+8, _mm256_loadu_ps(x+i+8));
    }
    off = (n) - ((n)%16);
    for (i=0; i<((n)%16); i++) {
        y[off+i] = x[off+i];
    }
}
void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n)
{
    ptrdiff_t i;
    ptrdiff_t off;
    __m256 YMM0 = _mm256_set_ps(c, c, c, c, c, c, c, c);
    for (i=0; i<=((n)-32); i+=32) {
        _mm256_storeu_ps((x)+i, YMM0);
        _mm256_storeu_ps((x)+i+8, YMM0);
        _mm256_storeu_ps((x)+i+16, YMM0);
        _mm256_storeu_ps((x)+i+24, YMM0);
    }
    off = (n) - ((n)%32);
    for (i=0; i<((n)%32); i++) {
        x[off+i] = c;
    }
}
float avx_dot_product(std::vector<float> &av, std::vector<float> &bv)
{
    assert(av.size() == bv.size());           /* the kernel assumes equal-length inputs */
    unsigned int niters = av.size() / 8;

    /* aligned_alloc requires the size to be a multiple of the alignment */
    size_t bytes = ((av.size() * sizeof(float) + 31) / 32) * 32;
    float *a = (float *) aligned_alloc(32, bytes);
    float *b = (float *) aligned_alloc(32, bytes);
    memcpy(a, &av[0], av.size() * sizeof(float));
    memcpy(b, &bv[0], bv.size() * sizeof(float));

    /* Get SIMD-vector pointers to the start of each vector */
    __m256 *ptrA = (__m256*) &a[0], *ptrB = (__m256*) &b[0];
    __m256 res = _mm256_set1_ps(0.0f);

    /* imm8 = 0xFF: one dot product per 128-bit lane, broadcast across that lane */
    for (unsigned int i = 0; i < niters; i++, ptrA++, ptrB++)
        res = _mm256_add_ps(_mm256_dp_ps(*ptrA, *ptrB, 0xFF), res);

    /* Get result back from the SIMD vector */
    float fres[8];
    _mm256_storeu_ps(fres, res);

    /* Scalar tail for the last size % 8 elements */
    unsigned int q = 8 * niters;
    for (unsigned int i = 0; i < av.size() % 8; i++)
        fres[0] += a[i+q] * b[i+q];

    free(a);
    free(b);
    return fres[0] + fres[4];                 /* one partial sum per 128-bit lane */
}
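/* Usage sketch (not part of the original source). With imm8 = 0xFF, _mm256_dp_ps produces one
   partial dot product per 128-bit lane, which is why avx_dot_product returns fres[0] + fres[4].
   The hypothetical check_avx_dot_product below cross-checks it against a plain scalar loop. */
void check_avx_dot_product()
{
    std::vector<float> av(37), bv(37);   /* odd length so the scalar tail is exercised */
    for (size_t i = 0; i < av.size(); i++) { av[i] = (float)i; bv[i] = 0.5f; }

    float scalar = 0.0f;
    for (size_t i = 0; i < av.size(); i++) scalar += av[i] * bv[i];

    float simd = avx_dot_product(av, bv);
    assert(fabsf(simd - scalar) <= 1e-3f * fabsf(scalar));
}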
void DoubleToComplex(double *srcI, double *srcQ, Complex *dst, const unsigned int len)
{
    __m256d avxR_D, avxI_D, avxX_D, avxY_D, avxA_D, avxB_D;
    __m128 avxA, avxB;
#if 1
    __m256 avxD;
#endif

    for (unsigned int i=0; i+4<=len; i+=4) {
        avxR_D = _mm256_loadu_pd(srcI + i);
        avxI_D = _mm256_loadu_pd(srcQ + i);
        avxX_D = _mm256_unpacklo_pd(avxR_D, avxI_D);   //swizzle
        avxY_D = _mm256_unpackhi_pd(avxR_D, avxI_D);
        avxA_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x20);
        avxB_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x31);
        avxA = _mm256_cvtpd_ps(avxA_D);                //double to float
        avxB = _mm256_cvtpd_ps(avxB_D);
#if 0
        avxD = _mm256_castps128_ps256(avxA);
        avxD = _mm256_insertf128_ps(avxD, avxB, 1);
        _mm256_storeu_ps((float*)(dst+i), avxD);
#else
        _mm_maskstore_ps((float*)(dst+i), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxA);
        _mm_maskstore_ps((float*)(dst+i+2), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxB);
#endif
    }

    for (unsigned int i=len-(len & 0x03); i<len; ++i) {
        dst[i].m_real = static_cast<float>(srcI[i]);
        dst[i].m_imag = static_cast<float>(srcQ[i]);
    }
}
void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n)
{
    ptrdiff_t i;
    __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
    __m256 YMM0, YMM1;
    for (i=0; i<=((n)-16); i+=16) {
        YMM0 = _mm256_loadu_ps(x+i);
        YMM1 = _mm256_loadu_ps(x+i+8);
        YMM0 = _mm256_add_ps(YMM0, YMM15);
        YMM1 = _mm256_add_ps(YMM1, YMM15);
        _mm256_storeu_ps(y+i, YMM0);
        _mm256_storeu_ps(y+i+8, YMM1);
    }
    for (; i<(n); i++) {
        y[i] = x[i] + c;
    }
}
void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n)
{
    ptrdiff_t i;
    __m256 YMM0, YMM1, YMM2, YMM3;
    for (i=0; i<=((n)-16); i+=16) {
        YMM0 = _mm256_loadu_ps(x+i);
        YMM1 = _mm256_loadu_ps(x+i+8);
        YMM2 = _mm256_loadu_ps(y+i);
        YMM3 = _mm256_loadu_ps(y+i+8);
        YMM2 = _mm256_mul_ps(YMM0, YMM2);
        YMM3 = _mm256_mul_ps(YMM1, YMM3);
        _mm256_storeu_ps(z+i, YMM2);
        _mm256_storeu_ps(z+i+8, YMM3);
    }
    for (; i<n; i++) {
        z[i] = x[i] * y[i];
    }
}
static void _mm256_print_ps(__m256 x)
{
    float *data = new float[8];
    _mm256_storeu_ps(&data[0], x);
    for (size_t i = 0; i < 8; i++) {
        std::cout << "Data[" << i << "]: " << data[i] << std::endl;
    }
    delete[] data;
}
void MaddMemcpy(float* arg1, float* arg2, float* arg3, int size1, int size2, float* result)
{
    memcpy(arg2, arg1, size1);
    memcpy(arg3, arg1, size2);
    /* note: _mm256_load_ps requires arg1..arg3 to be 32-byte aligned */
    __m256 vec1 = _mm256_load_ps(arg1);
    __m256 vec2 = _mm256_load_ps(arg2);
    __m256 vec3 = _mm256_load_ps(arg3);
    __m256 res = _mm256_fmadd_ps(vec1, vec2, vec3);
    _mm256_storeu_ps(result, res);
}
void UnalignedAvxMult(float* d, float const* a, float const* b)
{
    for (int i = 0; i < gNumFloats; i += 8) {
        __m256 v1 = _mm256_loadu_ps(&a[i]);
        __m256 v2 = _mm256_loadu_ps(&b[i]);
        __m256 r = _mm256_mul_ps(v1, v2);
        _mm256_storeu_ps(&d[i], r);
    }
}
extern "C" void product64x64_avx(float *a, float *b, float *c, int n) { for(int i=0; i<n; i++) { __m256 t1 = _mm256_loadu_ps(&c[i*n + 0]); __m256 t2 = _mm256_loadu_ps(&c[i*n + 8]); __m256 t3 = _mm256_loadu_ps(&c[i*n + 16]); __m256 t4 = _mm256_loadu_ps(&c[i*n + 24]); __m256 t5 = _mm256_loadu_ps(&c[i*n + 32]); __m256 t6 = _mm256_loadu_ps(&c[i*n + 40]); __m256 t7 = _mm256_loadu_ps(&c[i*n + 48]); __m256 t8 = _mm256_loadu_ps(&c[i*n + 56]); for(int k=0; k<n; k++) { __m256 a1 = _mm256_set1_ps(a[k*n+i]); __m256 b1 = _mm256_loadu_ps(&b[k*n+0]); t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1)); __m256 b2 = _mm256_loadu_ps(&b[k*n+8]); t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2)); __m256 b3 = _mm256_loadu_ps(&b[k*n+16]); t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3)); __m256 b4 = _mm256_loadu_ps(&b[k*n+24]); t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4)); __m256 b5 = _mm256_loadu_ps(&b[k*n+32]); t5 = _mm256_sub_ps(t5,_mm256_mul_ps(a1,b5)); __m256 b6 = _mm256_loadu_ps(&b[k*n+40]); t6 = _mm256_sub_ps(t6,_mm256_mul_ps(a1,b6)); __m256 b7 = _mm256_loadu_ps(&b[k*n+48]); t7 = _mm256_sub_ps(t7,_mm256_mul_ps(a1,b7)); __m256 b8 = _mm256_loadu_ps(&b[k*n+56]); t8 = _mm256_sub_ps(t8,_mm256_mul_ps(a1,b8)); } _mm256_storeu_ps(&c[i*n + 0], t1); _mm256_storeu_ps(&c[i*n + 8], t2); _mm256_storeu_ps(&c[i*n + 16], t3); _mm256_storeu_ps(&c[i*n + 24], t4); _mm256_storeu_ps(&c[i*n + 32], t5); _mm256_storeu_ps(&c[i*n + 40], t6); _mm256_storeu_ps(&c[i*n + 48], t7); _mm256_storeu_ps(&c[i*n + 56], t8); } }
void convertCAVX(int num, uint8_t *in, float *out)
{
    int i;
    __m256 sub = _mm256_set1_ps(128.0f);
    __m256 mul = _mm256_set1_ps(1/128.0f);

    /* assumes num is a multiple of 8 */
    for(i=0; i<num; i+=8){
        /* load only the 8 bytes that _mm256_cvtepu8_epi32 consumes,
           so the last iteration does not read past the end of in */
        __m128i val = _mm_loadl_epi64((__m128i *)(in + i));
        __m256i ints = _mm256_cvtepu8_epi32(val);
        __m256 cvtd = _mm256_cvtepi32_ps(ints);
        __m256 res = _mm256_mul_ps(_mm256_sub_ps(cvtd, sub), mul);
        _mm256_storeu_ps(out + i, res);
    }
}
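/* Scalar reference for convertCAVX (a sketch, not part of the original source): each byte is
   mapped from [0, 255] to roughly [-1, 1) via (x - 128) / 128, which is what the vector code
   does eight samples at a time. convertC_ref is a hypothetical helper for comparison only. */
void convertC_ref(int num, const uint8_t *in, float *out)
{
    for (int i = 0; i < num; i++)
        out[i] = (in[i] - 128.0f) * (1.0f / 128.0f);
}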
__m256 atan2_256(const __m256& y, const __m256& x)
{
    //! For convenience: there is no single AVX instruction for atan2, so fall back to scalar
    float a[8];
    float b[8];
    _mm256_storeu_ps(a, x);
    _mm256_storeu_ps(b, y);

    //! Compute the arc tangent (atan2f avoids a float->double->float round trip)
    for (int i = 0; i < 8; i++)
        a[i] = atan2f(b[i], a[i]);

    //! Get the result
    return _mm256_loadu_ps(a);
}
/* Adjust MBR to fit all child MBRs */
inline void adjustMbrArraySTRNode(ArraySTRNode nodes[], ulong_t cur)
{
    ArraySTRNode *node, *child;
    ulong_t k;

    node = &nodes[cur];
    child = &nodes[node->pos];

    /* enlarge mbr to include all children's mbr */
#ifdef ENABLE_SSE_ADJUST
    {
        __m128 v_nlow = _mm_load_ps(child[0].mbr.low);
        __m128 v_nupp = _mm_load_ps(child[0].mbr.upp);
        for (k = 1; k < node->len; k++) {
            v_nlow = _mm_min_ps(v_nlow, _mm_load_ps(child[k].mbr.low));
            v_nupp = _mm_max_ps(v_nupp, _mm_load_ps(child[k].mbr.upp));
        }
        _mm_store_ps(node->mbr.low, v_nlow);
        _mm_store_ps(node->mbr.upp, v_nupp);
    }
#else
#ifdef ENABLE_AVX_TEST1
    {
        __m256 v_nmbr = _mm256_loadu_ps((float *)&child[0].mbr);
        for (k = 1; k < node->len; k++) {
            __m256 v_cmbr = _mm256_loadu_ps((float *)&child[k].mbr);
            __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
            __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
            v_nmbr = _mm256_permute2f128_ps(v_min, v_max, 0x12);
        }
        _mm256_storeu_ps((float *)&node->mbr, v_nmbr);
    }
#else
    /* copy first child's mbr */
    node->mbr = child[0].mbr;
    for (k = 1; k < node->len; k++) {
        int i;
        for (i = 0; i < NDIMS; i++) {
            if (node->mbr.low[i] > child[k].mbr.low[i])
                node->mbr.low[i] = child[k].mbr.low[i];
            if (node->mbr.upp[i] < child[k].mbr.upp[i])
                node->mbr.upp[i] = child[k].mbr.upp[i];
        }
    }
#endif
#endif
}
void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n)
{
    ptrdiff_t i;
    __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
    __m256 YMM0, YMM1, YMM2, YMM3;
    for (i=0; i<=((n)-8); i+=8) {
        YMM0 = _mm256_loadu_ps(y+i);
        YMM1 = _mm256_loadu_ps(x+i);
        YMM2 = _mm256_mul_ps(YMM0, YMM15);
        YMM3 = _mm256_add_ps(YMM1, YMM2);
        _mm256_storeu_ps(z+i, YMM3);
    }
    for (; i<(n); i++) {
        z[i] = x[i] + y[i] * c;
    }
}
__attribute__((noinline)) float dot256fma(float *x1, float *x2, size_t len)
{
    assert(len % 8 == 0);
    __m256 sum = _mm256_setzero_ps();
    if (len > 7) {
        size_t limit = len - 7;
        for (size_t i = 0; i < limit; i += 8) {
            __m256 v1 = _mm256_loadu_ps(x1 + i);
            __m256 v2 = _mm256_loadu_ps(x2 + i);
            sum = _mm256_fmadd_ps(v1, v2, sum);
        }
    }
    float buffer[8];
    _mm256_storeu_ps(buffer, sum);
    return buffer[0] + buffer[1] + buffer[2] + buffer[3] +
           buffer[4] + buffer[5] + buffer[6] + buffer[7];
}
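/* An alternative horizontal reduction (a sketch, not from the original code): instead of
   spilling the accumulator to a buffer and adding eight scalars, the lanes can be folded with
   shuffles. hsum256_ps is a hypothetical helper; dot256fma could return hsum256_ps(sum). */
static inline float hsum256_ps(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);
    __m128 hi = _mm256_extractf128_ps(v, 1);
    lo = _mm_add_ps(lo, hi);                         /* 8 partial sums -> 4 */
    lo = _mm_add_ps(lo, _mm_movehl_ps(lo, lo));      /* 4 -> 2 */
    lo = _mm_add_ss(lo, _mm_shuffle_ps(lo, lo, 1));  /* 2 -> 1 */
    return _mm_cvtss_f32(lo);
}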
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params)
{
    unsigned int m = n >> 3;
    unsigned int j;
    unsigned int k = n & 7;
    unsigned int l = n & (~7);

    for (j = 0; j < m; j++) {
        v8sf src = _mm256_loadu_ps(a + 8 * j);
        v8sf tem = simd_sin8f(src);
        _mm256_storeu_ps(y + 8 * j, tem);
    }

    for (j = 0; j < k; j++) {
        y[j + l] = sinf(a[j + l]);
    }
}
int main(void)
{
    float out[8];
    __m256 a = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 b = _mm256_setr_ps(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
    __m256 dst = _mm256_sub_ps(a, b);
    _mm256_storeu_ps(out, dst);
    for (int i = 0; i < sizeof(out)/sizeof(out[0]); i++)
        printf("out[%d]=%5.1f\n", i, out[i]);
    return 0;
}
//this method is untested as of right now....
inline void set_union(Set<bitset> *A_in, Set<bitset> *B_in)
{
    if (A_in->number_of_bytes > 0 && B_in->number_of_bytes > 0) {
        const uint64_t *a_index = (uint64_t*) A_in->data;
        const uint64_t *b_index = (uint64_t*) B_in->data;

        uint64_t* A = (uint64_t*)(A_in->data + sizeof(uint64_t));
        uint64_t* B = (uint64_t*)(B_in->data + sizeof(uint64_t));
        const size_t s_a = ((A_in->number_of_bytes - sizeof(uint64_t)) / sizeof(uint64_t));
        const size_t s_b = ((B_in->number_of_bytes - sizeof(uint64_t)) / sizeof(uint64_t));

        const bool a_big = a_index[0] > b_index[0];
        assert(a_index[0] <= b_index[0]);
        const uint64_t start_index = (a_big) ? a_index[0] : b_index[0];
        const uint64_t a_start_index = (a_big) ? 0 : (b_index[0] - a_index[0]);
        const uint64_t b_start_index = (a_big) ? (a_index[0] - b_index[0]) : 0;

        const uint64_t end_index = ((a_index[0] + s_a) > (b_index[0] + s_b)) ? (b_index[0] + s_b) : (a_index[0] + s_a);
        const uint64_t total_size = (start_index > end_index) ? 0 : (end_index - start_index);

        //16 uint16_ts
        //8 ints
        //4 longs
        size_t i = 0;
        A += a_start_index;
        B += b_start_index;

#if VECTORIZE == 1
        for (; (i + 3) < total_size; i += 4, A += 4, B += 4) {
            const __m256 a1 = _mm256_loadu_ps((const float*)A);
            const __m256 a2 = _mm256_loadu_ps((const float*)B);
            const __m256 r = _mm256_or_ps(a2, a1);
            _mm256_storeu_ps((float*)A, r);
        }
#endif

        for (; i < total_size; i++, A++, B++) {
            *A |= *B;
        }
    }
}
void elem_mul (float *result, float *a, float *b, int dim)
{
#ifdef __linux
    int residual = dim % SIMD_WIDTH;
    int stopSIMD = dim - residual;
    __m256 vec_a, vec_b, vec_res;

    for (int i=0; i<stopSIMD; i+=SIMD_WIDTH) {
        vec_a = _mm256_loadu_ps(a + i);
        vec_b = _mm256_loadu_ps(b + i);
        vec_res = _mm256_loadu_ps(result + i);

        vec_a = _mm256_mul_ps(vec_a, vec_b);
        vec_res = _mm256_add_ps(vec_res, vec_a);
        _mm256_storeu_ps(result + i, vec_res);
    }

    for (int i=stopSIMD; i<dim; ++i) {
        result[i] += a[i] * b[i];
    }
#endif
}
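/* On FMA3-capable targets the separate multiply and add in elem_mul can be fused. This is a
   sketch under the assumption that the compiler is invoked with FMA support (e.g. -mfma) and
   that SIMD_WIDTH is 8; elem_mul_fma is a hypothetical variant, not part of the original source. */
#ifdef __FMA__
void elem_mul_fma(float *result, const float *a, const float *b, int dim)
{
    int stopSIMD = dim - (dim % 8);
    for (int i = 0; i < stopSIMD; i += 8) {
        __m256 vec_res = _mm256_loadu_ps(result + i);
        /* result += a * b, one fused multiply-add per 8 floats */
        vec_res = _mm256_fmadd_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i), vec_res);
        _mm256_storeu_ps(result + i, vec_res);
    }
    for (int i = stopSIMD; i < dim; ++i)
        result[i] += a[i] * b[i];
}
#endif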
void _Run(OutputPixelType aaOutput[ciHeight][ciWidth], InputPixelType_1 aaInput1[ciHeight][ciWidth], InputPixelType_2 aaInput2[ciHeight][ciWidth])
{
    for (int iY = 0; iY < ciHeight; ++iY) {
        _mm256_zeroall();

        OutputPixelType *pOutput = aaOutput[iY];
        InputPixelType_1 *pInput1 = aaInput1[iY];
        InputPixelType_2 *pInput2 = aaInput2[iY];

        for (int iX = 0; iX < ciWidth; iX += VectorWidth) {
            __m256 mmIn1 = _mm256_loadu_ps( pInput1 + iX );
            __m256 mmIn2 = _mm256_loadu_ps( pInput2 + iX );
            _mm256_storeu_ps( pOutput + iX, _mm256_add_ps(mmIn1, mmIn2) );
        }

        _mm256_zeroupper();
    }
}
void static avx_test (void)
{
    int i;
    union256 u, s1, s2;
    int source1[8] = {34545, 95567, 23443, 5675, 2323, 67, 2345, 45667};
    int source2[8] = {674, 57897, 93459, 45624, 54674, 1237, 67436, 79608};
    int d[8];
    int e[8];

    s1.x = _mm256_loadu_ps ((float *)source1);
    s2.x = _mm256_loadu_ps ((float *)source2);
    u.x = _mm256_andnot_ps (s1.x, s2.x);

    _mm256_storeu_ps ((float *)d, u.x);

    for (i = 0; i < 8; i++)
        e[i] = (~source1[i]) & source2[i];

    if (checkVi (d, e, 8))
        abort ();
}
void tanh_deriv (float *deriv_res, float *tanh_res, int dim)
{
#ifdef __APPLE__
    for (int i=0; i<dim; i++) {
        deriv_res[i] = 1 - tanh_res[i] * tanh_res[i];
    }
#elif __linux
    int residual = dim % SIMD_WIDTH;
    int stopSIMD = dim - residual;
    __m256 vec_deriv, vec_tanh;
    __m256 vec_one = _mm256_set1_ps(1.f);

    for (int i=0; i<stopSIMD; i+=SIMD_WIDTH) {
        vec_tanh = _mm256_loadu_ps(tanh_res + i);
        vec_deriv = _mm256_sub_ps(vec_one, _mm256_mul_ps(vec_tanh, vec_tanh));
        _mm256_storeu_ps(deriv_res + i, vec_deriv);
    }

    for (int i=stopSIMD; i<dim; ++i) {
        deriv_res[i] = 1 - tanh_res[i] * tanh_res[i];
    }
#endif
}
inline Mbr getMbrRTreeNode(RTreeNode *node)
{
    Mbr mbr;
    int k;

    mbr = node->mbrs[0];
    for (k = 1; k < node->nchilds; k++) {
#ifdef ENABLE_SSE_TEST1
        __m128 v_nlow = _mm_load_ps(mbr.low);
        __m128 v_nupp = _mm_load_ps(mbr.upp);
        __m128 v_clow = _mm_load_ps(node->mbrs[k].low);
        __m128 v_cupp = _mm_load_ps(node->mbrs[k].upp);
        /* accumulate into the local mbr (not into the node), so the running
           min/max carries over to the next iteration and is what gets returned */
        _mm_store_ps(mbr.low, _mm_min_ps(v_nlow, v_clow));
        _mm_store_ps(mbr.upp, _mm_max_ps(v_nupp, v_cupp));
#else
#ifdef ENABLE_AVX_TEST1
        __m256 v_nmbr = _mm256_loadu_ps((float *)&mbr);
        __m256 v_cmbr = _mm256_loadu_ps((float *)&node->mbrs[k]);
        __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
        __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
        __m256 v_tmp;
        v_tmp = _mm256_permute2f128_ps(v_min, v_max, 0x12);
        _mm256_storeu_ps((float *)&mbr, v_tmp);
#else
        int i;
        for (i = 0; i < NDIMS; i++) {
            if (mbr.low[i] > node->mbrs[k].low[i])
                mbr.low[i] = node->mbrs[k].low[i];
            if (mbr.upp[i] < node->mbrs[k].upp[i])
                mbr.upp[i] = node->mbrs[k].upp[i];
        }
#endif
#endif
    }
    return mbr;
}
void sigm_deriv (float *deriv_res, float *sigm_res, int dim)
{
#ifdef __APPLE__
    for (int i=0; i<dim; i++) {
        deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
    }
#elif __linux
    int residual = dim % SIMD_WIDTH;
    int stopSIMD = dim - residual;
    __m256 vec_deriv, vec_sigm;
    __m256 vec_one = _mm256_set1_ps(1.f);

    for (int i=0; i<stopSIMD; i+=SIMD_WIDTH) {
        vec_sigm = _mm256_loadu_ps(sigm_res + i);
        vec_deriv = _mm256_mul_ps(vec_sigm, _mm256_sub_ps(vec_one, vec_sigm));
        _mm256_storeu_ps(deriv_res + i, vec_deriv);
    }

    for (int i=stopSIMD; i<dim; ++i) {
        deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
    }
#endif
}
/* sum float vectors -----------------------------------------------------------
 * sum float vectors: out=data1.+data2
 * args   : float  *data1    I   input float array
 *          float  *data2    I   input float array
 *          int    n         I   number of input data
 *          float  *out      O   output float array
 * return : none
 * note   : AVX intrinsics are used if "AVX_ENABLE" is defined
 *-----------------------------------------------------------------------------*/
extern void sumvf(const float *data1, const float *data2, int n, float *out)
{
    int i;
#if !defined(AVX_ENABLE)
    for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
#else
    int m=n/8;
    __m256 xmm1,xmm2,xmm3;

    if (n<8) {
        for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
    }
    else {
        for (i=0;i<8*m;i+=8) {
            xmm1=_mm256_loadu_ps(&data1[i]);
            xmm2=_mm256_loadu_ps(&data2[i]);
            xmm3=_mm256_add_ps(xmm1,xmm2);
            _mm256_storeu_ps(&out[i],xmm3);
        }
        for (;i<n;i++) out[i]=data1[i]+data2[i];
    }
#endif
}
int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize)
{
    int i = 0, k;
    for (; i <= width - 8; i += 8)
    {
        const float* src = src0 + i;
        __m256 f, x0;
        __m256 s0 = _mm256_set1_ps(0.0f);
        for (k = 0; k < _ksize; k++, src += cn)
        {
            f = _mm256_set1_ps(_kx[k]);
            x0 = _mm256_loadu_ps(src);
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
    }
    _mm256_zeroupper();
    return i;
}
void reduceHistogram_helper(float* thist, float* _hist, unsigned int _histx, unsigned int _histy, unsigned int _histz, unsigned int _histdim, unsigned int _threads)
{
    // Sum histograms
    // (Could be done using parallel reduction)
    unsigned int i=0;
#ifdef USE_AVX1
    // Only process full 8-wide chunks; the previous "i < _histdim" bound could
    // read and write past the end of the histograms when _histdim % 8 != 0.
    for( ; i+8<=_histdim; i+=8 )
    {
        float *th = thist+i;
        __m256 h1 = _mm256_loadu_ps( th );
        for( unsigned int j=1; j<_threads; ++j )
        {
            const __m256 h2 = _mm256_loadu_ps( th+j*_histdim );
            h1 = _mm256_add_ps( h1, h2 );
        }
        _mm256_storeu_ps( _hist+i, h1 );
    }
#endif

    // Do the rest
    for( ; i<_histdim; i+=1 )
    {
        _hist[i] = thist[i];
        for( unsigned int j=1; j<_threads; ++j )
        {
            _hist[i] += thist[j*_histdim+i];
        }
    }

    normalizeHistogram_helper(_hist, _histx, _histy, _histz);
}