Example #1
File: main.cpp Project: CCJY/coliru
extern "C" void product32x32_avxf(float *a, float *b, float *c, int n) 
{
	for(int i=0; i<n; i++) {	
		__m256 t1 = _mm256_loadu_ps(&c[i*n +  0]);
		__m256 t2 = _mm256_loadu_ps(&c[i*n +  8]);
		__m256 t3 = _mm256_loadu_ps(&c[i*n + 16]);
		__m256 t4 = _mm256_loadu_ps(&c[i*n + 24]);		
		for(int k=0; k<n; k++) {
			__m256 a1 = _mm256_set1_ps(a[k*n+i]);
			
			__m256 b1 = _mm256_loadu_ps(&b[k*n+0]);
			t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1));
			
			__m256 b2 = _mm256_loadu_ps(&b[k*n+8]);
			t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2));

			__m256 b3 = _mm256_loadu_ps(&b[k*n+16]);
			t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3));

			__m256 b4 = _mm256_loadu_ps(&b[k*n+24]);
			t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4));			
		}
		_mm256_storeu_ps(&c[i*n +  0], t1);
		_mm256_storeu_ps(&c[i*n +  8], t2);
		_mm256_storeu_ps(&c[i*n + 16], t3);
		_mm256_storeu_ps(&c[i*n + 24], t4);
	}
}
Example #2
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
  VMLLONG i=0;
  VMLLONG loop_count=(n) >> 5;
  VMLLONG remain_count=(n) & 0x1f;

  while(loop_count>0){
    __m256 av0=_mm256_loadu_ps(a);
    __m256 av1=_mm256_loadu_ps(a+8);
    __m256 av2=_mm256_loadu_ps(a+16);
    __m256 av3=_mm256_loadu_ps(a+24);


    __m256 yv0=exp256_ps(av0);
    __m256 yv1=exp256_ps(av1);
    __m256 yv2=exp256_ps(av2);
    __m256 yv3=exp256_ps(av3);

    _mm256_storeu_ps(y, yv0);
    _mm256_storeu_ps(y+8, yv1);
    _mm256_storeu_ps(y+16, yv2);
    _mm256_storeu_ps(y+24, yv3);

    a+=32;
    b+=32;
    y+=32;
    loop_count--;
  }

  /* only the remaining n & 0x1f elements are left; a and y already point past the vectorized part */
  for(i=0; i<remain_count; i++){
    y[i]=expf(a[i]);
  }
}
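exp256_ps is not defined in this snippet; it is presumably an 8-lane vectorized expf (an avx_mathfun-style routine). As a rough, hypothetical stand-in to make the kernel self-contained, one could evaluate it lane by lane; this is only a sketch, not the fast SIMD exp the original project uses:

#include <immintrin.h>
#include <math.h>

/* Hypothetical scalar fallback for exp256_ps (assumption, not the original). */
static inline __m256 exp256_ps(__m256 x) {
    float tmp[8];
    _mm256_storeu_ps(tmp, x);      /* spill the 8 lanes */
    for (int i = 0; i < 8; i++)
        tmp[i] = expf(tmp[i]);     /* scalar expf per lane */
    return _mm256_loadu_ps(tmp);   /* repack into a __m256 */
}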
Example #3
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        S = src[0] + i;

        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;

        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }

        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}
Example #4
void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n) {
  ptrdiff_t i;
  ptrdiff_t off;
  for (i=0; i<=((n)-16); i+=16) {
    _mm256_storeu_ps(y+i, _mm256_loadu_ps(x+i));
    _mm256_storeu_ps(y+i+8, _mm256_loadu_ps(x+i+8));
  }
  off = (n) - ((n)%16);
  for (i=0; i<((n)%16); i++) {
    y[off+i] = x[off+i];
  }
}
Example #5
void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) {
  ptrdiff_t i;
  ptrdiff_t off;
  __m256 YMM0 = _mm256_set_ps(c, c, c, c, c, c, c, c);
  for (i=0; i<=((n)-32); i+=32) {
    _mm256_storeu_ps((x)+i  , YMM0);
    _mm256_storeu_ps((x)+i+8, YMM0);
    _mm256_storeu_ps((x)+i+16, YMM0);
    _mm256_storeu_ps((x)+i+24, YMM0);
  }
  off = (n) - ((n)%32);
  for (i=0; i<((n)%32); i++) {
    x[off+i] = c;
  }
}
Example #6
    float avx_dot_product(std::vector<float> &av, std::vector<float> &bv)
    {

      /* Get SIMD-vector pointers to the start of each vector */
      unsigned int niters = av.size() / 8;

      float *a = (float *) aligned_alloc(32, av.size()*sizeof(float));
      float *b = (float *) aligned_alloc(32, av.size()*sizeof(float));
      memcpy(a,&av[0],av.size()*sizeof(float));
      memcpy(b,&bv[0],bv.size()*sizeof(float));

      __m256 *ptrA = (__m256*) &a[0], *ptrB = (__m256*) &b[0];
      __m256 res = _mm256_set1_ps(0.0);

      for (unsigned int i = 0; i < niters; i++, ptrA++,ptrB++)
        res = _mm256_add_ps(_mm256_dp_ps(*ptrA, *ptrB, 255), res);

      /* Get result back from the SIMD vector */
      float fres[8];
      _mm256_storeu_ps (fres, res);
      int q = 8 * niters;

      for (unsigned int i = 0; i < av.size() % 8; i++)
        fres[0] += (a[i+q]*b[i+q]);

      free(a);
      free(b);

      return fres[0] + fres[4];
    }
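A minimal, hypothetical driver for the routine above (assuming it is in scope together with <immintrin.h>, <cstdlib> and <cstring> for its intrinsics, aligned_alloc and memcpy), comparing the result against a plain scalar dot product:

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> av, bv;
    for (int i = 0; i < 19; i++) {            // deliberately not a multiple of 8
        av.push_back(0.5f * i);
        bv.push_back(2.0f - 0.1f * i);
    }

    float scalar = 0.0f;
    for (size_t i = 0; i < av.size(); i++)
        scalar += av[i] * bv[i];

    std::printf("avx    = %f\n", avx_dot_product(av, bv));
    std::printf("scalar = %f\n", scalar);
    return 0;
}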
Example #7
void DoubleToComplex(double *srcI, double *srcQ, Complex *dst, const unsigned int len)
{
    __m256d avxR_D, avxI_D, avxX_D, avxY_D, avxA_D, avxB_D;
    __m128 avxA, avxB;
#if 1
    __m256 avxD;
#endif
    for (unsigned int i=0; i+4<=len; i+=4) {
        avxR_D = _mm256_loadu_pd(srcI + i);
        avxI_D = _mm256_loadu_pd(srcQ + i);
        avxX_D = _mm256_unpacklo_pd(avxR_D, avxI_D); //swizzle
        avxY_D = _mm256_unpackhi_pd(avxR_D, avxI_D);
        avxA_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x20);
        avxB_D = _mm256_permute2f128_pd(avxX_D, avxY_D, 0x31);
        avxA = _mm256_cvtpd_ps(avxA_D); //double to float
        avxB = _mm256_cvtpd_ps(avxB_D);
#if 0
        avxD = _mm256_castps128_ps256(avxA); 
        avxD = _mm256_insertf128_ps(avxD, avxB, 1);
        _mm256_storeu_ps((float*)(dst+i), avxD);
#else
        _mm_maskstore_ps((float*)(dst+i), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxA);
        _mm_maskstore_ps((float*)(dst+i+2), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1), avxB);
#endif
    }

    for (unsigned int i=len-(len & 0x03); i<len; ++i) {
        dst[i].m_real = static_cast<float>(srcI[i]);
        dst[i].m_imag = static_cast<float>(srcQ[i]);
    }
}
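The snippet relies on a Complex type and a SET_1 mask constant defined elsewhere in the project. The definitions below are assumptions inferred from how they are used above (interleaved float real/imag pairs, and an _mm_maskstore_ps mask with every lane enabled); the original declarations may differ:

typedef struct Complex {
    float m_real;   /* written by dst[i].m_real above */
    float m_imag;   /* written by dst[i].m_imag above */
} Complex;

/* _mm_maskstore_ps stores a lane when the sign bit of the matching 32-bit
 * mask element is set, so an all-ones element enables the store. */
#define SET_1 (-1)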
Example #8
void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
  __m256 YMM0, YMM1;
  for (i=0; i<=((n)-16); i+=16) {
    YMM0 = _mm256_loadu_ps(x+i);
    YMM1 = _mm256_loadu_ps(x+i+8);
    YMM0 = _mm256_add_ps(YMM0, YMM15);
    YMM1 = _mm256_add_ps(YMM1, YMM15);
    _mm256_storeu_ps(y+i, YMM0);
    _mm256_storeu_ps(y+i+8, YMM1);
  }
  for (; i<(n); i++) {
    y[i] = x[i] + c;
  }
}
Example #9
void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256 YMM0, YMM1, YMM2, YMM3;
  for (i=0; i<=((n)-16); i+=16) {
    YMM0 = _mm256_loadu_ps(x+i);
    YMM1 = _mm256_loadu_ps(x+i+8);
    YMM2 = _mm256_loadu_ps(y+i);
    YMM3 = _mm256_loadu_ps(y+i+8);
    YMM2 = _mm256_mul_ps(YMM0, YMM2);
    YMM3 = _mm256_mul_ps(YMM1, YMM3);
    _mm256_storeu_ps(z+i, YMM2);
    _mm256_storeu_ps(z+i+8, YMM3);
  }
  for (; i<n; i++) {
    z[i] = x[i] * y[i];
  }
}
Example #10
static void _mm256_print_ps(__m256 x) {
    float data[8];
    _mm256_storeu_ps(data, x);
    for (size_t i = 0; i < 8; i++) {
        std::cout << "Data[" << i << "]: " << data[i] << std::endl;
    }
}
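A quick, hypothetical call to the helper:

_mm256_print_ps(_mm256_setr_ps(0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f));  // prints Data[0] .. Data[7]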
Example #11
void MaddMemcpy(float* arg1, float* arg2, float* arg3, int size1, int size2, float* result) {
    memcpy(arg2, arg1, size1);
    memcpy(arg3, arg1, size2);
    __m256 vec1 = _mm256_load_ps(arg1);
    __m256 vec2 = _mm256_load_ps(arg2);
    __m256 vec3 = _mm256_load_ps(arg3);
    __m256 res  = _mm256_fmadd_ps(vec1, vec2, vec3);
    _mm256_storeu_ps(result, res);
}
Example #12
void UnalignedAvxMult(float* d, float const* a, float const* b)
{
	for(int i = 0; i < gNumFloats; i += 8)
	{
		__m256 v1 = _mm256_loadu_ps(&a[i]);
		__m256 v2 = _mm256_loadu_ps(&b[i]);
		__m256 r = _mm256_mul_ps(v1, v2);
		_mm256_storeu_ps(&d[i], r);
	}
}
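gNumFloats is a global the snippet expects to find elsewhere. A minimal, hypothetical driver (the constant and buffer sizes here are invented for illustration; gNumFloats must be visible before the function above is compiled):

#include <vector>

const int gNumFloats = 1024;    // assumed: a multiple of 8

int main() {
    std::vector<float> a(gNumFloats, 1.5f), b(gNumFloats, 2.0f), d(gNumFloats);
    UnalignedAvxMult(d.data(), a.data(), b.data());
    return d[0] == 3.0f ? 0 : 1;   // each output element should be 1.5 * 2.0
}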
Example #13
File: main.cpp Project: CCJY/coliru
extern "C" void product64x64_avx(float *a, float *b, float *c, int n) 
{
	for(int i=0; i<n; i++) {	
		__m256 t1 = _mm256_loadu_ps(&c[i*n +  0]);
		__m256 t2 = _mm256_loadu_ps(&c[i*n +  8]);
		__m256 t3 = _mm256_loadu_ps(&c[i*n + 16]);
		__m256 t4 = _mm256_loadu_ps(&c[i*n + 24]);
		__m256 t5 = _mm256_loadu_ps(&c[i*n + 32]);
		__m256 t6 = _mm256_loadu_ps(&c[i*n + 40]);
		__m256 t7 = _mm256_loadu_ps(&c[i*n + 48]);
		__m256 t8 = _mm256_loadu_ps(&c[i*n + 56]);
		for(int k=0; k<n; k++) {
			__m256 a1 = _mm256_set1_ps(a[k*n+i]);
			
			__m256 b1 = _mm256_loadu_ps(&b[k*n+0]);
			t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1));
			
			__m256 b2 = _mm256_loadu_ps(&b[k*n+8]);
			t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2));

			__m256 b3 = _mm256_loadu_ps(&b[k*n+16]);
			t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3));

			__m256 b4 = _mm256_loadu_ps(&b[k*n+24]);
			t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4));

			__m256 b5 = _mm256_loadu_ps(&b[k*n+32]);
			t5 = _mm256_sub_ps(t5,_mm256_mul_ps(a1,b5));

			__m256 b6 = _mm256_loadu_ps(&b[k*n+40]);
			t6 = _mm256_sub_ps(t6,_mm256_mul_ps(a1,b6));

			__m256 b7 = _mm256_loadu_ps(&b[k*n+48]);
			t7 = _mm256_sub_ps(t7,_mm256_mul_ps(a1,b7));

			__m256 b8 = _mm256_loadu_ps(&b[k*n+56]);
			t8 = _mm256_sub_ps(t8,_mm256_mul_ps(a1,b8));
		}
		_mm256_storeu_ps(&c[i*n +  0], t1);
		_mm256_storeu_ps(&c[i*n +  8], t2);
		_mm256_storeu_ps(&c[i*n + 16], t3);
		_mm256_storeu_ps(&c[i*n + 24], t4);
		_mm256_storeu_ps(&c[i*n + 32], t5);
		_mm256_storeu_ps(&c[i*n + 40], t6);
		_mm256_storeu_ps(&c[i*n + 48], t7);
		_mm256_storeu_ps(&c[i*n + 56], t8);
	}
}
Example #14
void convertCAVX(int num, uint8_t *in, float *out){
    int i;
    __m256 sub = _mm256_set1_ps(128.0);
    __m256 mul = _mm256_set1_ps(1/128.0);
    for(i=0; i<num; i+=8){
        __m128i val  = _mm_loadu_si128((__m128i *)(in + i));
        __m256i ints = _mm256_cvtepu8_epi32(val);
        __m256  cvtd = _mm256_cvtepi32_ps(ints);

        __m256  res  = _mm256_mul_ps(_mm256_sub_ps(cvtd, sub), mul);

        _mm256_storeu_ps(out + i, res);
    }
}
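The kernel maps unsigned 8-bit samples to floats in roughly [-1, 1) via (x - 128) / 128. A small, hypothetical driver (note that each iteration issues a 16-byte _mm_loadu_si128 even though only 8 bytes are consumed, so the input buffer is sized to keep that read in bounds):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t in[16] = {0, 32, 64, 96, 128, 160, 192, 255};  /* 8 used, rest padding */
    float out[8];
    convertCAVX(8, in, out);
    for (int i = 0; i < 8; i++)
        printf("%u -> %f\n", (unsigned)in[i], out[i]);     /* e.g. 128 -> 0.0 */
    return 0;
}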
Example #15
__m256 atan2_256(
  const __m256& y,
  const __m256& x) {

  //! For convenience
  float a[8];
  float b[8];
  _mm256_storeu_ps(a, x);
  _mm256_storeu_ps(b, y);

  //! Compute the arc tangent
  a[0] = atan2(b[0], a[0]);
  a[1] = atan2(b[1], a[1]);
  a[2] = atan2(b[2], a[2]);
  a[3] = atan2(b[3], a[3]);
  a[4] = atan2(b[4], a[4]);
  a[5] = atan2(b[5], a[5]);
  a[6] = atan2(b[6], a[6]);
  a[7] = atan2(b[7], a[7]);

  //! Get the result
  return _mm256_loadu_ps(a);
}
Example #16
/* Adjust MBR to fit all child MBRs */
inline 
void 
adjustMbrArraySTRNode(ArraySTRNode nodes[], ulong_t cur)
{
	ArraySTRNode *node, *child;
	ulong_t k;
	
	node = &nodes[cur];
	child = &nodes[node->pos];
	
	/* enlarge mbr to include all childlen's mbr */
#ifdef ENABLE_SSE_ADJUST 
	{
		__m128 v_nlow = _mm_load_ps(child[0].mbr.low);
		__m128 v_nupp = _mm_load_ps(child[0].mbr.upp);
		for (k = 1; k < node->len; k++) {
			v_nlow = _mm_min_ps(v_nlow, _mm_load_ps(child[k].mbr.low));
			v_nupp = _mm_max_ps(v_nupp, _mm_load_ps(child[k].mbr.upp));
		}
		_mm_store_ps(node->mbr.low, v_nlow);
		_mm_store_ps(node->mbr.upp, v_nupp);
	}
#else
#ifdef ENABLE_AVX_TEST1	
	{
		__m256 v_nmbr = _mm256_loadu_ps((float *)&child[0].mbr);
		for (k = 1; k < node->len; k++) {
			__m256 v_cmbr = _mm256_loadu_ps((float *)&child[k].mbr);
			__m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
			__m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
			v_nmbr = _mm256_permute2f128_ps(v_min, v_max, 0x12);
		}
		_mm256_storeu_ps((float *)&node->mbr, v_nmbr);
	}
#else
	/* copy first child's mbr */
	node->mbr = child[0].mbr;
	for (k = 1; k < node->len; k++) {
		int i;
		for (i = 0; i < NDIMS; i++) {
			if (node->mbr.low[i] > child[k].mbr.low[i])
				node->mbr.low[i] = child[k].mbr.low[i];
			if (node->mbr.upp[i] < child[k].mbr.upp[i])
				node->mbr.upp[i] = child[k].mbr.upp[i];
		}
	}
#endif
#endif
}
Example #17
void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
  ptrdiff_t i;
  __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
  __m256 YMM0, YMM1, YMM2, YMM3;
  for (i=0; i<=((n)-8); i+=8) {
    YMM0 = _mm256_loadu_ps(y+i);
    YMM1 = _mm256_loadu_ps(x+i);
    YMM2 = _mm256_mul_ps(YMM0, YMM15);
    YMM3 = _mm256_add_ps(YMM1, YMM2);
    _mm256_storeu_ps(z+i, YMM3);
  }
  for (; i<(n); i++) {
    z[i] = x[i] + y[i] * c;
  }
}
Example #18
__attribute__((noinline)) float dot256fma(float *x1, float *x2, size_t len) {
  assert(len % 8 == 0);
  __m256 sum = _mm256_setzero_ps();
  if (len > 7) {
    size_t limit = len - 7;
    for (size_t i = 0; i < limit; i += 8) {
      __m256 v1 = _mm256_loadu_ps(x1 + i);
      __m256 v2 = _mm256_loadu_ps(x2 + i);
      sum = _mm256_fmadd_ps(v1, v2, sum);
    }
  }
  float buffer[8];
  _mm256_storeu_ps(buffer, sum);
  return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] +
         buffer[6] + buffer[7];
}
Example #19
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    unsigned int m = n >> 3;
    unsigned int j;
    unsigned int k = n & 7;
    unsigned int l = n & (~7);

    for (j = 0; j < m; j++) {
        v8sf src = _mm256_loadu_ps(a + 8 * j);
        v8sf tem = simd_sin8f(src);
        _mm256_storeu_ps(y + 8 * j, tem);
    }

    for (j = 0; j < k; j++) {
        y[j + l] = sinf(a[j + l]);
    }
}
Example #20
File: main.cpp Project: sclc/DPP
int
main(void)
{
    float out[8];

    __m256 a=_mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 b=_mm256_setr_ps(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);

    __m256 dst = _mm256_sub_ps(a, b);

    _mm256_storeu_ps(out, dst);

    for(int i=0; i<sizeof(out)/sizeof(out[0]); i++)
        printf("out[%d]=%5.1f\n", i, out[i]);

    return 0;
}
Example #21
  //this method is untested as of right now....
 inline void set_union(Set<bitset> *A_in, Set<bitset> *B_in){
    if(A_in->number_of_bytes > 0 && B_in->number_of_bytes > 0){
      const uint64_t *a_index = (uint64_t*) A_in->data;
      const uint64_t *b_index = (uint64_t*) B_in->data;

      uint64_t* A = (uint64_t*)(A_in->data+sizeof(uint64_t));
      uint64_t* B = (uint64_t*)(B_in->data+sizeof(uint64_t));
      const size_t s_a = ((A_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t));
      const size_t s_b = ((B_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t));

      const bool a_big = a_index[0] > b_index[0];

      assert(a_index[0] <= b_index[0]);

      const uint64_t start_index = (a_big) ? a_index[0] : b_index[0];
      const uint64_t a_start_index = (a_big) ? 0:(b_index[0]-a_index[0]);
      const uint64_t b_start_index = (a_big) ? (a_index[0]-b_index[0]):0;

      const uint64_t end_index = ((a_index[0]+s_a) > (b_index[0]+s_b)) ? (b_index[0]+s_b):(a_index[0]+s_a);
      const uint64_t total_size = (start_index > end_index) ? 0:(end_index-start_index);

      //16 uint16_ts
      //8 ints
      //4 longs
      size_t i = 0;
      A += a_start_index;
      B += b_start_index;
      #if VECTORIZE == 1
      for(; (i+3) < total_size; i += 4, A += 4, B += 4){
        const __m256 a1 = _mm256_loadu_ps((const float*)A);
        const __m256 a2 = _mm256_loadu_ps((const float*)B);
        const __m256 r = _mm256_or_ps(a2, a1);

        _mm256_storeu_ps((float*)A, r);
      }
      #endif

      for(; i < total_size; i++, A++, B++){
        *A |= *B;
      }
    }
  }
Example #22
void elem_mul (float *result, float *a, float *b, int dim) {
	#ifdef __linux
	int residual = dim % SIMD_WIDTH;
	int stopSIMD = dim - residual;

	__m256 vec_a, vec_b, vec_res;
	for (int i=0; i<stopSIMD; i+=SIMD_WIDTH) {
		vec_a = _mm256_loadu_ps(a + i);
		vec_b = _mm256_loadu_ps(b + i);
		vec_res = _mm256_loadu_ps(result + i);

		vec_a = _mm256_mul_ps(vec_a, vec_b);
		vec_res = _mm256_add_ps(vec_res, vec_a);
		_mm256_storeu_ps(result + i, vec_res);
	}

	for (int i=stopSIMD; i<dim; ++i) {
		result[i] += a[i] * b[i];
	}
	#endif
}
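elem_mul above, like the tanh_deriv and sigm_deriv kernels further down, assumes a SIMD_WIDTH macro matching the 8-float width of a 256-bit AVX register. A plausible definition (an assumption, not taken from the original source):

/* Assumed: 8 single-precision lanes per 256-bit AVX register. */
#define SIMD_WIDTH 8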
Example #23
  void _Run(OutputPixelType aaOutput[ciHeight][ciWidth], InputPixelType_1 aaInput1[ciHeight][ciWidth], InputPixelType_2 aaInput2[ciHeight][ciWidth])
  {
    for (int iY = 0; iY < ciHeight; ++iY)
    {
      _mm256_zeroall();

      OutputPixelType   *pOutput = aaOutput[iY];
      InputPixelType_1  *pInput1 = aaInput1[iY];
      InputPixelType_2  *pInput2 = aaInput2[iY];

      for (int iX = 0; iX < ciWidth; iX += VectorWidth)
      {
        __m256 mmIn1 = _mm256_loadu_ps( pInput1 + iX );
        __m256 mmIn2 = _mm256_loadu_ps( pInput2 + iX );

        _mm256_storeu_ps( pOutput + iX, _mm256_add_ps(mmIn1, mmIn2) );
      }

      _mm256_zeroupper();
    }
  }
Example #24
void static
avx_test (void)
{
  int i;
  union256 u, s1, s2;
  int source1[8]={34545, 95567, 23443, 5675, 2323, 67, 2345, 45667};
  int source2[8]={674, 57897, 93459, 45624, 54674, 1237, 67436, 79608};
  int d[8];
  int e[8];

  s1.x = _mm256_loadu_ps ((float *)source1);
  s2.x = _mm256_loadu_ps ((float *)source2);
  u.x = _mm256_andnot_ps (s1.x, s2.x);

  _mm256_storeu_ps ((float *)d, u.x);

  for (i = 0; i < 8; i++)
    e[i] = (~source1[i]) & source2[i];

  if (checkVi (d, e, 8))
    abort ();
}
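The test depends on a union256 wrapper and a checkVi comparison helper from its test harness (GCC's AVX testsuite uses helpers of this shape). A sketch sufficient to make the test compile; the exact harness definitions may differ:

#include <immintrin.h>
#include <stdlib.h>

typedef union {
    __m256 x;
    float  a[8];
} union256;

/* Returns nonzero if the two int arrays differ, so the caller can abort(). */
static int checkVi(const int *v, const int *e, int n) {
    for (int i = 0; i < n; i++)
        if (v[i] != e[i])
            return 1;
    return 0;
}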
Example #25
void tanh_deriv (float *deriv_res, float *tanh_res, int dim) {
	#ifdef __APPLE__
		for (int i=0; i<dim; i++) {
			deriv_res[i] = 1 - tanh_res[i] * tanh_res[i];
		}
	#elif __linux
		int residual = dim % SIMD_WIDTH;
		int stopSIMD = dim - residual;

		__m256 vec_deriv, vec_tanh;
		__m256 vec_one  = _mm256_set1_ps(1.f);
		for (int i=0; i<stopSIMD; i+=SIMD_WIDTH) {
			vec_tanh  = _mm256_loadu_ps(tanh_res + i);
			vec_deriv = _mm256_sub_ps(vec_one, _mm256_mul_ps(vec_tanh, vec_tanh));
			_mm256_storeu_ps(deriv_res + i, vec_deriv);
		}

		for (int i=stopSIMD; i<dim; ++i) {
			deriv_res[i] = 1 - tanh_res[i] * tanh_res[i];
		}
	#endif
}
Example #26
inline 
Mbr 
getMbrRTreeNode(RTreeNode *node)
{
	Mbr mbr;
	int k;
	
	mbr = node->mbrs[0];
	for (k = 1; k < node->nchilds; k++) {
#ifdef ENABLE_SSE_TEST1
                __m128 v_nlow = _mm_load_ps(mbr.low);
                __m128 v_nupp = _mm_load_ps(mbr.upp);
                __m128 v_clow = _mm_load_ps(node->mbrs[k].low);
                __m128 v_cupp = _mm_load_ps(node->mbrs[k].upp);
                _mm_store_ps(mbr.low, _mm_min_ps(v_nlow, v_clow));
                _mm_store_ps(mbr.upp, _mm_max_ps(v_nupp, v_cupp));
#else
#ifdef ENABLE_AVX_TEST1
                __m256 v_nmbr = _mm256_loadu_ps((float *)&mbr);
                __m256 v_cmbr = _mm256_loadu_ps((float *)&node->mbrs[k]);
                __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
                __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
                __m256 v_tmp;
                v_tmp = _mm256_permute2f128_ps(v_min, v_max, 0x12);
                _mm256_storeu_ps((float *)&mbr, v_tmp);
#else
		int i;
                for (i = 0; i < NDIMS; i++) {
                        if (mbr.low[i] > node->mbrs[k].low[i])
				mbr.low[i] = node->mbrs[k].low[i];
                        if (mbr.upp[i] < node->mbrs[k].upp[i])
                                mbr.upp[i] = node->mbrs[k].upp[i];
                }
#endif
#endif
	}
		return mbr;
}
Example #27
void sigm_deriv (float *deriv_res, float *sigm_res, int dim) {
	#ifdef __APPLE__
		for (int i=0; i<dim; i++) {
			deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
		} 
	#elif __linux
		int residual = dim % SIMD_WIDTH;
		int stopSIMD = dim - residual;

		__m256 vec_deriv, vec_sigm;
		__m256 vec_one  = _mm256_set1_ps(1.f);
		for (int i=0; i<stopSIMD; i+=SIMD_WIDTH) {
			vec_sigm  = _mm256_loadu_ps(sigm_res + i);
			
			vec_deriv = _mm256_mul_ps(vec_sigm, _mm256_sub_ps(vec_one, vec_sigm));
			_mm256_storeu_ps(deriv_res + i, vec_deriv);
		}

		for (int i=stopSIMD; i<dim; ++i) {
			deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
		}
	#endif
}
Example #28
/* sum float vectors -----------------------------------------------------------
* sum float vectors: out=data1.+data2
* args   : float  *data1    I   input float array
*          float  *data2    I   input float array
*          int    n         I   number of input data
*          float  *out      O   output float array
* return : none
* note   : AVX command is used if "AVX" is defined
*-----------------------------------------------------------------------------*/
extern void sumvf(const float *data1, const float *data2, int n, float *out)
{
    int i;
#if !defined(AVX_ENABLE)
    for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
#else
    int m=n/8;
    __m256 xmm1,xmm2,xmm3;

    if (n<8) {
        for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
    }
    else {
        for (i=0;i<8*m;i+=8) {
            xmm1=_mm256_loadu_ps(&data1[i]);
            xmm2=_mm256_loadu_ps(&data2[i]);
            xmm3=_mm256_add_ps(xmm1,xmm2);
            _mm256_storeu_ps(&out[i],xmm3);
        }
        for (;i<n;i++)  out[i]=data1[i]+data2[i];
    }
#endif
}
Example #29
int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize)
{
    int i = 0, k;
    for (; i <= width - 8; i += 8)
    {
        const float* src = src0 + i;
        __m256 f, x0;
        __m256 s0 = _mm256_set1_ps(0.0f);
        for (k = 0; k < _ksize; k++, src += cn)
        {
            f = _mm256_set1_ps(_kx[k]);
            x0 = _mm256_loadu_ps(src);
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
    }
    _mm256_zeroupper();
    return i;
}
Example #30
void reduceHistogram_helper(float* thist,
							float* _hist,
							unsigned int _histx,
							unsigned int _histy,
							unsigned int _histz,
							unsigned int _histdim,
							unsigned int _threads) {
	
	
    // Sum histograms
    // (Could be done using parallel reduction)
    unsigned int i=0;
#ifdef USE_AVX1
    for(  ; i+8<=_histdim; i+=8 ) {
        float *th = thist+i;
        __m256 h1 = _mm256_loadu_ps( th );
        for( unsigned int j=1; j<_threads; ++j ) {
            const __m256 h2 = _mm256_loadu_ps( th+j*_histdim );
            h1 = _mm256_add_ps( h1, h2 );
        }
        _mm256_storeu_ps( _hist+i, h1 );
    }
#endif
    
    // Do the rest
    for(  ;i<_histdim; i+=1 ) {
        _hist[i] = thist[i];
        for( unsigned int j=1; j<_threads; ++j ) {
            _hist[i] += thist[j*_histdim+i];
        }
    }
	
	
	normalizeHistogram_helper(_hist, _histx, _histy, _histz);
    
}