/*
 * Vectorized elementwise exponential: y[i] = exp(a[i]) for i in [0, n).
 * Processes 32 floats per iteration with AVX (4 x 8 lanes) via the
 * exp256_ps helper, then handles the n % 32 tail with scalar expf().
 * Parameters b, z and other_params belong to the common VML kernel
 * signature and are unused by this kernel.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    VMLLONG i = 0;
    VMLLONG loop_count = (n) >> 5;      /* number of full 32-element chunks */
    VMLLONG remain_count = (n) & 0x1f;  /* leftover elements (n % 32)       */

    while (loop_count > 0) {
        __m256 av0 = _mm256_loadu_ps(a);
        __m256 av1 = _mm256_loadu_ps(a + 8);
        __m256 av2 = _mm256_loadu_ps(a + 16);
        __m256 av3 = _mm256_loadu_ps(a + 24);

        __m256 yv0 = exp256_ps(av0);
        __m256 yv1 = exp256_ps(av1);
        __m256 yv2 = exp256_ps(av2);
        __m256 yv3 = exp256_ps(av3);

        _mm256_storeu_ps(y, yv0);
        _mm256_storeu_ps(y + 8, yv1);
        _mm256_storeu_ps(y + 16, yv2);
        _mm256_storeu_ps(y + 24, yv3);

        a += 32;
        y += 32;
        loop_count--;
    }

    /* BUG FIX: the tail loop previously ran for i < n even though a and
     * y already point past the vectorized prefix, reading and writing up
     * to (n - n%32) elements out of bounds.  Only the remainder of n%32
     * elements must be processed here. */
    for (i = 0; i < remain_count; i++) {
        y[i] = expf(a[i]);
    }
}
/* Element-wise product d[i] = a[i] * b[i] over gNumFloats floats,
 * 8 lanes per step, using unaligned AVX loads and stores. */
void UnalignedAvxMult(float* d, float const* a, float const* b) {
    for (int idx = 0; idx < gNumFloats; idx += 8) {
        const __m256 lhs = _mm256_loadu_ps(a + idx);
        const __m256 rhs = _mm256_loadu_ps(b + idx);
        _mm256_storeu_ps(d + idx, _mm256_mul_ps(lhs, rhs));
    }
}
/*
 * Anti-symmetric column convolution pass (float32, AVX).
 * For each output x in [0, width): dst[x] = delta +
 *   sum_{k=1..ksize2} ky[k] * (src[k][x] - src[-k][x]),
 * i.e. the k = 0 tap is skipped (its weight is implicitly zero for an
 * anti-symmetric kernel).  src points at the center row so it is
 * indexed with negative offsets.  Returns the number of fully
 * vectorized outputs; the caller handles the remaining width - i
 * columns with scalar code.
 */
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    /* Main loop: 16 outputs per iteration in two YMM accumulators. */
    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        S = src[0] + i;
        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }
    /* SSE tail: 4 outputs per iteration.
     * NOTE(review): this loop uses the aligned _mm_load_ps on the
     * source rows — presumably the row pointers are 16-byte aligned in
     * the calling filter engine; confirm before reusing elsewhere. */
    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;
        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }
        _mm_storeu_ps(dst + i, s0);
    }
    /* Clear upper YMM state before returning to possibly-SSE callers. */
    _mm256_zeroupper();
    return i;
}
void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n) { ptrdiff_t i; ptrdiff_t off; for (i=0; i<=((n)-16); i+=16) { _mm256_storeu_ps(y+i, _mm256_loadu_ps(x+i)); _mm256_storeu_ps(y+i+8, _mm256_loadu_ps(x+i+8)); } off = (n) - ((n)%16); for (i=0; i<((n)%16); i++) { y[off+i] = x[off+i]; } }
/*
 * In-place update C -= A^T * B for row-major float matrices:
 * c[i][:] -= sum_k a[k][i] * b[k][:] (a is read transposed).
 * NOTE(review): each row is processed as exactly 4 x 8 = 32 columns
 * (offsets 0, 8, 16, 24), so despite taking n as a parameter this
 * kernel is only correct for n == 32, as the name suggests — confirm
 * with callers before reusing for other sizes.
 */
extern "C" void product32x32_avxf(float *a, float *b, float *c, int n)
{
    for(int i=0; i<n; i++) {
        /* Load the 32 accumulator columns of row i of C. */
        __m256 t1 = _mm256_loadu_ps(&c[i*n + 0]);
        __m256 t2 = _mm256_loadu_ps(&c[i*n + 8]);
        __m256 t3 = _mm256_loadu_ps(&c[i*n + 16]);
        __m256 t4 = _mm256_loadu_ps(&c[i*n + 24]);
        for(int k=0; k<n; k++) {
            /* Broadcast A(k,i) and subtract its product with row k of B. */
            __m256 a1 = _mm256_set1_ps(a[k*n+i]);
            __m256 b1 = _mm256_loadu_ps(&b[k*n+0]);
            t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1));
            __m256 b2 = _mm256_loadu_ps(&b[k*n+8]);
            t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2));
            __m256 b3 = _mm256_loadu_ps(&b[k*n+16]);
            t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3));
            __m256 b4 = _mm256_loadu_ps(&b[k*n+24]);
            t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4));
        }
        _mm256_storeu_ps(&c[i*n + 0], t1);
        _mm256_storeu_ps(&c[i*n + 8], t2);
        _mm256_storeu_ps(&c[i*n + 16], t3);
        _mm256_storeu_ps(&c[i*n + 24], t4);
    }
}
/*
 * Produce one stereo output frame of windowed-sinc resampling (AVX).
 * Convolves the left/right ring buffers at the current read position
 * with the sinc coefficient table selected by the integer subphase of
 * resamp->time; with SINC_COEFF_LERP the coefficients are linearly
 * interpolated between adjacent phase tables using the fractional
 * subphase.  Writes out_buffer[0] (left) and out_buffer[1] (right).
 * Assumes taps is a multiple of 8 (no scalar tail) and that the
 * phase/delta tables are 32-byte aligned (aligned loads are used).
 */
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned i;
   __m256 sum_l = _mm256_setzero_ps();
   __m256 sum_r = _mm256_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned taps = resamp->taps;
   /* Integer part of the fixed-point time selects the phase table. */
   unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   /* With lerp, each phase stores coefficients followed by deltas. */
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   /* Fractional subphase, broadcast for the interpolation below. */
   __m256 delta = _mm256_set1_ps((float) (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (i = 0; i < taps; i += 8)
   {
      __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
      __m256 buf_r = _mm256_loadu_ps(buffer_r + i);
#if SINC_COEFF_LERP
      __m256 deltas = _mm256_load_ps(delta_table + i);
      __m256 sinc = _mm256_add_ps(_mm256_load_ps(phase_table + i), _mm256_mul_ps(deltas, delta));
#else
      __m256 sinc = _mm256_load_ps(phase_table + i);
#endif
      sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
      sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
   }

   /* hadd on AVX is weird, and acts on low-lanes
    * and high-lanes separately. */
   __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
   __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
   res_l = _mm256_hadd_ps(res_l, res_l);
   res_r = _mm256_hadd_ps(res_r, res_r);
   /* Fold the high 128-bit lane onto the low lane to finish the
    * horizontal reduction. */
   res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
   res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

   /* This is optimized to mov %xmmN, [mem].
    * There doesn't seem to be any _mm256_store_ss intrinsic. */
   _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
   _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
/* Adjust MBR to fit all child MBRs */
/*
 * Recomputes nodes[cur].mbr as the bounding box of all of its
 * children's MBRs.  Children are stored contiguously starting at
 * nodes[node->pos], node->len of them.  Three implementations are
 * selected by compile-time flags: SSE (per-half min/max), AVX (whole
 * 8-float MBR at once), and a scalar fallback.
 */
inline void adjustMbrArraySTRNode(ArraySTRNode nodes[], ulong_t cur)
{
    ArraySTRNode *node, *child;
    ulong_t k;

    node = &nodes[cur];
    child = &nodes[node->pos];  /* first child; siblings are contiguous */

    /* enlarge mbr to include all childlen's mbr */
#ifdef ENABLE_SSE_ADJUST
    {
        /* Running min over lows and max over upps, 4 floats at a time.
         * NOTE(review): _mm_load_ps/_mm_store_ps require the low/upp
         * arrays to be 16-byte aligned — confirm the Mbr layout
         * guarantees this. */
        __m128 v_nlow = _mm_load_ps(child[0].mbr.low);
        __m128 v_nupp = _mm_load_ps(child[0].mbr.upp);
        for (k = 1; k < node->len; k++) {
            v_nlow = _mm_min_ps(v_nlow, _mm_load_ps(child[k].mbr.low));
            v_nupp = _mm_max_ps(v_nupp, _mm_load_ps(child[k].mbr.upp));
        }
        _mm_store_ps(node->mbr.low, v_nlow);
        _mm_store_ps(node->mbr.upp, v_nupp);
    }
#else
#ifdef ENABLE_AVX_TEST1
    {
        /* AVX path: treat the whole Mbr (8 floats) as one vector;
         * compute lane-wise min and max, then recombine halves with
         * permute2f128 imm 0x12 = { low half of v_max, high half of
         * v_min }.  NOTE(review): this is only correct if the Mbr half
         * that needs the max lies in the low 128 bits of the struct —
         * verify against the Mbr field order. */
        __m256 v_nmbr = _mm256_loadu_ps((float *)&child[0].mbr);
        for (k = 1; k < node->len; k++) {
            __m256 v_cmbr = _mm256_loadu_ps((float *)&child[k].mbr);
            __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
            __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
            v_nmbr = _mm256_permute2f128_ps(v_min, v_max, 0x12);
        }
        _mm256_storeu_ps((float *)&node->mbr, v_nmbr);
    }
#else
    /* Scalar fallback: component-wise min of lows, max of upps. */
    /* copy first child's mbr */
    node->mbr = child[0].mbr;
    for (k = 1; k < node->len; k++) {
        int i;
        for (i = 0; i < NDIMS; i++) {
            if (node->mbr.low[i] > child[k].mbr.low[i])
                node->mbr.low[i] = child[k].mbr.low[i];
            if (node->mbr.upp[i] < child[k].mbr.upp[i])
                node->mbr.upp[i] = child[k].mbr.upp[i];
        }
    }
#endif
#endif
}
void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-8); i+=8) { YMM0 = _mm256_loadu_ps(y+i); YMM1 = _mm256_loadu_ps(x+i); YMM2 = _mm256_mul_ps(YMM0, YMM15); YMM3 = _mm256_add_ps(YMM1, YMM2); _mm256_storeu_ps(z+i, YMM3); } for (; i<(n); i++) { z[i] = x[i] + y[i] * c; } }
/*
 * CSR sparse matrix-vector product y = A * x (AVX2 gather path).
 * A:       concatenated non-zero values, row after row.
 * nIdx:    number of non-zeros per row (n entries).
 * indices: per-row column index arrays (int32).
 * x:       dense input vector; y: dense output vector (n entries).
 * Each row is processed 8 non-zeros at a time with a gather, then the
 * remainder scalar; sum8() horizontally reduces the vector accumulator.
 */
void avx2_csr_spmv( float *A, int32_t *nIdx, int32_t **indices, float *x, int32_t n, float *y)
{
    int32_t A_offset = 0;
    for (int32_t i = 0; i < n; i++) {
        int32_t nElem = nIdx[i];
        float t = 0.0f;
        __m256 vT = _mm256_setzero_ps();
        int32_t smLen = nElem - (nElem & 7);  /* largest multiple of 8 <= nElem */
        for (int32_t j = 0; j < smLen; j += 8) {
            /* BUG FIX: the per-row index arrays are not guaranteed to
             * be 32-byte aligned, so the aligned _mm256_load_si256
             * used here previously could fault; use the unaligned
             * load instead. */
            __m256i vIdx = _mm256_loadu_si256((__m256i const*)&(indices[i][j]));
            __m256 vX = _mm256_i32gather_ps((float const*)x, vIdx, 4);
            __m256 vA = _mm256_loadu_ps(&A[A_offset + j]);
            vT = _mm256_add_ps(vT, _mm256_mul_ps(vX, vA));
        }
        t += sum8(vT);
        /* Scalar tail for the last nElem % 8 non-zeros of the row. */
        for (int32_t j = smLen; j < nElem; j++) {
            int32_t idx = indices[i][j];
            t += x[idx] * A[A_offset + j];
        }
        y[i] = t;
        A_offset += nElem;
    }
}
void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1; for (i=0; i<=((n)-16); i+=16) { YMM0 = _mm256_loadu_ps(x+i); YMM1 = _mm256_loadu_ps(x+i+8); YMM0 = _mm256_add_ps(YMM0, YMM15); YMM1 = _mm256_add_ps(YMM1, YMM15); _mm256_storeu_ps(y+i, YMM0); _mm256_storeu_ps(y+i+8, YMM1); } for (; i<(n); i++) { y[i] = x[i] + c; } }
__attribute__((noinline)) float dot256fma(float *x1, float *x2, size_t len) { assert(len % 8 == 0); __m256 sum = _mm256_setzero_ps(); if (len > 7) { size_t limit = len - 7; for (size_t i = 0; i < limit; i += 8) { __m256 v1 = _mm256_loadu_ps(x1 + i); __m256 v2 = _mm256_loadu_ps(x2 + i); sum = _mm256_fmadd_ps(v1, v2, sum); } } float buffer[8]; _mm256_storeu_ps(buffer, sum); return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7]; }
void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-16); i+=16) { YMM0 = _mm256_loadu_ps(x+i); YMM1 = _mm256_loadu_ps(x+i+8); YMM2 = _mm256_loadu_ps(y+i); YMM3 = _mm256_loadu_ps(y+i+8); YMM2 = _mm256_mul_ps(YMM0, YMM2); YMM3 = _mm256_mul_ps(YMM1, YMM3); _mm256_storeu_ps(z+i, YMM2); _mm256_storeu_ps(z+i+8, YMM3); } for (; i<n; i++) { z[i] = x[i] * y[i]; } }
//this method is untested as of right now.... inline void set_union(Set<bitset> *A_in, Set<bitset> *B_in){ if(A_in->number_of_bytes > 0 && B_in->number_of_bytes > 0){ const uint64_t *a_index = (uint64_t*) A_in->data; const uint64_t *b_index = (uint64_t*) B_in->data; uint64_t* A = (uint64_t*)(A_in->data+sizeof(uint64_t)); uint64_t* B = (uint64_t*)(B_in->data+sizeof(uint64_t)); const size_t s_a = ((A_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t)); const size_t s_b = ((B_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t)); const bool a_big = a_index[0] > b_index[0]; assert(a_index[0] <= b_index[0]); const uint64_t start_index = (a_big) ? a_index[0] : b_index[0]; const uint64_t a_start_index = (a_big) ? 0:(b_index[0]-a_index[0]); const uint64_t b_start_index = (a_big) ? (a_index[0]-b_index[0]):0; const uint64_t end_index = ((a_index[0]+s_a) > (b_index[0]+s_b)) ? (b_index[0]+s_b):(a_index[0]+s_a); const uint64_t total_size = (start_index > end_index) ? 0:(end_index-start_index); //16 uint16_ts //8 ints //4 longs size_t i = 0; A += a_start_index; B += b_start_index; #if VECTORIZE == 1 for(; (i+3) < total_size; i += 4, A += 4, B += 4){ const __m256 a1 = _mm256_loadu_ps((const float*)A); const __m256 a2 = _mm256_loadu_ps((const float*)B); const __m256 r = _mm256_or_ps(a2, a1); _mm256_storeu_ps((float*)A, r); } #endif for(; i < total_size; i++, A++, B++){ *A |= *B; } } }
/*
 * Harness test for _mm256_maskstore_ps: the intrinsic stores only the
 * lanes whose mask element has its sign bit set, leaving the other
 * destination lanes untouched.  mask_v, union256, checkVf and abort
 * come from the surrounding AVX test scaffolding.
 * NOTE(review): the reference loop keys on m[i] being nonzero while
 * the intrinsic keys on the sign bit — consistent only if mask_v
 * yields 0 or negative values; confirm against the harness macro.
 */
void static avx_test (void)
{
  int i;
  /* Per-lane mask values supplied by the harness. */
  int m[8] = {mask_v(0), mask_v(1), mask_v(2), mask_v(3), mask_v(4), mask_v(5), mask_v(6), mask_v(7)};
  float s[8] = {1,2,3,4,5,6,7,8};
  union256 src, mask;
  float e [8] = {0.0};
  float d [8] = {0.0};

  src.x = _mm256_loadu_ps (s);
  /* Reinterpret the int mask bit patterns as packed floats. */
  mask.x = _mm256_loadu_ps ((float *)m);
  _mm256_maskstore_ps (d, mask.x, src.x);

  /* Scalar reference result. */
  for (i = 0 ; i < 8; i++)
    e[i] = m[i] ? s[i] : 0;

  if (checkVf (d, e, 8))
    abort ();
}
/*
 * Row-by-row element-wise addition of two 2-D arrays into the output:
 * aaOutput[y][x] = aaInput1[y][x] + aaInput2[y][x], VectorWidth floats
 * per step.  Assumes ciWidth is a multiple of VectorWidth (no tail).
 * NOTE(review): inputs are loaded with _mm256_loadu_ps, which assumes
 * InputPixelType_1/_2 and OutputPixelType are float-compatible —
 * confirm with the instantiating code.  The identifier _Run
 * (underscore + uppercase) is reserved to the implementation in
 * C/C++; consider renaming.
 */
void _Run(OutputPixelType aaOutput[ciHeight][ciWidth], InputPixelType_1 aaInput1[ciHeight][ciWidth], InputPixelType_2 aaInput2[ciHeight][ciWidth])
{
    for (int iY = 0; iY < ciHeight; ++iY)
    {
        /* NOTE(review): zeroall at the top of every row clears all YMM
         * registers; together with zeroupper below this looks like
         * AVX/SSE transition hygiene, presumably for benchmarking —
         * per-row is likely more than needed. */
        _mm256_zeroall();

        OutputPixelType *pOutput = aaOutput[iY];
        InputPixelType_1 *pInput1 = aaInput1[iY];
        InputPixelType_2 *pInput2 = aaInput2[iY];

        for (int iX = 0; iX < ciWidth; iX += VectorWidth)
        {
            __m256 mmIn1 = _mm256_loadu_ps( pInput1 + iX );
            __m256 mmIn2 = _mm256_loadu_ps( pInput2 + iX );
            _mm256_storeu_ps( pOutput + iX, _mm256_add_ps(mmIn1, mmIn2) );
        }
        _mm256_zeroupper();
    }
}
/* Fused multiply-accumulate: result[i] += a[i] * b[i] for i in
 * [0, dim).  AVX path is Linux-only (as in the rest of this module);
 * on other platforms this function is a no-op. */
void elem_mul (float *result, float *a, float *b, int dim) {
#ifdef __linux
    const int vec_end = dim - (dim % SIMD_WIDTH);
    __m256 va, vb, vr;
    for (int i = 0; i < vec_end; i += SIMD_WIDTH) {
        va = _mm256_loadu_ps(a + i);
        vb = _mm256_loadu_ps(b + i);
        vr = _mm256_loadu_ps(result + i);
        va = _mm256_mul_ps(va, vb);
        _mm256_storeu_ps(result + i, _mm256_add_ps(vr, va));
    }
    for (int i = vec_end; i < dim; ++i) {
        result[i] += a[i] * b[i];
    }
#endif
}
/*
 * Harness test for _mm256_andnot_ps: computes (~s1) & s2 bitwise over
 * the packed 256-bit values.  Integer data is reinterpreted as floats
 * purely to exercise the ps intrinsic (the operation is bitwise, so
 * the reinterpretation is value-preserving); the reference result is
 * computed with scalar integer ops.  union256, checkVi and abort come
 * from the surrounding AVX test scaffolding.
 */
void static avx_test (void)
{
  int i;
  union256 u, s1, s2;
  int source1[8]={34545, 95567, 23443, 5675, 2323, 67, 2345, 45667};
  int source2[8]={674, 57897, 93459, 45624, 54674, 1237, 67436, 79608};
  int d[8];
  int e[8];

  /* Load the int bit patterns as packed floats. */
  s1.x = _mm256_loadu_ps ((float *)source1);
  s2.x = _mm256_loadu_ps ((float *)source2);
  u.x = _mm256_andnot_ps (s1.x, s2.x);
  _mm256_storeu_ps ((float *)d, u.x);

  /* Scalar reference: bitwise AND-NOT per element. */
  for (i = 0; i < 8; i++)
    e[i] = (~source1[i]) & source2[i];

  if (checkVi (d, e, 8))
    abort ();
}
/*
 * Returns the bounding box (MBR) enclosing all child MBRs of a node.
 * Starts from the first child's box and folds in each remaining child
 * with component-wise min (low corner) / max (upp corner).
 */
inline Mbr getMbrRTreeNode(RTreeNode *node)
{
    Mbr mbr;
    int k;

    mbr = node->mbrs[0];  /* seed with the first child's box */
    for (k = 1; k < node->nchilds; k++) {
#ifdef ENABLE_SSE_TEST1
        __m128 v_nlow = _mm_load_ps(mbr.low);
        __m128 v_nupp = _mm_load_ps(mbr.upp);
        __m128 v_clow = _mm_load_ps(node->mbrs[k].low);
        __m128 v_cupp = _mm_load_ps(node->mbrs[k].upp);
        /* BUG FIX: results were previously stored into node->mbr
         * instead of the local accumulator, so each iteration kept
         * comparing against mbrs[0] and the function returned an
         * un-merged box.  Store into the local mbr that is returned. */
        _mm_store_ps(mbr.low, _mm_min_ps(v_nlow, v_clow));
        _mm_store_ps(mbr.upp, _mm_max_ps(v_nupp, v_cupp));
#else
#ifdef ENABLE_AVX_TEST1
        /* Whole-MBR min/max, then recombine the 128-bit halves with
         * permute2f128 imm 0x12 = { low half of v_max, high half of
         * v_min }.  NOTE(review): correct only if the Mbr half needing
         * the max lies in the low 128 bits — verify the field order. */
        __m256 v_nmbr = _mm256_loadu_ps((float *)&mbr);
        __m256 v_cmbr = _mm256_loadu_ps((float *)&node->mbrs[k]);
        __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
        __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
        __m256 v_tmp;
        v_tmp = _mm256_permute2f128_ps(v_min, v_max, 0x12);
        _mm256_storeu_ps((float *)&mbr, v_tmp);
#else
        /* Scalar fallback: component-wise merge. */
        int i;
        for (i = 0; i < NDIMS; i++) {
            if (mbr.low[i] > node->mbrs[k].low[i])
                mbr.low[i] = node->mbrs[k].low[i];
            if (mbr.upp[i] < node->mbrs[k].upp[i])
                mbr.upp[i] = node->mbrs[k].upp[i];
        }
#endif
#endif
    }
    return mbr;
}
/* sum float vectors -----------------------------------------------------------
 * sum float vectors: out=data1.+data2
 * args   : float  *data1    I   input float array
 *          float  *data2    I   input float array
 *          int    n         I   number of input data
 *          float  *out      O   output float array
 * return : none
 * note   : AVX command is used if "AVX" is defined
 *-----------------------------------------------------------------------------*/
extern void sumvf(const float *data1, const float *data2, int n, float *out)
{
    int i;
#if !defined(AVX_ENABLE)
    for (i = 0; i < n; i++) out[i] = data1[i] + data2[i];
#else
    if (n < 8) {
        /* Too short to vectorize: plain scalar loop. */
        for (i = 0; i < n; i++) out[i] = data1[i] + data2[i];
    }
    else {
        const int vend = 8 * (n / 8);  /* end of the vectorizable prefix */
        __m256 va, vb;
        for (i = 0; i < vend; i += 8) {
            va = _mm256_loadu_ps(&data1[i]);
            vb = _mm256_loadu_ps(&data2[i]);
            _mm256_storeu_ps(&out[i], _mm256_add_ps(va, vb));
        }
        for (; i < n; i++) out[i] = data1[i] + data2[i];
    }
#endif
}
/*
 * Sums the _threads per-thread histograms stored in thist (each of
 * length _histdim, laid out back to back) into _hist, then normalizes
 * the result via normalizeHistogram_helper.
 */
void reduceHistogram_helper(float* thist, float* _hist, unsigned int _histx, unsigned int _histy, unsigned int _histz, unsigned int _histdim, unsigned int _threads)
{
    // Sum histograms
    // (Could be done using parallel reduction)
    unsigned int i = 0;
#ifdef USE_AVX1
    /* BUG FIX: the vector loop previously used (i < _histdim) as its
     * condition and then backed i up by 8 afterwards, so when _histdim
     * was not a multiple of 8 the last iteration read and wrote up to
     * 7 floats past the end of the buffers (and i underflowed for
     * _histdim == 0).  Only full 8-float chunks are processed here;
     * the scalar loop below handles the remainder. */
    for (; i + 8 <= _histdim; i += 8) {
        float *th = thist + i;
        __m256 h1 = _mm256_loadu_ps(th);
        for (unsigned int j = 1; j < _threads; ++j) {
            const __m256 h2 = _mm256_loadu_ps(th + j * _histdim);
            h1 = _mm256_add_ps(h1, h2);
        }
        _mm256_storeu_ps(_hist + i, h1);
    }
#endif
    // Do the rest (scalar tail, or the whole range without AVX)
    for (; i < _histdim; i += 1) {
        _hist[i] = thist[i];
        for (unsigned int j = 1; j < _threads; ++j) {
            _hist[i] += thist[j * _histdim + i];
        }
    }
    normalizeHistogram_helper(_hist, _histx, _histy, _histz);
}
/*
 * Vectorized elementwise sine: y[i] = sin(a[i]) for i in [0, n).
 * Eight floats per step via simd_sin8f, scalar sinf() for the n % 8
 * tail.  Parameters b, z and other_params belong to the common VML
 * kernel signature and are unused here.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    const unsigned int chunks = n >> 3;   /* full 8-float chunks  */
    const unsigned int tail = n & 7;      /* leftover elements    */
    const unsigned int base = n & (~7);   /* start of the tail    */
    unsigned int idx;

    for (idx = 0; idx < chunks; idx++) {
        v8sf src = _mm256_loadu_ps(a + 8 * idx);
        v8sf res = simd_sin8f(src);
        _mm256_storeu_ps(y + 8 * idx, res);
    }
    for (idx = 0; idx < tail; idx++) {
        y[idx + base] = sinf(a[idx + base]);
    }
}
/*
 * CSR sparse matrix-vector product y = A * x using AVX2 masked
 * gathers: each row is processed 8 non-zeros at a time, with a
 * partial mask for the final sub-8 chunk so no out-of-range x element
 * is gathered (the index/value arrays themselves are padded out to a
 * full vector, per the original comment).  sum8() horizontally
 * reduces the vector accumulator.
 */
void avx2_masked_csr_spmv(float *A, int32_t *nIdx, int32_t **indices, float *x, int32_t n, float *y)
{
    int32_t A_offset = 0;
    /* bitMasks[m] has the gather-enable sign bit set in the m lowest
     * lanes. BUG FIX: the sign-bit float was previously produced with
     * *(float*)&u, which violates strict aliasing; a union pun is the
     * well-defined C way to reinterpret the bit pattern. */
    __m256 bitMasks[9];
    union { unsigned u; float f; } pun;
    pun.u = 0x80000000;
    float v = pun.f;
    bitMasks[0] = _mm256_set_ps(0,0,0,0,0,0,0,0);
    bitMasks[1] = _mm256_set_ps(0,0,0,0,0,0,0,v);
    bitMasks[2] = _mm256_set_ps(0,0,0,0,0,0,v,v);
    bitMasks[3] = _mm256_set_ps(0,0,0,0,0,v,v,v);
    bitMasks[4] = _mm256_set_ps(0,0,0,0,v,v,v,v);
    bitMasks[5] = _mm256_set_ps(0,0,0,v,v,v,v,v);
    bitMasks[6] = _mm256_set_ps(0,0,v,v,v,v,v,v);
    bitMasks[7] = _mm256_set_ps(0,v,v,v,v,v,v,v);
    bitMasks[8] = _mm256_set_ps(v,v,v,v,v,v,v,v);
    const __m256 vZeros = _mm256_setzero_ps();
    for(int32_t i = 0; i < n; i++) {
        int32_t nElem = nIdx[i];
        float t = 0.0f;
        __m256 vT = _mm256_setzero_ps();
        int32_t k = 0;
        while(k < nElem) {
            int vl = ((k+8) < nElem) ? 8 : (nElem - k);  /* lanes in use */
            __m256 mask = bitMasks[vl];
            /* this is padded out */
            /* BUG FIX: the index rows are not guaranteed 32-byte
             * aligned, so use the unaligned load instead of the
             * aligned _mm256_load_si256. */
            __m256i vIdx = _mm256_loadu_si256((__m256i const*)&(indices[i][k]));
            __m256 vX = _mm256_mask_i32gather_ps(vZeros,(float const*)x,vIdx,mask,4);
            __m256 vA = _mm256_loadu_ps(&A[A_offset + k]);
            vT = _mm256_add_ps(vT, _mm256_mul_ps(vX,vA));
            k += vl;
        }
        t += sum8(vT);
        y[i] = t;
        A_offset += nElem;
    }
}
/* Derivative of tanh from its forward result:
 * deriv_res[i] = 1 - tanh_res[i]^2.  Scalar on macOS, AVX on Linux. */
void tanh_deriv (float *deriv_res, float *tanh_res, int dim) {
#ifdef __APPLE__
    for (int i = 0; i < dim; i++) {
        deriv_res[i] = 1 - tanh_res[i] * tanh_res[i];
    }
#elif __linux
    const int vec_end = dim - (dim % SIMD_WIDTH);
    const __m256 ones = _mm256_set1_ps(1.f);
    for (int i = 0; i < vec_end; i += SIMD_WIDTH) {
        const __m256 t = _mm256_loadu_ps(tanh_res + i);
        _mm256_storeu_ps(deriv_res + i, _mm256_sub_ps(ones, _mm256_mul_ps(t, t)));
    }
    for (int i = vec_end; i < dim; ++i) {
        deriv_res[i] = 1 - tanh_res[i] * tanh_res[i];
    }
#endif
}
IGL_INLINE void igl::svd3x3_avx( const Eigen::Matrix<T, 3*8, 3>& A, Eigen::Matrix<T, 3*8, 3> &U, Eigen::Matrix<T, 3*8, 1> &S, Eigen::Matrix<T, 3*8, 3>&V) { // this code assumes USE_AVX_IMPLEMENTATION is defined float Ashuffle[9][8], Ushuffle[9][8], Vshuffle[9][8], Sshuffle[3][8]; for (int i=0; i<3; i++) { for (int j=0; j<3; j++) { for (int k=0; k<8; k++) { Ashuffle[i + j*3][k] = A(i + 3*k, j); } } } #include "Singular_Value_Decomposition_Kernel_Declarations.hpp" ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_loadu_ps(Ashuffle[0]);)
int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize) { int i = 0, k; for (; i <= width - 8; i += 8) { const float* src = src0 + i; __m256 f, x0; __m256 s0 = _mm256_set1_ps(0.0f); for (k = 0; k < _ksize; k++, src += cn) { f = _mm256_set1_ps(_kx[k]); x0 = _mm256_loadu_ps(src); #if CV_FMA3 s0 = _mm256_fmadd_ps(x0, f, s0); #else s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); #endif } _mm256_storeu_ps(dst + i, s0); } _mm256_zeroupper(); return i; }
/* Derivative of the logistic sigmoid from its forward result:
 * deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]).
 * Scalar on macOS, AVX on Linux. */
void sigm_deriv (float *deriv_res, float *sigm_res, int dim) {
#ifdef __APPLE__
    for (int i = 0; i < dim; i++) {
        deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
    }
#elif __linux
    const int vec_end = dim - (dim % SIMD_WIDTH);
    const __m256 ones = _mm256_set1_ps(1.f);
    for (int i = 0; i < vec_end; i += SIMD_WIDTH) {
        const __m256 s = _mm256_loadu_ps(sigm_res + i);
        _mm256_storeu_ps(deriv_res + i, _mm256_mul_ps(s, _mm256_sub_ps(ones, s)));
    }
    for (int i = vec_end; i < dim; ++i) {
        deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
    }
#endif
}
__m256 atan2_256( const __m256& y, const __m256& x) { //! For convenience float a[8]; float b[8]; _mm256_storeu_ps(a, x); _mm256_storeu_ps(b, y); //! Compute the arc tangent a[0] = atan2(b[0], a[0]); a[1] = atan2(b[1], a[1]); a[2] = atan2(b[2], a[2]); a[3] = atan2(b[3], a[3]); a[4] = atan2(b[4], a[4]); a[5] = atan2(b[5], a[5]); a[6] = atan2(b[6], a[6]); a[7] = atan2(b[7], a[7]); //! Get the result return _mm256_loadu_ps(a); }
float Ashuffle[9][8], Ushuffle[9][8], Vshuffle[9][8], Sshuffle[3][8]; for (int i=0; i<3; i++) { for (int j=0; j<3; j++) { for (int k=0; k<8; k++) { Ashuffle[i + j*3][k] = A(i + 3*k, j); } } } #include "Singular_Value_Decomposition_Kernel_Declarations.hpp" ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_loadu_ps(Ashuffle[0]);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_loadu_ps(Ashuffle[1]);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_loadu_ps(Ashuffle[2]);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_loadu_ps(Ashuffle[3]);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_loadu_ps(Ashuffle[4]);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_loadu_ps(Ashuffle[5]);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_loadu_ps(Ashuffle[6]);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_loadu_ps(Ashuffle[7]);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_loadu_ps(Ashuffle[8]);) #include "Singular_Value_Decomposition_Main_Kernel_Body.hpp" ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[0],Vu11);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[1],Vu21);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[2],Vu31);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[3],Vu12);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[4],Vu22);)
/*
 * One unrolled block of the softmax numerator pass: computes
 * acc_i = e^(input_ptr[i * C_batch_size .. +8]) for i in
 * [0, T_SIZE), stores each result to the matching output slot, and
 * folds every accumulator into acc_sum (used later for the divide).
 * T_SIZE and C_batch_size are compile-time template/constant
 * parameters of the enclosing translation unit; T_SIZE may be at most
 * 15 here (acc15 is declared but never used).  input_ptr and
 * output_ptr are advanced past the processed block on return (they
 * are reference parameters).
 */
void softmax_compute_block(
    float* &input_ptr,
    float* &output_ptr,
    __m256 &acc_sum)
{
    // We are not using table of registers and unroll pragmas
    // due to compiler which have issues with register allocation
    // and needs special, obvious treatment. Template immediate
    // arguments matching will remove all conditions in this code.
    __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15;

    // Load inputs and perform e^x
    if (T_SIZE >= 1) acc0 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 0 * C_batch_size));
    if (T_SIZE >= 2) acc1 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 1 * C_batch_size));
    if (T_SIZE >= 3) acc2 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 2 * C_batch_size));
    if (T_SIZE >= 4) acc3 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 3 * C_batch_size));
    if (T_SIZE >= 5) acc4 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 4 * C_batch_size));
    if (T_SIZE >= 6) acc5 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 5 * C_batch_size));
    if (T_SIZE >= 7) acc6 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 6 * C_batch_size));
    if (T_SIZE >= 8) acc7 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 7 * C_batch_size));
    if (T_SIZE >= 9) acc8 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 8 * C_batch_size));
    if (T_SIZE >= 10) acc9 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 9 * C_batch_size));
    if (T_SIZE >= 11) acc10 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 10 * C_batch_size));
    if (T_SIZE >= 12) acc11 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 11 * C_batch_size));
    if (T_SIZE >= 13) acc12 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 12 * C_batch_size));
    if (T_SIZE >= 14) acc13 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 13 * C_batch_size));
    if (T_SIZE >= 15) acc14 = _inner_mm256_exp_ps1(_mm256_loadu_ps(input_ptr + 14 * C_batch_size));

    // Store results.
    if (T_SIZE >= 1) _mm256_storeu_ps(output_ptr + 0 * C_batch_size, acc0);
    if (T_SIZE >= 2) _mm256_storeu_ps(output_ptr + 1 * C_batch_size, acc1);
    if (T_SIZE >= 3) _mm256_storeu_ps(output_ptr + 2 * C_batch_size, acc2);
    if (T_SIZE >= 4) _mm256_storeu_ps(output_ptr + 3 * C_batch_size, acc3);
    if (T_SIZE >= 5) _mm256_storeu_ps(output_ptr + 4 * C_batch_size, acc4);
    if (T_SIZE >= 6) _mm256_storeu_ps(output_ptr + 5 * C_batch_size, acc5);
    if (T_SIZE >= 7) _mm256_storeu_ps(output_ptr + 6 * C_batch_size, acc6);
    if (T_SIZE >= 8) _mm256_storeu_ps(output_ptr + 7 * C_batch_size, acc7);
    if (T_SIZE >= 9) _mm256_storeu_ps(output_ptr + 8 * C_batch_size, acc8);
    if (T_SIZE >= 10) _mm256_storeu_ps(output_ptr + 9 * C_batch_size, acc9);
    if (T_SIZE >= 11) _mm256_storeu_ps(output_ptr + 10 * C_batch_size, acc10);
    if (T_SIZE >= 12) _mm256_storeu_ps(output_ptr + 11 * C_batch_size, acc11);
    if (T_SIZE >= 13) _mm256_storeu_ps(output_ptr + 12 * C_batch_size, acc12);
    if (T_SIZE >= 14) _mm256_storeu_ps(output_ptr + 13 * C_batch_size, acc13);
    if (T_SIZE >= 15) _mm256_storeu_ps(output_ptr + 14 * C_batch_size, acc14);

    // Sum up accumulators.
    if (T_SIZE >= 1) acc_sum = _mm256_add_ps(acc0, acc_sum);
    if (T_SIZE >= 2) acc_sum = _mm256_add_ps(acc1, acc_sum);
    if (T_SIZE >= 3) acc_sum = _mm256_add_ps(acc2, acc_sum);
    if (T_SIZE >= 4) acc_sum = _mm256_add_ps(acc3, acc_sum);
    if (T_SIZE >= 5) acc_sum = _mm256_add_ps(acc4, acc_sum);
    if (T_SIZE >= 6) acc_sum = _mm256_add_ps(acc5, acc_sum);
    if (T_SIZE >= 7) acc_sum = _mm256_add_ps(acc6, acc_sum);
    if (T_SIZE >= 8) acc_sum = _mm256_add_ps(acc7, acc_sum);
    if (T_SIZE >= 9) acc_sum = _mm256_add_ps(acc8, acc_sum);
    if (T_SIZE >= 10) acc_sum = _mm256_add_ps(acc9, acc_sum);
    if (T_SIZE >= 11) acc_sum = _mm256_add_ps(acc10, acc_sum);
    if (T_SIZE >= 12) acc_sum = _mm256_add_ps(acc11, acc_sum);
    if (T_SIZE >= 13) acc_sum = _mm256_add_ps(acc12, acc_sum);
    if (T_SIZE >= 14) acc_sum = _mm256_add_ps(acc13, acc_sum);
    if (T_SIZE >= 15) acc_sum = _mm256_add_ps(acc14, acc_sum);

    /* Advance the (by-reference) cursors past the processed block. */
    input_ptr += C_batch_size*T_SIZE;
    output_ptr += C_batch_size*T_SIZE;
}
/*
 * Accumulate a partial-sum buffer into an accumulator:
 * accum[i] += partial[i] for i in [0, count).
 * partial may be arbitrarily aligned (unaligned loads / bounce buffer
 * are used for it); accum is brought to vector alignment by a scalar
 * peel loop so aligned loads/stores can be used on it.
 */
static void inline sse_sum_unaligned_F(int count, float *partial, float *accum) {
#if MANUAL_SSE
#if USE_AVX
    int i;
    __m256 v8in, v8acc;
    /* Peel scalar elements until accum is 32-byte aligned.
     * BUG FIX: the mask was previously 0x0f (16-byte alignment), but
     * _mm256_load_ps/_mm256_store_ps below require 32-byte alignment,
     * so an accum pointer that was 16- but not 32-byte aligned could
     * fault.  NOTE(review): the (long) cast assumes long is pointer
     * sized (true on LP64; not on Windows/LLP64). */
    for(i=0; ((long)&(accum[i]) & 0x1f) && (i<count);i++) {
        accum[i]+=partial[i];
    }
    PRAGMA_IVDEP
    for(;i<(count-7);i+=8) {
        v8in=_mm256_loadu_ps(&(partial[i]));
        v8acc=_mm256_load_ps(&(accum[i]));
        v8acc=_mm256_add_ps(v8acc, v8in);
        _mm256_store_ps(&(accum[i]), v8acc);
    }
    /* Scalar tail. */
    PRAGMA_IVDEP
    for(;i<count;i++) {
        accum[i]+=partial[i];
    }
#else
    int i;
    __m128 v4in, v4acc;
    /* Bounce buffer: partial is copied here so aligned SSE loads can
     * be used on it. */
    float *tmp=aligned_alloca(16*sizeof(*tmp));
    /* Peel until accum is 16-byte aligned (sufficient for SSE). */
    for(i=0; ((long)&(accum[i]) & 0x0f) && (i<count);i++) {
        accum[i]+=partial[i];
    }
    PRAGMA_IVDEP
    for(;i<(count-15);i+=16) {
        memcpy(tmp, &(partial[i]), 16*sizeof(*tmp));
        v4in=_mm_load_ps(tmp);
        v4acc=_mm_load_ps(&(accum[i]));
        v4acc=_mm_add_ps(v4acc, v4in);
        _mm_store_ps(&(accum[i]), v4acc);
        v4in=_mm_load_ps(&(tmp[4]));
        v4acc=_mm_load_ps(&(accum[i+4]));
        v4acc=_mm_add_ps(v4acc, v4in);
        _mm_store_ps(&(accum[i+4]), v4acc);
        v4in=_mm_load_ps(&(tmp[8]));
        v4acc=_mm_load_ps(&(accum[i+8]));
        v4acc=_mm_add_ps(v4acc, v4in);
        _mm_store_ps(&(accum[i+8]), v4acc);
        v4in=_mm_load_ps(&(tmp[12]));
        v4acc=_mm_load_ps(&(accum[i+12]));
        v4acc=_mm_add_ps(v4acc, v4in);
        _mm_store_ps(&(accum[i+12]), v4acc);
    }
    /* Scalar tail. */
    PRAGMA_IVDEP
    for(;i<count;i++) {
        accum[i]+=partial[i];
    }
#endif
#else
    fprintf(stderr, "**** MANUAL_SSE disabled in %s\n", __FUNCTION__);
    exit(-2);
#endif
}