Example #1
static void
inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = 1.0 / *src1++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    /* While _mm_rcp_ps sounds promising, the results it gives are rather
     * different from the 1.0 / src1 reference implementation, so stick with
     * the full division instead.
     */
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = 1.0 / *src1++;
  }
}
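The comment above rejects _mm_rcp_ps because its roughly 12-bit approximation visibly diverges from the 1.0 / src1 reference. A common middle ground, sketched here as a hypothetical helper (not part of the original function), is one Newton-Raphson refinement step, which roughly doubles the precision of rcpps while still avoiding the full-latency divps:

/* Hypothetical helper: reciprocal estimate refined by one Newton-Raphson
 * step, x1 = x0 * (2 - a * x0); closer to 1.0f / a, but still not bit-exact. */
static inline __m128
rcp_nr_ps (__m128 a)
{
  const __m128 x0 = _mm_rcp_ps (a);
  const __m128 two = _mm_set_ps1 (2.0f);
  return _mm_mul_ps (x0, _mm_sub_ps (two, _mm_mul_ps (a, x0)));
}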
Example #2
static void
divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ / *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ / *src2++;
  }
}
Example #3
static inline __m128
lanczos_sse(__m128 width, __m128 t)
{
    /* Range-reduce the argument: with a = round(t) and r = t - a in [-0.5, 0.5],
     * sinf(pi.t) equals +/- sinf(pi.r), and sinf(pi.r) can be evaluated accurately */
    __m128i a = _mm_cvtps_epi32(t);
    __m128 r = _mm_sub_ps(t, _mm_cvtepi32_ps(a));

    // Compute the sign (-1)^a that turns sinf(pi.r) back into sinf(pi.t)
    static const uint32_t fone[] __attribute__((aligned(SSE_ALIGNMENT))) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000};
    static const uint32_t ione[] __attribute__((aligned(SSE_ALIGNMENT))) = { 1, 1, 1, 1};
    static const __m128 eps = {DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON};
    static const __m128 pi = {M_PI, M_PI, M_PI, M_PI};
    static const __m128 pi2 = {M_PI*M_PI, M_PI*M_PI, M_PI*M_PI, M_PI*M_PI};

    __m128i isign = _mm_and_si128(*(__m128i*)ione, a);
    isign = _mm_slli_epi64(isign, 31);
    isign = _mm_or_si128(*(__m128i*)fone, isign);
    __m128 fsign = _mm_castsi128_ps(isign);

    __m128 num = _mm_mul_ps(width, fsign);
    num = _mm_mul_ps(num, sinf_fast_sse(_mm_mul_ps(pi, r)));
    num = _mm_mul_ps(num, sinf_fast_sse(_mm_div_ps(_mm_mul_ps(pi, t), width)));
    num = _mm_add_ps(eps, num);

    __m128 den = _mm_mul_ps(pi2, _mm_mul_ps(t, t));
    den = _mm_add_ps(eps, den);

    return _mm_div_ps(num, den);
}
Example #4
	static inline Simd div(const Simd& lhs, float rhs) {
		Simd res;
		__m128 tmp = _mm_set1_ps(rhs);
		res.reg[0] = _mm_div_ps(lhs.reg[0], tmp);
		res.reg[1] = _mm_div_ps(lhs.reg[1], tmp);
		return res;
	}
Example #5
int main()
{	

	float m=1.0; /* initial  magnification		*/

	/* Timing variables */
	struct timeval start_time, stop_time;
	long long compute_time;

	/* 			*/
	/* Create a screen to render to */
	Screen *screen;
	screen = new Screen(HXRES, HYRES);
	gettimeofday(&start_time, NULL);
	//Sets up the parallel stuff
	int depth=0;
	while (depth < MAX_DEPTH) {
		
			#pragma omp parallel
			{
			float * answers = (float *)malloc(sizeof(float) * 4);
			#pragma omp for schedule(dynamic)
			for (int hy=0; hy<HYRES; hy++) {
				float cy = ((((float)hy/(float)HYRES) -0.5 + (PY/(4.0/m)))*(4.0f/m));
				__m128 cy_m = _mm_set1_ps(cy);
				__m128 four_m = _mm_set1_ps((4.0/m));
				for (int hx=0; hx<HXRES; hx+=4) {
					__m128 cx_m = _mm_setr_ps(hx, hx+1, hx+2, hx+3);
					cx_m = _mm_div_ps(cx_m, _mm_set1_ps(HXRES));
					cx_m = _mm_sub_ps(cx_m, _mm_set1_ps(0.5));
					cx_m = _mm_add_ps(cx_m, _mm_div_ps(_mm_set1_ps(PX),four_m));
					cx_m = _mm_mul_ps(cx_m, four_m);
					//Store and check the four iterations and update the screen accordingly !
					_mm_storeu_ps(answers, member_speed(cx_m, cy_m));
					for (int k = 0; k < 4; k++) {
						if (answers[k] != MAX_ITS) {
							int l=((int)(answers[k]) % 40) - 1;
							l = l*3;
							screen->putpixel(hx+k, hy, pal[l], pal[l+1], pal[l+2]);
						}else{
							screen->putpixel(hx+k, hy, 0, 0, 0);
						}
					}
				}
			}
			free(answers);
			}
			screen->flip();
			/* Show the rendered image on the screen */
			std::cerr << "Render done " << depth++ << " " << m << std::endl;
			/* Zoom in */
			m *= ZOOM_FACTOR;
	}
	gettimeofday(&stop_time, NULL);
	compute_time = (stop_time.tv_sec - start_time.tv_sec) * 1000000L +
		(stop_time.tv_usec - start_time.tv_usec);
	fprintf(stderr, "Time to find Richys tour: %lld microseconds\n", compute_time);
	sleep(5);
	std::cout << "Clean Exit"<< std::endl;

}
Example #6
void lfModifier::ModifyCoord_Dist_PTLens_SSE (void *data, float *iocoord, int count)
{
  // See "Note about PT-based distortion models" at the top of mod-coord.cpp.
  /*
   * If buffer is not aligned, fall back to plain code
   */
  if((uintptr_t)(iocoord) & 0xf)
  {
    return ModifyCoord_Dist_PTLens(data, iocoord, count);
  }

  lfCoordDistCallbackData* cddata = (lfCoordDistCallbackData*) data;

  // Rd = Ru * (a_ * Ru^3 + b_ * Ru^2 + c_ * Ru + 1)
  __m128 a_ = _mm_set_ps1 (cddata->Terms [0]);
  __m128 b_ = _mm_set_ps1 (cddata->Terms [1]);
  __m128 c_ = _mm_set_ps1 (cddata->Terms [2]);
  __m128 cx = _mm_set_ps1 (cddata->centerX);
  __m128 cy = _mm_set_ps1 (cddata->centerY);
  __m128 cc = _mm_set_ps1 (cddata->coordinate_correction);
  __m128 one = _mm_set_ps1 (1.0f);

  // SSE Loop processes 4 pixels/loop
  int loop_count = count / 4;
  for (int i = 0; i < loop_count ; i++)
  {
    __m128 c0 = _mm_load_ps (&iocoord [8 * i]);
    __m128 c1 = _mm_load_ps (&iocoord [8 * i + 4]);
    __m128 x = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (2, 0, 2, 0));
    __m128 y = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (3, 1, 3, 1));
    x = _mm_sub_ps(_mm_mul_ps(x, cc), cx);
    y = _mm_sub_ps(_mm_mul_ps(y, cc), cy);

    __m128 ru2 = _mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y));
    __m128 ru = _mm_rcp_ps (_mm_rsqrt_ps (ru2));

    // Calculate poly3 = a_ * ru2 * ru + b_ * ru2 + c_ * ru + 1;
    __m128 t = _mm_mul_ps (ru2, b_);
    __m128 poly3 = _mm_mul_ps (_mm_mul_ps (a_, ru2), ru);
    t = _mm_add_ps (t, _mm_mul_ps (ru, c_));
    poly3 = _mm_add_ps (t, _mm_add_ps (poly3, one));

    x = _mm_add_ps(_mm_mul_ps (x, poly3), cx);
    y = _mm_add_ps(_mm_mul_ps (y, poly3), cy);
    x = _mm_div_ps (x, cc);
    y = _mm_div_ps (y, cc);

    c0 = _mm_unpacklo_ps(x, y);
    c1 = _mm_unpackhi_ps(x, y);

    _mm_store_ps (&iocoord [8 * i], c0);
    _mm_store_ps (&iocoord [8 * i + 4], c1);
  }

  loop_count *= 4;
  int remain = count - loop_count;
  if (remain)
    ModifyCoord_Dist_PTLens (data, &iocoord [loop_count * 2], remain);
}
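Note how ru is computed as rcp(rsqrt(ru2)), an approximate square root built from two ~12-bit estimates. If more precision mattered, _mm_sqrt_ps(ru2) would be exact; a cheaper alternative in the same approximation class (a sketch, not part of the original function) multiplies the operand by its reciprocal square root:

/* hypothetical helper: sqrt(x) ~= x * rsqrt(x); one rsqrtps plus one mulps,
 * but note it yields NaN for x == 0, unlike rcp(rsqrt(0)) == 0 */
static inline __m128 approx_sqrt_ps (__m128 x)
{
  return _mm_mul_ps (x, _mm_rsqrt_ps (x));
}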
Example #7
kw_mat4 kw_div(kw_mat4 m, f32 scale){
    kw_mat4 result;
    result.simd[0] = _mm_div_ps(m.simd[0], _mm_set_ps1(scale));
    result.simd[1] = _mm_div_ps(m.simd[1], _mm_set_ps1(scale));
    result.simd[2] = _mm_div_ps(m.simd[2], _mm_set_ps1(scale));
    result.simd[3] = _mm_div_ps(m.simd[3], _mm_set_ps1(scale));
    return result;
}
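Since all four rows are divided by the same scalar, a common variant (a hypothetical sketch reusing the kw_mat4/f32 types above, not part of the original library) splats the reciprocal once and multiplies, trading a little accuracy for four cheaper mulps instructions:

kw_mat4 kw_div_fast(kw_mat4 m, f32 scale){
    /* hypothetical variant: multiply by 1/scale instead of dividing each row */
    __m128 inv = _mm_set_ps1(1.0f / scale);
    kw_mat4 result;
    result.simd[0] = _mm_mul_ps(m.simd[0], inv);
    result.simd[1] = _mm_mul_ps(m.simd[1], inv);
    result.simd[2] = _mm_mul_ps(m.simd[2], inv);
    result.simd[3] = _mm_mul_ps(m.simd[3], inv);
    return result;
}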
Example #8
static void SubbandCoherenceSSE2(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const __m128 vec_1eminus10 =  _mm_set1_ps(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]);
      const __m128 vec_se = _mm_loadu_ps(&aec->se[i]);
      const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]);
      const __m128 vec_sdse = _mm_add_ps(vec_1eminus10,
                                         _mm_mul_ps(vec_sd, vec_se));
      const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10,
                                         _mm_mul_ps(vec_sd, vec_sx));
      const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]);
      const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
      const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
      const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
      const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
                                              _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
                                              _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
                                              _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
                                              _MM_SHUFFLE(3, 1, 3, 1));
      __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0);
      __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0);
      vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1));
      vec_cohde = _mm_div_ps(vec_cohde, vec_sdse);
      vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1));
      vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx);
      _mm_storeu_ps(&cohde[i], vec_cohde);
      _mm_storeu_ps(&cohxd[i], vec_cohxd);
    }

    // scalar code for the remaining items.
    for (; i < PART_LEN1; i++) {
      cohde[i] =
          (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
          (aec->sd[i] * aec->se[i] + 1e-10f);
      cohxd[i] =
          (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
          (aec->sx[i] * aec->sd[i] + 1e-10f);
    }
  }
}
Example #9
void experienceNet::normalPDF_sse(float* result, const float* _partitions, float _mean, float _stdDev)
{
	/* 
	CODE ADAPTED FROM boost/math/normal.hpp 

	RealType exponent = x - mean;
	exponent *= -exponent;
	exponent /= 2 * sd * sd;

	result = exp(exponent);
	result /= sd * sqrt(2 * constants::pi<RealType>());

	return result;
	*/
	const __m128& partitions = *(__m128*)_partitions;
	__m128 exponent, tmp, mean, sd;

	/* CODE ADAPTED FROM http://fastcpp.blogspot.com/2011/03/changing-sign-of-float-values-using-sse.html */
	static const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

	static const __m128 twos = _mm_set_ps1(2.0f);
	static const __m128 sqrt_pi_2_s = _mm_set_ps1(sqrt(2.0 * M_PI));

	// store mean and sd:
	mean = _mm_load_ps1(&_mean);
	sd = _mm_load_ps1(&_stdDev);

	// exponent = x - mean
	exponent = _mm_sub_ps(partitions, mean);
	// exponent *= -exponent;
	tmp = _mm_xor_ps(exponent, signmask);
	exponent = _mm_mul_ps(exponent, tmp);
	// exponent /= 2 * sd * sd;
	tmp = _mm_mul_ps(sd, sd);
	tmp = _mm_mul_ps(tmp, twos);
	exponent = _mm_div_ps(exponent, tmp);
	// exponent = exp(exponent);
	exponent = _mm_exp_ps(exponent);
	// exponent /= sd * sqrt(2 * pi)
	tmp = _mm_mul_ps(sd, sqrt_pi_2_s);
	tmp = _mm_div_ps(exponent, tmp);


#ifndef NDEBUG
	const float* _result = (float*)&tmp;
	boost::math::normal_distribution<float> cNormal(_mean, _stdDev);
	assert(fastabs(_result[0] - boost::math::pdf(cNormal, _partitions[0])) < 0.001f);
	assert(fastabs(_result[1] - boost::math::pdf(cNormal, _partitions[1])) < 0.001f);
	assert(fastabs(_result[2] - boost::math::pdf(cNormal, _partitions[2])) < 0.001f);
	assert(fastabs(_result[3] - boost::math::pdf(cNormal, _partitions[3])) < 0.001f);
#endif

	// return result:
	_mm_store_ps(result, tmp);
}
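_mm_exp_ps is not a baseline SSE intrinsic; it is typically supplied by Intel's SVML (or a compatible vector math library), so this function will not build as-is with plain GCC or Clang. A minimal, slower fallback sketch, assuming only <math.h> and the SSE headers, evaluates expf per lane:

/* hypothetical fallback for toolchains without an _mm_exp_ps intrinsic */
static inline __m128 exp_ps_fallback(__m128 x)
{
	float tmp[4] __attribute__((aligned(16)));
	_mm_store_ps(tmp, x);
	for (int i = 0; i < 4; ++i)
		tmp[i] = expf(tmp[i]);
	return _mm_load_ps(tmp);
}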
Example #10
static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
{
  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
  const __m128 kThresh = _mm_set1_ps(aec->errThresh);
  const __m128 kMu = _mm_set1_ps(aec->mu);

  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
    const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
    const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);

    const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
    __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
    __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
    const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
    const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
    const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
    const __m128 absEf = _mm_sqrt_ps(ef_sum2);
    const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
    __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
    const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
    __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
    __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
    ef_re_if = _mm_and_ps(bigger, ef_re_if);
    ef_im_if = _mm_and_ps(bigger, ef_im_if);
    ef_re = _mm_andnot_ps(bigger, ef_re);
    ef_im = _mm_andnot_ps(bigger, ef_im);
    ef_re = _mm_or_ps(ef_re, ef_re_if);
    ef_im = _mm_or_ps(ef_im, ef_im_if);
    ef_re = _mm_mul_ps(ef_re, kMu);
    ef_im = _mm_mul_ps(ef_im, kMu);

    _mm_storeu_ps(&ef[0][i], ef_re);
    _mm_storeu_ps(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < (PART_LEN1); i++) {
    float absEf;
    ef[0][i] /= (aec->xPow[i] + 1e-10f);
    ef[1][i] /= (aec->xPow[i] + 1e-10f);
    absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (absEf > aec->errThresh) {
      absEf = aec->errThresh / (absEf + 1e-10f);
      ef[0][i] *= absEf;
      ef[1][i] *= absEf;
    }

    // Stepsize factor
    ef[0][i] *= aec->mu;
    ef[1][i] *= aec->mu;
  }
}
Example #11
	inline float16 operator / (const float16 & a, const float16 & b)
	{
		float16 res;
			
		res.x.m128 = _mm_div_ps(a.x.m128, b.x.m128);
		res.y.m128 = _mm_div_ps(a.y.m128, b.y.m128);
		res.z.m128 = _mm_div_ps(a.z.m128, b.z.m128);
		res.w.m128 = _mm_div_ps(a.w.m128, b.w.m128);
			
		return res;
	}
Example #12
static inline __m128 curve_vec4(
    const __m128 x,
    const __m128 g,
    const __m128 sigma,
    const __m128 shadows,
    const __m128 highlights,
    const __m128 clarity)
{
  // TODO: pull these non-data-dependent constants out of the loop to see
  // whether the compiler fails to do so
  const __m128 const0 = _mm_set_ps1(0x3f800000u);
  const __m128 const1 = _mm_set_ps1(0x402DF854u); // for e^x
  const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
  const __m128 one = _mm_set1_ps(1.0f);
  const __m128 two = _mm_set1_ps(2.0f);
  const __m128 twothirds = _mm_set1_ps(2.0f/3.0f);
  const __m128 twosig = _mm_mul_ps(two, sigma);
  const __m128 sigma2 = _mm_mul_ps(sigma, sigma);
  const __m128 s22 = _mm_mul_ps(twothirds, sigma2);

  const __m128 c = _mm_sub_ps(x, g);
  const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps());
  // select shadows or highlights as multiplier for linear part, based on c < 0
  const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights));
  // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma)
  const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask));
  // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma
  const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma))));

  const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(),
        _mm_div_ps(c, _mm_mul_ps(two, ssigma))));
  const __m128 t2 = _mm_mul_ps(t, t);
  const __m128 mt = _mm_sub_ps(one, t);

  // midtone value fading over to linear part, without local contrast:
  const __m128 vmid = _mm_add_ps(g,
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)),
        _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi)))));

  // |c| > 2*sigma?
  const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig);
  const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid));

  // midtone local contrast
  // dt_fast_expf in sse:
  const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22));
  const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0)));
  const __m128 k = _mm_max_ps(k0, _mm_setzero_ps());
  const __m128i ki = _mm_cvtps_epi32(k);
  const __m128 gauss = _mm_load_ps((float*)&ki);
  const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss));
  return _mm_add_ps(val, vcon);
}
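The const0/const1 pair implements the bit-trick exponential referenced by the "dt_fast_expf in sse" comment: 0x3f800000 is the bit pattern of 1.0f and 0x402DF854 that of e, so blending between them linearly in arg, clamping at zero, converting to int and reinterpreting the bits as a float approximates e^arg. A scalar transcription of those same steps (an illustration, not taken from the original file) looks like this:

static inline float fast_expf_sketch(const float x)
{
  const float f0 = (float)0x3f800000u;      // bit pattern of 1.0f (const0)
  const float f1 = (float)0x402DF854u;      // bit pattern of e    (const1)
  const float k0 = f0 + x * (f1 - f0);      // linear blend, as with arg above
  const float k = k0 > 0.0f ? k0 : 0.0f;    // clamp, like _mm_max_ps
  const int ki = (int)k;                    // like _mm_cvtps_epi32
  union { int i; float f; } u;              // reinterpret the int bits as float
  u.i = ki;
  return u.f;                               // ~= expf(x)
}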
Example #13
void
process(
  struct dt_iop_module_t *self,
  dt_dev_pixelpipe_iop_t *piece,
  const void *const ivoid,
  void *ovoid,
  const dt_iop_roi_t *const roi_in,
  const dt_iop_roi_t *const roi_out)
{
  const  float divider  = (float)UINT16_MAX;
  const __m128 dividers = _mm_set_ps1(divider);

#ifdef _OPENMP
  #pragma omp parallel for default(none) schedule(static) shared(ovoid)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    const uint16_t *in = ((uint16_t *)ivoid) + (size_t)j * roi_out->width;
    float *out = ((float *)ovoid) + (size_t)j * roi_out->width;

    int i = 0;
    int alignment = ((8 - (j * roi_out->width & (8 - 1))) & (8 - 1));

    // process unaligned pixels
    for ( ; i < alignment ; i++, out++, in++)
      *out = ((float)(*in)) / divider;

    // process aligned pixels with SSE
    for( ; i < roi_out->width - (8 - 1); i += 8, in += 8)
    {
      const __m128i input = _mm_load_si128((__m128i *)in);

      __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0));
      __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0));

      __m128 flo = _mm_cvtepi32_ps(ilo);
      __m128 fhi = _mm_cvtepi32_ps(ihi);

      flo = _mm_div_ps(flo, dividers);
      fhi = _mm_div_ps(fhi, dividers);

      _mm_stream_ps(out, flo);
      out += 4;
      _mm_stream_ps(out, fhi);
      out += 4;
    }

    // process the rest
    for( ; i < roi_out->width; i++, out++, in++)
      *out = ((float)(*in)) / divider;
  }
  _mm_sfence();
}
Example #14
static void spline_n_4(int i, float t, float *knot, float *splineVal) {
	knot += i + 1;

#ifdef _M_SSE
	const __m128 knot012 = _mm_loadu_ps(&knot[0]);
	const __m128 knot345 = _mm_loadu_ps(&knot[3]);
	const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
	const __m128 f30_41_52 = _mm_div_ps(t012, _mm_sub_ps(knot345, knot012));

	const __m128 knot343 = _mm_shuffle_ps(knot345, knot345, _MM_SHUFFLE(3, 0, 1, 0));
	const __m128 knot122 = _mm_shuffle_ps(knot012, knot012, _MM_SHUFFLE(3, 2, 2, 1));
	const __m128 t122 = _mm_shuffle_ps(t012, t012, _MM_SHUFFLE(3, 2, 2, 1));
	const __m128 f31_42_32 = _mm_div_ps(t122, _mm_sub_ps(knot343, knot122));

	// It's still faster to use SSE, even with this.
	float MEMORY_ALIGNED16(ff30_41_52[4]);
	float MEMORY_ALIGNED16(ff31_42_32[4]);
	_mm_store_ps(ff30_41_52, f30_41_52);
	_mm_store_ps(ff31_42_32, f31_42_32);

	const float &f30 = ff30_41_52[0];
	const float &f41 = ff30_41_52[1];
	const float &f52 = ff30_41_52[2];
	const float &f31 = ff31_42_32[0];
	const float &f42 = ff31_42_32[1];
	const float &f32 = ff31_42_32[2];
#else
	// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
	float t0 = (t - knot[0]);
	float t1 = (t - knot[1]);
	float t2 = (t - knot[2]);
	// TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
	float f30 = t0/(knot[3]-knot[0]);
	float f41 = t1/(knot[4]-knot[1]);
	float f52 = t2/(knot[5]-knot[2]);
	float f31 = t1/(knot[3]-knot[1]);
	float f42 = t2/(knot[4]-knot[2]);
	float f32 = t2/(knot[3]-knot[2]);
#endif

	float a = (1-f30)*(1-f31);
	float b = (f31*f41);
	float c = (1-f41)*(1-f42);
	float d = (f42*f52);

	splineVal[0] = a-(a*f32);
	splineVal[1] = 1-a-b+((a+b+c-1)*f32);
	splineVal[2] = b+((1-b-c-d)*f32);
	splineVal[3] = d*f32;
}
Example #15
/* use compiler intrinsics for 4x parallel processing */
static inline float chi2_intrinsic_aligned_float(int n, const float* x, const float* y) {
    float result=0;
    const __m128 eps = _mm_set1_ps(FLT_MIN);
    const __m128 zero = _mm_setzero_ps();
    __m128 chi2 = _mm_setzero_ps();
    
    for (; n>3; n-=4) {
        const __m128 a = _mm_loadu_ps(x);
        const __m128 b = _mm_loadu_ps(y);
        const __m128 a_plus_eps = _mm_add_ps(a,eps);
        const __m128 a_plus_b_plus_eps = _mm_add_ps(a_plus_eps,b);
        const __m128 a_minus_b = _mm_sub_ps(a,b);
        const __m128 a_minus_b_sq = _mm_mul_ps(a_minus_b, a_minus_b);
        const __m128 prod = _mm_div_ps(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_ps(chi2, prod);
	x+=4;
	y+=4;
    }
    const __m128 shuffle1 = _mm_shuffle_ps(chi2, chi2, _MM_SHUFFLE(1,0,3,2));
    const __m128 sum1 = _mm_add_ps(chi2, shuffle1);
    const __m128 shuffle2 = _mm_shuffle_ps(sum1, sum1, _MM_SHUFFLE(2,3,0,1));
    const __m128 sum2 = _mm_add_ps(sum1, shuffle2);
// with SSE3, we could use hadd_ps, but the difference is negligible 

    _mm_store_ss(&result,sum2);
    _mm_empty();
    
    if (n)
        result += chi2_baseline_float(n, x, y);	// remaining 1-3 entries
    return result;
}
Example #16
RETf DIV(const __m128 x, const __m128 y) {
#ifdef __ARM_NEON__ // NEON doesn't seem to support this
    return x / y;
#else
    return _mm_div_ps(x, y);
#endif
}
Example #17
void warmup_vector(float* x, float* y, int size, float* alpha){
    int i;
    __m128 RX_2, RX_2i, RY, R_t1, R_t2, R_alpha;
    R_alpha = _mm_load_ps(&alpha[0]);
    for(i=0; i<size; i+=4){
        //load our various x values
        RX_2 = _mm_load_ps(&x[i*2]);
        //&x[2*i+1] is never 16-byte aligned, so the original aligned load
        //faulted here; use the unaligned load instead
        RX_2i = _mm_loadu_ps(&x[2*i+1]);
        //note: the two loads overlap by three elements, so this does not
        //compute the per-element y[i] = x[2*i]*x[2*i] + x[2*i+1]/alpha that
        //the comments suggest; that would need the even/odd lanes
        //deinterleaved first (e.g. with _mm_shuffle_ps)
        //perform x[2*i+1]/alpha -> store in R_t1
        R_t1 = _mm_div_ps(RX_2i, R_alpha);
        //multiply x[2*i] by x[2*i] -> store in R_t2
        R_t2 = _mm_mul_ps(RX_2, RX_2);
        //add the two terms; this is stored as our new y[i] values (in RY)
        RY = _mm_add_ps(R_t1, R_t2);
        // copy everything to y[i]
        _mm_store_ps(&y[i], RY);
    }
}
Example #18
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ, __m128 accIx, __m128 accIy, __m128 accIz, float *accI) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    accIx = _mm_add_ps(accIx, aix);
    accIy = _mm_add_ps(accIy, aiy);
    accIz = _mm_add_ps(accIz, aiz);

    _mm_storer_ps(accI, accIx);
    _mm_storer_ps(accI + 4, accIy);
    _mm_storer_ps(accI + 8, accIz);
}
Example #19
/** returns a kernel interpolated element-wise between the amp's two
 * kernels, sse version
 *
 * @param kernel the return pointer for the kernel
 * @param amp the amp which holds the kernels
 */
void average_kernels_sse(float *kernel, Amp * amp)
{
    int i;
	__m128 right;
	__m128 left;
	__m128 out;
	__m128 leftindex = {1.0f, 2.0f, 3.0f, 4.0f};
	__m128 rightindex = { FOURIER_SIZE - 1.0, FOURIER_SIZE - 2.0, FOURIER_SIZE - 3.0, FOURIER_SIZE - 4.0};
	__m128 fouriersize = _mm_set_ps1((float)FOURIER_SIZE);
	__m128 jumpsize = _mm_set_ps1(4.0f);
    for (i = 0; i + 3 < FOURIER_SIZE; i=i+4) {
		left = _mm_loadu_ps(amp->previous_buffer + i);
		right = _mm_loadu_ps(amp->fourier_buffer + i);

		out = _mm_div_ps( _mm_add_ps( _mm_mul_ps(left, leftindex), _mm_mul_ps(right, rightindex)), fouriersize);

		leftindex = _mm_add_ps(leftindex, jumpsize);
		rightindex = _mm_sub_ps(rightindex, jumpsize);

		_mm_store_ps(kernel + i, out);
	}
	for (; i < FOURIER_SIZE; i++) {
        kernel[i] =
            (amp->previous_buffer[i] * (i + 1) +
             amp->fourier_buffer[i] * (FOURIER_SIZE - i -
                                       1)) / (FOURIER_SIZE);
    }
}
Example #20
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ, float3(&accI)[4]) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    for (int i = 0; i < 4; i++) {
        accI[3 - i].x = aix.m128_f32[i];
        accI[3 - i].y = aiy.m128_f32[i];
        accI[3 - i].z = aiz.m128_f32[i];
    }

}
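The .m128_f32 member used in the final loop is an MSVC-specific field of __m128; GCC and Clang do not provide it. A portable sketch of the same read-back (keeping the reversed lane order) stores the vectors to aligned temporaries first:

    // hypothetical portable replacement for the m128_f32 loop above
    alignas(16) float ax[4], ay[4], az[4];
    _mm_store_ps(ax, aix);
    _mm_store_ps(ay, aiy);
    _mm_store_ps(az, aiz);
    for (int i = 0; i < 4; i++) {
        accI[3 - i].x = ax[i];
        accI[3 - i].y = ay[i];
        accI[3 - i].z = az[i];
    }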
Example #21
void calculateSSE(int start, int end)
{
    int size = end - start + 1;

    // the result buffer is accessed through a __m128* cast below, so it must
    // be 16-byte aligned; aligned SSE loads/stores through that pointer would
    // fault on an unaligned allocation
    float* result = (float*)aligned_alloc(16, size * sizeof(float));

    __m128 x;
    __m128 delta_x = _mm_set_ps1(4.0f);
    __m128 y = _mm_set_ps1(1.0f);
    __m128* sse_result = (__m128*)result;

    const int sse_length = size / 4;
    x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    for (int loop = 0; loop < 100000; ++loop)
    {
        for (int i = 0; i < sse_length; ++i)
        {
            __m128 sqrt_result = _mm_sqrt_ps(x);
            sse_result[i] = _mm_div_ps(sqrt_result, x);
            //sse_result[i] = _mm_add_ps(x, y);

            // move x value to next 4 numbers
            x = _mm_add_ps(x, delta_x);
        }        
    }

    free(result);
}
Example #22
static inline void boxVerticesToScreenVertices(vec4f vertices[8],const vec4f& screenCenterMul,vec2i screenCenter){
#ifdef ARPHEG_ARCH_X86
	__m128 screenSpaceMul = _mm_load_ps((float*)&screenCenterMul.x);
	__m128 screenCenterOffset = _mm_setr_ps(float(screenCenter.x),float(screenCenter.y),0,0);
	__m128 nearClip = _mm_setzero_ps();
	for(uint32 i = 0;i<8;++i){
		__m128 hv = _mm_load_ps((float*)(vertices + i));
		
		__m128 w  = _mm_shuffle_ps(hv,hv,_MM_SHUFFLE(3,3,3,3)); //get the w component
		__m128 z  = _mm_shuffle_ps(hv,hv,_MM_SHUFFLE(2,2,2,2));
		hv = _mm_div_ps(hv,w); //Perspective divide: clip-space XYZW -> normalized device coordinates
		
		hv = _mm_mul_ps(hv,screenSpaceMul); //XY to screen space    [-width/2,-height/2 -> width/2,height/2]
		hv = _mm_add_ps(hv,screenCenterOffset);//XY to screen space [0,0 -> width,height]
		__m128 mNoNearClip = _mm_cmpge_ps(z, nearClip );

		//Set to all-0 if near-clipped
		hv = _mm_and_ps(hv, mNoNearClip);

		_mm_store_ps((float*)(vertices + i),hv);
	}
#else
	//TODO
	ScreenSpaceVertex* screenVerts= (ScreenSpaceVertex*)vertices;
	for(uint32 i =0;i<8;++i){
		vertices[i] = vertices[i] * (1.0f/vertices[i].w) ;
		auto v = vertices[i] * screenCenterMul;
		screenVerts[i].pos = vec2i(int32(v.x),int32(v.y))+screenCenter;
	}
#endif
}
Example #23
static inline __m128 sigmoid_positive_ps( __m128 xin )
{
	union {
		__m128i i;
		int32_t i32[4];
	} i;
	__m128 ex;
	float *ex_elem = (float*) &ex;
	__m128 x1 = _mm_min_ps( xin, tens.ps );

	x1 = _mm_mul_ps( x1, tens.ps );
	i.i  = _mm_cvttps_epi32( x1 );
 	ex_elem[0] = e[i.i32[0]];
	ex_elem[1] = e[i.i32[1]];
	ex_elem[2] = e[i.i32[2]];
	ex_elem[3] = e[i.i32[3]];
	x1 = _mm_sub_ps( x1, _mm_cvtepi32_ps( i.i ) ); 
	x1 = _mm_add_ps( x1, tens.ps );
	x1 = _mm_mul_ps( x1, ex );
	x1 = _mm_add_ps( x1, ones.ps ); 
#ifdef __FAST_MATH__
	return _mm_rcp_ps( x1 );
#else
	return _mm_div_ps( ones.ps, x1 );
#endif
}
Example #24
	inline float4 operator / (const float4 & a, const float4 & b)
	{
		float4 res;

		res.m128 = _mm_div_ps(a.m128, b.m128);

		return res;
	}
Example #25
	inline void operator()(const IrradianceSample &sample) {
		/* Distance to the positive point source of the dipole */
		const __m128 lengthSquared = _mm_set1_ps((p - sample.p).lengthSquared()),
			drSqr = _mm_add_ps(zrSqr, lengthSquared), 
			dvSqr = _mm_add_ps(zvSqr, lengthSquared),
			dr = _mm_sqrt_ps(drSqr), dv = _mm_sqrt_ps(dvSqr), 
			one = _mm_set1_ps(1.0f),
			factor = _mm_mul_ps(_mm_set1_ps(0.25f*INV_PI*sample.area * Fdt),
				_mm_set_ps(sample.E[0], sample.E[1], sample.E[2], 0)),
			C1fac = _mm_div_ps(_mm_mul_ps(zr, _mm_add_ps(sigmaTr, _mm_div_ps(one, dr))), drSqr),
			C2fac = _mm_div_ps(_mm_mul_ps(zv, _mm_add_ps(sigmaTr, _mm_div_ps(one, dv))), dvSqr);
		SSEVector temp1(_mm_mul_ps(dr, sigmaTr)), temp2(_mm_mul_ps(dv, sigmaTr));
		const __m128
			exp1 = _mm_set_ps(expf(-temp1.f[3]), expf(-temp1.f[2]), expf(-temp1.f[1]), 0),
			exp2 = _mm_set_ps(expf(-temp2.f[3]), expf(-temp2.f[2]), expf(-temp2.f[1]), 0);
		result.ps = _mm_add_ps(result.ps, _mm_mul_ps(factor, _mm_add_ps(
			_mm_mul_ps(C1fac, exp1), _mm_mul_ps(C2fac, exp2))));
	}
Example #26
 static void vectorDiv(float* a, float* b, float* c, size_t n) {
         __m128 A, B, C;
         for(size_t i = 0; i < n; i += 4) {
                 A = _mm_load_ps(&a[i]);
                 B = _mm_load_ps(&b[i]);
                 C = _mm_div_ps(A, B);
                 _mm_store_ps(&c[i], C);
         }
 }
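vectorDiv uses the aligned _mm_load_ps/_mm_store_ps forms and advances four lanes per iteration, so the caller must supply 16-byte-aligned buffers and an n that is a multiple of 4 (or divide the tail separately). A minimal usage sketch under those assumptions, placed in the same translation unit since the function is static:

#include <stdlib.h>

int main(void) {
        const size_t n = 16;                      /* multiple of 4 */
        float *a = aligned_alloc(16, n * sizeof(float));
        float *b = aligned_alloc(16, n * sizeof(float));
        float *c = aligned_alloc(16, n * sizeof(float));
        for (size_t i = 0; i < n; i++) { a[i] = (float)(i + 1); b[i] = 2.0f; }
        vectorDiv(a, b, c, n);                    /* c[i] = a[i] / b[i] */
        free(a); free(b); free(c);
        return 0;
}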
Example #27
static float Atan(float y, float x) // #include <emmintrin.h>, #include <xmmintrin.h>
{
	const __m128 _ps_atan_p0 = _mm_set1_ps(-0.0464964749f);
	const __m128 _ps_atan_p1 = _mm_set1_ps(0.15931422f);
	const __m128 _ps_atan_p2 = _mm_set1_ps(0.327622764f);

	const __m128 _ps_pi    = _mm_set1_ps(pi);
	const __m128 _ps_pi0p5 = _mm_set1_ps(pi0p5);

	const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
	const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

	__m128 mm1, mm2, mm3;
	__m128 axm, aym;

	__m128 xm = _mm_set1_ps(x);
	__m128 ym = _mm_set1_ps(y);

	axm = _mm_and_ps(xm, _mask_sign_inv);
	aym = _mm_and_ps(ym, _mask_sign_inv);

	mm1 = _mm_min_ps(axm, aym);
	mm2 = _mm_max_ps(axm, aym);
	mm1 = _mm_div_ps(mm1, mm2);
	mm2 = _mm_mul_ps(mm1, mm1);

	mm3 = _mm_mul_ps(mm2, _ps_atan_p0);
	mm3 = _mm_add_ps(mm3, _ps_atan_p1);
	mm3 = _mm_mul_ps(mm3, mm2);
	mm3 = _mm_sub_ps(mm3, _ps_atan_p2);
	mm3 = _mm_mul_ps(mm3, mm2);
	mm3 = _mm_mul_ps(mm3, mm1);
	mm3 = _mm_add_ps(mm3, mm1);

	__m128 mask;

	/* |y| > |x| */
	mask = _mm_cmpgt_ss(aym, axm);
	mm2 = _mm_and_ps(_ps_pi0p5, mask);
	mm1 = _mm_and_ps(_mask_sign_raw, mask);
	mm3 = _mm_xor_ps(mm3, mm1);
	mm3 = _mm_add_ps(mm3, mm2);

	/* x < 0 */
	mask = _mm_and_ps(xm, _mask_sign_raw);
	mm3 = _mm_xor_ps(mm3, mask);
	mm1 = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mm3), 30));
	mm1 = _mm_and_ps(_ps_pi, mm1);
	mm3 = _mm_add_ps(mm3, mm1);

	/* y < 0 */
	mm1 = _mm_and_ps(ym, _mask_sign_raw);
	mm3 = _mm_xor_ps(mm3, mm1);

	return _mm_cvtss_f32(mm3);
}
Example #28
    SIMDValue SIMDFloat32x4Operation::OpDiv(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

        x86Result.m128_value = _mm_div_ps(tmpaValue.m128_value, tmpbValue.m128_value); // a / b

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example #29
/*
  V_SgDivideOp
*/
__SIMD _SIMD_div_ps(__SIMD a, __SIMD b)
{
#ifdef  USE_SSE
  return _mm_div_ps(a,b);
#elif defined USE_AVX
  return _mm256_div_ps(a,b);
#elif defined USE_IBM
  return vec_div(a,b);
#endif
}
Example #30
    SIMDValue SIMDFloat32x4Operation::OpReciprocalSqrt(const SIMDValue& value)
    {
        X86SIMDValue x86Result;
        X86SIMDValue temp;
        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

        temp.m128_value = _mm_div_ps(X86_ALL_ONES_F4.m128_value, v.m128_value); // temp = 1.0/value
        x86Result.m128_value = _mm_sqrt_ps(temp.m128_value);              // result = sqrt(1.0/value)

        return X86SIMDValue::ToSIMDValue(x86Result);
    }