Example #1
0
__m128 PowSSE_FixedPoint_Exponent(__m128 x, int exponent)
{
	__m128 rslt=Four_Ones;									// x^0=1.0
	int xp=abs(exponent);
	if (xp & 3)												// fraction present?
	{
		__m128 sq_rt=_mm_sqrt_ps(x);
		if (xp & 1)											// .25?
			rslt=_mm_sqrt_ps(sq_rt);						// x^.25
		if (xp & 2)
			rslt=_mm_mul_ps(rslt,sq_rt);
	}
	xp>>=2;													// strip fraction
	__m128 curpower=x;										// curpower iterates through  x,x^2,x^4,x^8,x^16...

	while(1)
	{
		if (xp & 1)
			rslt=_mm_mul_ps(rslt,curpower);
		xp>>=1;
		if (xp)
			curpower=_mm_mul_ps(curpower,curpower);
		else
			break;
	}
	if (exponent<0)
		return _mm_rcp_ps(rslt);							// pow(x,-b)=1/pow(x,b)
	else
		return rslt;
}
Example #2
0
__m128
t4(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_xor_ps (a,b);
}
Example #3
0
__m128
t2(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_andnot_ps (a,b);
}
Example #4
0
void
init_xrpow_core_sse(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * sum)
{
    int     i;
    float   tmp_max = 0;
    float   tmp_sum = 0;
    int     upper4 = (upper / 4) * 4;
    int     rest = upper-upper4;

    const vecfloat_union fabs_mask = {{ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }};
    const __m128 vec_fabs_mask = _mm_loadu_ps(&fabs_mask._float[0]);
    vecfloat_union vec_xrpow_max;
    vecfloat_union vec_sum;
    vecfloat_union vec_tmp;

    _mm_prefetch((char *) cod_info->xr, _MM_HINT_T0);
    _mm_prefetch((char *) xrpow, _MM_HINT_T0);

    vec_xrpow_max._m128 = _mm_set_ps1(0);
    vec_sum._m128 = _mm_set_ps1(0);

    for (i = 0; i < upper4; i += 4) {
        vec_tmp._m128 = _mm_loadu_ps(&(cod_info->xr[i])); /* load */
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        _mm_storeu_ps(&(xrpow[i]), vec_tmp._m128); /* store into xrpow[] */
    }
    vec_tmp._m128 = _mm_set_ps1(0);
    switch (rest) {
        case 3: vec_tmp._float[2] = cod_info->xr[upper4+2];
        case 2: vec_tmp._float[1] = cod_info->xr[upper4+1];
        case 1: vec_tmp._float[0] = cod_info->xr[upper4+0];
            vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
            vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
            vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
            vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
            switch (rest) {
                case 3: xrpow[upper4+2] = vec_tmp._float[2];
                case 2: xrpow[upper4+1] = vec_tmp._float[1];
                case 1: xrpow[upper4+0] = vec_tmp._float[0];
                default:
                    break;
            }
        default:
            break;
    }
    tmp_sum = vec_sum._float[0] + vec_sum._float[1] + vec_sum._float[2] + vec_sum._float[3];
    {
        float ma = vec_xrpow_max._float[0] > vec_xrpow_max._float[1]
                ? vec_xrpow_max._float[0] : vec_xrpow_max._float[1];
        float mb = vec_xrpow_max._float[2] > vec_xrpow_max._float[3]
                ? vec_xrpow_max._float[2] : vec_xrpow_max._float[3];
        tmp_max = ma > mb ? ma : mb;
    }
    cod_info->xrpow_max = tmp_max;
    *sum = tmp_sum;
}
Example #5
0
void calculateSSE(int start, int end)
{
    int size = end - start + 1;

    // we use aligned memory, because SSE instructions are really slow
    // working on unaligned memory
    float* result = (float*)aligned_alloc(16, size * sizeof(float));

    __m128 x;
    __m128 delta_x = _mm_set_ps1(4.0f);
    __m128 y = _mm_set_ps1(1.0f);
    __m128* sse_result = (__m128*)result;

    const int sse_length = size / 4;
    x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    for (int loop = 0; loop < 100000; ++loop)
    {
        for (int i = 0; i < sse_length; ++i)
        {
            __m128 sqrt_result = _mm_sqrt_ps(x);
            sse_result[i] = _mm_div_ps(sqrt_result, x);
            //sse_result[i] = _mm_add_ps(x, y);

            // move x value to next 4 numbers
            x = _mm_add_ps(x, delta_x);
        }        
    }
}
inline void Sphere::intersect_sse(__m128 &rox, __m128 &roy, __m128 &roz,__m128 &rdx, __m128 &rdy, __m128 &rdz, __m128 &t, __m128 &hit_res){
    __m128 dist_x = _mm_sub_ps(_mm_set1_ps(c.x), rox);
    __m128 dist_y = _mm_sub_ps(_mm_set1_ps(c.y), roy);
    __m128 dist_z = _mm_sub_ps(_mm_set1_ps(c.z), roz);
    
    __m128 B = dot_sse(rdx,rdy,rdz, dist_x, dist_y, dist_z);
    
	__m128 D = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(B,B), dot_sse(dist_x, dist_y, dist_z, dist_x, dist_y, dist_z)), _mm_set_ps1(r * r));  
    
    __m128 sq = _mm_sqrt_ps(D);
    
    __m128 t0 = _mm_sub_ps(B, sq);
    __m128 t1 = _mm_add_ps(B, sq);
    
    hit_res = _mm_set1_epi32(MISS);
    
    for (int i=0;i<4;++i){
        if ((t0[i] > 0.1f) && (t0[i] < t[i])){
            t[i] = t0[i];
            hit_res[i] = HIT;
        }
    }
    
    for (int i=0;i<4;++i){
        if ((t1[i] > 0.1f) && (t1[i] < t[i])){
            t[i] = t1[i];
            hit_res[i] = HIT;
        }
    }
}
Example #7
0
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ, __m128 accIx, __m128 accIy, __m128 accIz, float *accI) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    accIx = _mm_add_ps(accIx, aix);
    accIy = _mm_add_ps(accIy, aiy);
    accIz = _mm_add_ps(accIz, aiz);

    _mm_storer_ps(accI, accIx);
    _mm_storer_ps(accI + 4, accIy);
    _mm_storer_ps(accI + 8, accIz);
}
Example #8
0
float vec3::length() const {
    __m128 temp = _mm_mul_ps(v, v);
    __m128 temp2 = _mm_shuffle_ps(temp, temp, 0xFD);
    temp2 = _mm_add_ps(temp, temp2);
    temp2 = _mm_add_ps(temp2, _mm_shuffle_ps(temp, temp, 0xFE));
    return _mm_cvtss_f32(_mm_sqrt_ps(temp2));
}
Example #9
0
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ, float3(&accI)[4]) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    for (int i = 0; i < 4; i++) {
        accI[3 - i].x = aix.m128_f32[i];
        accI[3 - i].y = aiy.m128_f32[i];
        accI[3 - i].z = aiz.m128_f32[i];
    }

}
Example #10
0
void
mandel_sse2(unsigned char *image, const struct spec *s)
{
    __m128 xmin = _mm_set_ps1(s->xlim[0]);
    __m128 ymin = _mm_set_ps1(s->ylim[0]);
    __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width);
    __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height);
    __m128 threshold = _mm_set_ps1(4);
    __m128 one = _mm_set_ps1(1);
    __m128i zero = _mm_setzero_si128();
    __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations);
    __m128 depth_scale = _mm_set_ps1(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 4) {
            __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0);
            __m128 my = _mm_set_ps1(y);
            __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin);
            __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin);
            __m128 zr = cr;
            __m128 zi = ci;
            int k = 1;
            __m128 mk = _mm_set_ps1(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m128 zr2 = _mm_mul_ps(zr, zr);
                __m128 zi2 = _mm_mul_ps(zi, zi);
                __m128 zrzi = _mm_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr);
                zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm_mul_ps(zr, zr);
                zi2 = _mm_mul_ps(zi, zi);
                __m128 mag2 = _mm_add_ps(zr2, zi2);
                __m128 mask = _mm_cmplt_ps(mag2, threshold);
                mk = _mm_add_ps(_mm_and_ps(mask, one), mk);

                /* Early bailout? */
                __m128i maski = _mm_castps_si128(mask);
                if (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(maski, zero)))
                    break;
            }
            mk = _mm_mul_ps(mk, iter_scale);
            mk = _mm_sqrt_ps(mk);
            mk = _mm_mul_ps(mk, depth_scale);
            __m128i pixels = _mm_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 4; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
Example #11
0
	inline void operator()(const IrradianceSample &sample) {
		/* Distance to the positive point source of the dipole */
		const __m128 lengthSquared = _mm_set1_ps((p - sample.p).lengthSquared()),
			drSqr = _mm_add_ps(zrSqr, lengthSquared), 
			dvSqr = _mm_add_ps(zvSqr, lengthSquared),
			dr = _mm_sqrt_ps(drSqr), dv = _mm_sqrt_ps(dvSqr), 
			one = _mm_set1_ps(1.0f),
			factor = _mm_mul_ps(_mm_set1_ps(0.25f*INV_PI*sample.area * Fdt),
				_mm_set_ps(sample.E[0], sample.E[1], sample.E[2], 0)),
			C1fac = _mm_div_ps(_mm_mul_ps(zr, _mm_add_ps(sigmaTr, _mm_div_ps(one, dr))), drSqr),
			C2fac = _mm_div_ps(_mm_mul_ps(zv, _mm_add_ps(sigmaTr, _mm_div_ps(one, dv))), dvSqr);
		SSEVector temp1(_mm_mul_ps(dr, sigmaTr)), temp2(_mm_mul_ps(dv, sigmaTr));
		const __m128
			exp1 = _mm_set_ps(expf(-temp1.f[3]), expf(-temp1.f[2]), expf(-temp1.f[1]), 0),
			exp2 = _mm_set_ps(expf(-temp2.f[3]), expf(-temp2.f[2]), expf(-temp2.f[1]), 0);
		result.ps = _mm_add_ps(result.ps, _mm_mul_ps(factor, _mm_add_ps(
			_mm_mul_ps(C1fac, exp1), _mm_mul_ps(C2fac, exp2))));
	}
Example #12
0
/*
  Sqrt
*/
__SIMD _SIMD_sqrt_ps(__SIMD a)
{
#ifdef  USE_SSE
  return _mm_sqrt_ps(a);
#elif defined USE_AVX
  return _mm256_sqrt_ps(a);
#elif defined USE_IBM
  return vec_sqrt(a);
#endif
}
    SIMDValue SIMDFloat32x4Operation::OpSqrt(const SIMDValue& value)
    {
        X86SIMDValue x86Result;

        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

        x86Result.m128_value = _mm_sqrt_ps(v.m128_value);   // result = sqrt(value)

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
{
  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
  const __m128 kThresh = _mm_set1_ps(aec->errThresh);
  const __m128 kMu = _mm_set1_ps(aec->mu);

  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
    const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
    const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);

    const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
    __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
    __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
    const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
    const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
    const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
    const __m128 absEf = _mm_sqrt_ps(ef_sum2);
    const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
    __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
    const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
    __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
    __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
    ef_re_if = _mm_and_ps(bigger, ef_re_if);
    ef_im_if = _mm_and_ps(bigger, ef_im_if);
    ef_re = _mm_andnot_ps(bigger, ef_re);
    ef_im = _mm_andnot_ps(bigger, ef_im);
    ef_re = _mm_or_ps(ef_re, ef_re_if);
    ef_im = _mm_or_ps(ef_im, ef_im_if);
    ef_re = _mm_mul_ps(ef_re, kMu);
    ef_im = _mm_mul_ps(ef_im, kMu);

    _mm_storeu_ps(&ef[0][i], ef_re);
    _mm_storeu_ps(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < (PART_LEN1); i++) {
    float absEf;
    ef[0][i] /= (aec->xPow[i] + 1e-10f);
    ef[1][i] /= (aec->xPow[i] + 1e-10f);
    absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (absEf > aec->errThresh) {
      absEf = aec->errThresh / (absEf + 1e-10f);
      ef[0][i] *= absEf;
      ef[1][i] *= absEf;
    }

    // Stepsize factor
    ef[0][i] *= aec->mu;
    ef[1][i] *= aec->mu;
  }
}
    SIMDValue SIMDFloat32x4Operation::OpReciprocalSqrt(const SIMDValue& value)
    {
        X86SIMDValue x86Result;
        X86SIMDValue temp;
        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

        temp.m128_value = _mm_div_ps(X86_ALL_ONES_F4.m128_value, v.m128_value); // temp = 1.0/value
        x86Result.m128_value = _mm_sqrt_ps(temp.m128_value);              // result = sqrt(1.0/value)

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example #16
0
static inline __v4sf
sse_pow_1_24 (__v4sf x)
{
  __v4sf y, z;
  y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
  x = _mm_sqrt_ps (x);
  /* newton's method for x^(-1/6) */
  z = splat4f (1.f/6.f) * x;
  y = splat4f (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
  y = splat4f (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
  return x*y;
}
Example #17
0
float length() const {
	Vec3 a = *this;
	a.w = 0.0f;

	__m128 &D = a.m128;
	D = _mm_mul_ps(D, D);
	D = _mm_hadd_ps(D, D);
	D = _mm_hadd_ps(D, D);

	D = _mm_sqrt_ps(D);

	return a.x;
}
Example #18
0
/**
 * Identify bends in the chain, where the kappa angle (virtual bond angle from
 * c-alpha i-2, to i, to i+2) is greater than 70 degrees
 * dssp-2.2.0/structure.cpp:1729
 */
static std::vector<int> calculate_bends(const float* xyz, const int* ca_indices,
    const int* chain_ids, const int n_residues, std::vector<int>& skip)
{
    __m128 prev_ca, this_ca, next_ca, u_prime, v_prime, u, v;
    float kappa;
    std::vector<int> is_bend(n_residues, 0);
    for (int i = 2; i < n_residues-2; i++) {
        if (chain_ids[i-2] == chain_ids[i+2] && !skip[i-2] && !skip[i] && !skip[i+2]) {
            prev_ca = load_float3(xyz + 3*ca_indices[i-2]);
            this_ca = load_float3(xyz + 3*ca_indices[i]);
            next_ca = load_float3(xyz + 3*ca_indices[i+2]);
            u_prime = _mm_sub_ps(prev_ca, this_ca);
            v_prime = _mm_sub_ps(this_ca, next_ca);
            /* normalize the vectors u_prime and v_prime */
            u = _mm_div_ps(u_prime, _mm_sqrt_ps(_mm_dp_ps2(u_prime, u_prime, 0x7F)));
            v = _mm_div_ps(v_prime, _mm_sqrt_ps(_mm_dp_ps2(v_prime, v_prime, 0x7F)));
            /* compute the arccos of the dot product. this gives the angle */
            kappa = (float) acos(CLIP(_mm_cvtss_f32(_mm_dp_ps2(u, v, 0x71)), -1, 1));
            is_bend[i] = kappa > (70 * (M_PI / 180.0));
        }
    }
    return is_bend;
}
Example #19
0
/* the fast arctan function adopted from OpenCV */
static void _ccv_atan2(float* x, float* y, float* angle, float* mag, int len)
{
	int i = 0;
	float scale = (float)(180.0 / CCV_PI);
#ifdef HAVE_SSE2
#ifndef _WIN32
	union { int i; float fl; } iabsmask; iabsmask.i = 0x7fffffff;
	__m128 eps = _mm_set1_ps((float)1e-6), absmask = _mm_set1_ps(iabsmask.fl);
	__m128 _90 = _mm_set1_ps((float)(3.141592654 * 0.5)), _180 = _mm_set1_ps((float)3.141592654), _360 = _mm_set1_ps((float)(3.141592654 * 2));
	__m128 zero = _mm_setzero_ps(), _0_28 = _mm_set1_ps(0.28f), scale4 = _mm_set1_ps(scale);
	
	for(; i <= len - 4; i += 4)
	{
		__m128 x4 = _mm_loadu_ps(x + i), y4 = _mm_loadu_ps(y + i);
		__m128 xq4 = _mm_mul_ps(x4, x4), yq4 = _mm_mul_ps(y4, y4);
		__m128 xly = _mm_cmplt_ps(xq4, yq4);
		__m128 z4 = _mm_div_ps(_mm_mul_ps(x4, y4), _mm_add_ps(_mm_add_ps(_mm_max_ps(xq4, yq4), _mm_mul_ps(_mm_min_ps(xq4, yq4), _0_28)), eps));

		// a4 <- x < y ? 90 : 0;
		__m128 a4 = _mm_and_ps(xly, _90);
		// a4 <- (y < 0 ? 360 - a4 : a4) == ((x < y ? y < 0 ? 270 : 90) : (y < 0 ? 360 : 0))
		__m128 mask = _mm_cmplt_ps(y4, zero);
		a4 = _mm_or_ps(_mm_and_ps(_mm_sub_ps(_360, a4), mask), _mm_andnot_ps(mask, a4));
		// a4 <- (x < 0 && !(x < y) ? 180 : a4)
		mask = _mm_andnot_ps(xly, _mm_cmplt_ps(x4, zero));
		a4 = _mm_or_ps(_mm_and_ps(_180, mask), _mm_andnot_ps(mask, a4));
		
		// a4 <- (x < y ? a4 - z4 : a4 + z4)
		a4 = _mm_mul_ps(_mm_add_ps(_mm_xor_ps(z4, _mm_andnot_ps(absmask, xly)), a4), scale4);
		__m128 m4 = _mm_sqrt_ps(_mm_add_ps(xq4, yq4));
		_mm_storeu_ps(angle + i, a4);
		_mm_storeu_ps(mag + i, m4);
	}
#endif
#endif
	for(; i < len; i++)
	{
		float xf = x[i], yf = y[i];
		float a, x2 = xf * xf, y2 = yf * yf;
		if(y2 <= x2)
			a = xf * yf / (x2 + 0.28f * y2 + (float)1e-6) + (float)(xf < 0 ? CCV_PI : yf >= 0 ? 0 : CCV_PI * 2);
		else
			a = (float)(yf >= 0 ? CCV_PI * 0.5 : CCV_PI * 1.5) - xf * yf / (y2 + 0.28f * x2 + (float)1e-6);
		angle[i] = a * scale;
		mag[i] = sqrtf(x2 + y2);
	}
}
Example #20
0
int dihedral(const float* xyz, const int* quartets, float* out,
          const int n_frames, const int n_atoms, const int n_quartets) {
  /* Compute the angle between sets of four atoms in every frame
     of xyz.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous C order.
     quartets : array, shape=(n_quartets, 3)
         The specific quartet of atoms whose angle you want to compute. The
         angle computed will be the torsion around the bound between the
         middle two elements (i.e aABCD). A 2d array of indices, in C order.
     out : array, shape=(n_frames, n_pairs)
         Array where the angles will be stored, in contiguous C order.

     All of the arrays are assumed to be contiguous. This code will
     segfault if they're not.
  */

  int i, j;
  __m128 x0, x1, x2, x3, b1, b2, b3, c1, c2, p1, p2;

  for (i = 0; i < n_frames; i++) {
    for (j = 0; j < n_quartets; j++) {
      x0 = load_float3(xyz + 3*quartets[4*j + 0]);
      x1 = load_float3(xyz + 3*quartets[4*j + 1]);
      x2 = load_float3(xyz + 3*quartets[4*j + 2]);
      x3 = load_float3(xyz + 3*quartets[4*j + 3]);

      b1 = _mm_sub_ps(x1, x0);
      b2 = _mm_sub_ps(x2, x1);
      b3 = _mm_sub_ps(x3, x2);

      c1 = cross(b2, b3);
      c2 = cross(b1, b2);

      p1 = _mm_mul_ps(_mm_dp_ps(b1, c1, 0x71), _mm_sqrt_ps(_mm_dp_ps(b2, b2, 0x71)));
      p2 = _mm_dp_ps(c1, c2, 0x71);

      *(out++) = atan2(_mm_cvtss_f32(p1), _mm_cvtss_f32(p2));
    };
    xyz += n_atoms*3;
  }
  return 1;
}
void sqrt_(register float *dst, register float *src, int w)
{
    register int j = 0;
#if CV_SSE
    if (CPU_SUPPORT_SSE1)
    {
        __m128 a;
        for (; j < w - 3; j += 4)
        {
            a = _mm_sqrt_ps(_mm_loadu_ps(src + j));
            _mm_storeu_ps(dst + j, a);
        }
    }
#endif
    for (; j < w; j++)
        dst[j] = sqrt(src[j]);
}
Example #22
0
void NBodyAlgorithm::calculateAccelerationWithColor(const float3(&posI)[4], const float massJ, const float3 posJ, float3(&accI)[4], unsigned int(&isClose)[4]) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 cmpDistance = _mm_set_ps1(float(mp_properties->positionScale));
    __m128 close = _mm_cmple_ps(rabs, cmpDistance);

    for (int i = 0; i < 4; i++) {
        if (close.m128_f32[i] == 0) {
            isClose[3 - i] = 0;
        }
    }

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    for (int i = 0; i < 4; i++) {
        accI[3 - i].x = aix.m128_f32[i];
        accI[3 - i].y = aiy.m128_f32[i];
        accI[3 - i].z = aiz.m128_f32[i];
    }

}
Example #23
0
void fun_sse(float *a, float *b, int n)
{
    int i, k;
    __m128 x, z;
    __m128 *aa = (__m128 *)a;
    __m128 *bb = (__m128 *)b;

    k = n / 4;
    z = _mm_set_ps1(0.5f);

    for (i = 0; i < k; i++)
    {
        x = _mm_mul_ps(*aa, *aa);
        x = _mm_sqrt_ps(x);
        *bb = _mm_add_ps(x, z);
        aa++;
        bb++;
    }
}
Example #24
0
        template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128 & dx, const __m128 & dy, Buffer & buffer, size_t col)
        {
            __m128 bestDot = _mm_setzero_ps();
            __m128i bestIndex = _mm_setzero_si128();
            for(int i = 0; i < buffer.size; ++i)
            {
                __m128 dot = _mm_add_ps(_mm_mul_ps(dx, buffer.cos[i]), _mm_mul_ps(dy, buffer.sin[i]));
                __m128 mask = _mm_cmpgt_ps(dot, bestDot);
                bestDot = _mm_max_ps(dot, bestDot);
                bestIndex = Combine(_mm_castps_si128(mask), buffer.pos[i], bestIndex);

                dot = _mm_sub_ps(_mm_setzero_ps(), dot);
                mask = _mm_cmpgt_ps(dot, bestDot);
                bestDot = _mm_max_ps(dot, bestDot);
                bestIndex = Combine(_mm_castps_si128(mask), buffer.neg[i], bestIndex);
            }
            Store<align>((__m128i*)(buffer.index + col), bestIndex);
            Sse::Store<align>(buffer.value + col, _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy))));
        }
Example #25
0
double CHellingerKernel<float>::Evaluate(float* x, float* y) {


#ifndef __SSE4_1__

    float result = 0;

    for(size_t i=0; i<m_n; i++)
        result += sqrt(x[i]*y[i]);

    return static_cast<double>(result);

#else

    __m128* px = (__m128*)x;
    __m128* py = (__m128*)y;

    float zero = 0;
    __m128 sum = _mm_load1_ps(&zero);

    for(int i=0; i<m_offset/4; i++) {

        __m128 temp = _mm_mul_ps(px[i],py[i]);
        temp = _mm_sqrt_ps(temp);
        sum = _mm_add_ps(sum,temp);

    }

    float result[4] = {0,0,0,0};
    _mm_storeu_ps(result,sum);

    float fresult  = result[0] + result[1] + result[2] + result[3];

    // add offset
    for(size_t i=m_offset; i<m_n; i++)
        fresult += sqrt(x[i]*y[i]);

    return static_cast<double>(fresult);

#endif

}
Example #26
0
// --------------------------------------------------------------
vuint32 mandelbrot_simd(vfloat32 a, vfloat32 b, size_t max_iter)
// --------------------------------------------------------------
{
	vuint32 num_iter = _mm_set1_epi32(0);
	vfloat32 zero=_mm_set1_ps(0);
    vfloat32 one=_mm_set1_ps(1);
	vfloat32 two=_mm_set1_ps(2);
	vfloat32 x=_mm_setzero_ps();
	vfloat32 y=_mm_setzero_ps();
	vfloat32 tmp;
	vfloat32 z;
	for(int i=0;i<max_iter;i++){	
	tmp=x;
	x=_mm_add_ps(a,_mm_sub_ps(_mm_mul_ps(x,x),_mm_mul_ps(y,y)));
	y=_mm_add_ps(b,_mm_mul_ps(_mm_mul_ps(y,tmp),two));	
	z=_mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(x,x),_mm_mul_ps(y,y)));
	num_iter=_mm_add_epi32(num_iter,_mm_cvtps_epi32(_mm_and_ps(_mm_cmplt_ps(z,two),one)));
    }
	return num_iter;
}
Example #27
0
float sumar(float *a){
    float sumaF[4] __attribute__((aligned(16)));
    __m128 sumas= _mm_set1_ps(0);//suma alineada 1313 iniciadas en 0
    __m128 calculo;
    int i;
    __m128 aux;
    for ( i = 0; i < 100000; i+=4){
        //   multip =1;
        aux = _mm_load_ps(&a[i]);
        //corta el ciclo cuando el arreglo no tiene mas valores validos;
        //   calculo = sqrt(a[i]);
        calculo = _mm_sqrt_ps(aux);//se calcula la raiz cuadrada de los 4 float en paralelo
        calculo = _mm_pow2_ps(calculo,aux);// se decidió que por precicion de calculo se utilizará la funcion pow2
        if(_mm_compare_ps(aux,_mm_set1_ps(0)))break;
        sumas = _mm_add_ps(sumas, calculo);
        
    }
    _mm_store_ps(sumaF, sumas);
    return sumaF[0]+sumaF[1]+sumaF[2]+sumaF[3];
}
int test_sqrt()
{
	int Error(0);

#	if GLM_ARCH & GLM_ARCH_SSE2_BIT
	for(float f = 0.1f; f < 30.0f; f += 0.1f)
	{
		float r = _mm_cvtss_f32(_mm_sqrt_ps(_mm_set1_ps(f)));
		float s = std::sqrt(f);
		Error += glm::abs(r - s) < 0.01f ? 0 : 1;
		assert(!Error);
	}
#	endif//GLM_ARCH & GLM_ARCH_SSE2_BIT

	float A = glm::sqrt(10.f);
	glm::vec1 B = glm::sqrt(glm::vec1(10.f));
	glm::vec2 C = glm::sqrt(glm::vec2(10.f));
	glm::vec3 D = glm::sqrt(glm::vec3(10.f));
	glm::vec4 E = glm::sqrt(glm::vec4(10.f));

	return Error;
}
Example #29
0
int dist(const float* xyz, const int* pairs, float* distance_out,
         float* displacement_out, const int n_frames, const int n_atoms,
         const int n_pairs) {
  /* Compute the distance/displacement  between pairs of atoms in every frame
     of xyz.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous C order.
     pairs : array, shape=(n_pairs, 2)
         The specific pairs of atoms whose distance you want to compute. A 2d
         array of pairs, in C order.
     distance_out : array, shape=(n_frames, n_pairs), optional
         Array where the distances between pairs will be stored, in contiguous
         C order. If NULL is passed in, this return value will not be saved
     displacement_out : array, shaoe=(n_frames, n_pairs, 3), optional
         An optional return value: if you'd also like to save the displacement
         vectors between the pairs, you can pass a pointer here. If
         displacement_out is NULL, then this variable will not be saved back
         to memory.

     All of the arrays are assumed to be contiguous. This code will
     segfault if they're not.
   */

  int i, j;
  int store_displacement = displacement_out == NULL ? 0 : 1;
  int store_distance = distance_out == NULL ? 0 : 1;
  __m128 x1, x2, r12, r12_2, s;

  for (i = 0; i < n_frames; i++) {
    for (j = 0; j < n_pairs; j++) {
      // Load the two vectors whos distance we want to compute
      // x1 = xyz[i, pairs[j,0], 0:3]
      // x2 = xyz[i, pairs[j,1], 0:3]
      x1 = load_float3(xyz + 3*pairs[2*j + 0]);
      x2 = load_float3(xyz + 3*pairs[2*j + 1]);

      // r12 = x2 - x1
      r12 = _mm_sub_ps(x2, x1);
      // r12_2 = r12*r12
      r12_2 = _mm_mul_ps(r12, r12);

      if (store_displacement) {
        // store the two lower entries (x,y) in memory
        _mm_storel_pi((__m64*)(displacement_out), r12);
        displacement_out += 2;
        // swap high-low and then store the z entry in the memory
        _mm_store_ss(displacement_out++, _mm_movehl_ps(r12, r12));
      }
      if (store_distance) {
        // horizontal add the components of d2 with
        // two instructions. note: it's critical
        // here that the last entry of x1 and x2 was 0
        // so that d2.w = 0
        s = _mm_hadd_ps(r12_2, r12_2);
        s = _mm_hadd_ps(s, s);
        // sqrt our final answer
        s = _mm_sqrt_ps(s);

        // s now contains our answer in all four elements, because
        // of the way the hadd works. we only want to store one
        // element.
        _mm_store_ss(distance_out++, s);
      }
    }

    // advance to the next frame
    xyz += n_atoms*3;
  }

  return 1;
}
Example #30
0
int dist_mic(const float* xyz, const int* pairs, const float* box_matrix,
             float* distance_out, float* displacement_out,
             const int n_frames, const int n_atoms, const int n_pairs) {
  /* Compute the distance/displacement between pairs of atoms in every frame
     of xyz following the minimum image convention in periodic boundary
     conditions.

    The computation follows scheme B.9 in Tukerman, M. "Statistical
    Mechanics: Theory and Molecular Simulation", 2010.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous C order.
     pairs : array, shape=(n_pairs, 2)
         The specific pairs of atoms whose distance you want to compute. A 2d
         array of pairs, in C order.
     box_matrix : array, shape=(3,3)
          The box matrix for a single frame. All of the frames are assumed to
          use this box vector.
     distance_out : array, shape=(n_frames, n_pairs)
         Array where the distances between pairs will be stored, in contiguous
         C order.
     displacement_out : array, shaoe=(n_frames, n_pairs, 3), optional
         An optional return value: if you'd also like to save the displacement
         vectors between the pairs, you can pass a pointer here. If
         displacement_out is NULL, then this variable will not be saved back
         to memory.

     All of the arrays are assumed to be contiguous. This code will
     segfault if they're not.
  */
#ifndef __SSE4_1__
   _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
   int rounding_mode = _MM_GET_ROUNDING_MODE();
#endif

  int i, j;
  int store_displacement = displacement_out == NULL ? 0 : 1;
  int store_distance = distance_out == NULL ? 0 : 1;

  __m128 r1, r2, s12, r12, s, r12_2;
  __m128 hinv[3];
  __m128 h[3];

  for (i = 0; i < n_frames; i++) {
    // Store the columns of the box matrix in three float4s. This format
    // is fast for matrix * vector product. See, for example, this S.O. question:
    // http://stackoverflow.com/questions/14967969/efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-prod
    h[0] = _mm_setr_ps(box_matrix[0], box_matrix[3], box_matrix[6], 0.0f);
    h[1] = _mm_setr_ps(box_matrix[1], box_matrix[4], box_matrix[7], 0.0f);
    h[2] = _mm_setr_ps(box_matrix[2], box_matrix[5], box_matrix[8], 0.0f);
    // Calculate the inverse of the box matrix, and also store it in the same
    // format.
    inverse33(box_matrix, hinv+0, hinv+1, hinv+2);

    for (j = 0; j < n_pairs; j++) {
      // Load the two vectors whos distance we want to compute
      r1 = load_float3(xyz + 3*pairs[2*j + 0]);
      r2 = load_float3(xyz + 3*pairs[2*j + 1]);
      r12 =  _mm_sub_ps(r2, r1);

      // s12 = INVERSE(H) * r12
      s12 = _mm_add_ps(_mm_add_ps(
         _mm_mul_ps(hinv[0], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(0,0,0,0))),
         _mm_mul_ps(hinv[1], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(1,1,1,1)))),
         _mm_mul_ps(hinv[2], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(2,2,2,2))));

      // s12 = s12 - NEAREST_INTEGER(s12)
#ifdef __SSE4_1__
      s12 = _mm_sub_ps(s12, _mm_round_ps(s12, _MM_FROUND_TO_NEAREST_INT));
#else
      s12 = _mm_sub_ps(s12, _mm_cvtepi32_ps(_mm_cvtps_epi32(s12)));
#endif

      r12 = _mm_add_ps(_mm_add_ps(
          _mm_mul_ps(h[0], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(0,0,0,0))),
          _mm_mul_ps(h[1], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(1,1,1,1)))),
          _mm_mul_ps(h[2], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(2,2,2,2))));

      if (store_displacement) {
        // store the two lower entries (x,y) in memory
        _mm_storel_pi((__m64*)(displacement_out), r12);
        displacement_out += 2;
        // swap high-low and then store the z entry in the memory
        _mm_store_ss(displacement_out++, _mm_movehl_ps(r12, r12));
      }
      if (store_distance) {
        // out = sqrt(sum(r12**2))
        r12_2 = _mm_mul_ps(r12, r12);
        s = _mm_hadd_ps(r12_2, r12_2);
        s = _mm_hadd_ps(s, s);
        s = _mm_sqrt_ps(s);
        _mm_store_ss(distance_out++, s);
      }
    }
    // advance to the next frame
    xyz += n_atoms*3;
    box_matrix += 9;
  }

#ifndef __SSE4_1__
   _MM_SET_ROUNDING_MODE(rounding_mode);
#endif
  return 1;
}