示例#1
0
void sumSse(MATRIX_TYPE** first, MATRIX_TYPE** second, MATRIX_TYPE** result, size_t size) {
	for (size_t i = 0; i < size; i++) {
		for (size_t j = 0; j < size; j += 4) {
			__m128 matFirst = _mm_load_ps(&first[i][j]);
			__m128 matSecond = _mm_load_ps(&second[i][j]);
			_mm_store_ps(&result[i][j], _mm_add_ps(matFirst, matSecond));
		}
	}
}
示例#2
0
void shz::math::matrix<shz::math::f32, 4, 4>::add(const shz::math::f32* left, const shz::math::f32 value, shz::math::f32* target){
	__m128 b = _mm_load1_ps(&value);
	for(size_t i=0; i < 4; ++i){
		__m128 a = _mm_load_ps(left);
		__m128 r = _mm_add_ps(a, b);
		_mm_store_ps(target, r);
		left+=4; target+=4;
	}
}
示例#3
0
 void Detect32f(const HidHaarCascade & hid, size_t offset, const __m128 & norm, __m128i & result)
 {
     typedef HidHaarCascade Hid;
     const float * leaves = hid.leaves.data();
     const Hid::Node * node = hid.nodes.data();
     const Hid::Stage * stages = hid.stages.data();
     for (int i = 0, n = (int)hid.stages.size(); i < n; ++i)
     {
         const Hid::Stage & stage = stages[i];
         if (stage.canSkip)
             continue;
         const Hid::Node * end = node + stage.ntrees;
         __m128 stageSum = _mm_setzero_ps();
         if (stage.hasThree)
         {
             for (; node < end; ++node, leaves += 2)
             {
                 const Hid::Feature & feature = hid.features[node->featureIdx];
                 __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset));
                 if (feature.rect[2].p0)
                     sum = _mm_add_ps(sum, WeightedSum32f(feature.rect[2], offset));
                 StageSum32f(leaves, node->threshold, sum, norm, stageSum);
             }
         }
         else
         {
             for (; node < end; ++node, leaves += 2)
             {
                 const Hid::Feature & feature = hid.features[node->featureIdx];
                 __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset));
                 StageSum32f(leaves, node->threshold, sum, norm, stageSum);
             }
         }
         result = _mm_andnot_si128(_mm_castps_si128(_mm_cmpgt_ps(_mm_set1_ps(stage.threshold), stageSum)), result);
         int resultCount = ResultCount(result);
         if (resultCount == 0)
         {
             return;
         }
         else if (resultCount == 1)
         {
             uint32_t SIMD_ALIGNED(16) _result[4];
             float SIMD_ALIGNED(16) _norm[4];
             _mm_store_si128((__m128i*)_result, result);
             _mm_store_ps(_norm, norm);
             for (int j = 0; j < 4; ++j)
             {
                 if (_result[j])
                 {
                     _result[j] = Base::Detect32f(hid, offset + j, i + 1, _norm[j]) > 0 ? 1 : 0;
                     break;
                 }
             }
             result = _mm_load_si128((__m128i*)_result);
             return;
         }
     }
示例#4
0
    SIMDValue SIMDUtils::FromSimdBits(const SIMDValue value)
    {
        X86SIMDValue x86Result;
        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

        _mm_store_ps(x86Result.f32, v.m128_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
    SIMDValue SIMDInt32x4Operation::OpFromFloat32x4Bits(const SIMDValue& value)
    {
        X86SIMDValue x86Result;
        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

        _mm_store_ps(x86Result.simdValue.f32, v.m128_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
示例#6
0
文件: util.cpp 项目: shewu/raytracer
Point3D Point3D::operator-(const Vector3D& v) const
{
    Point3D ret;
    __m128 thisv = _mm_load_ps(this->v);
    __m128 vv = _mm_load_ps(v.v);
    __m128 retv = _mm_sub_ps(thisv, vv);
    _mm_store_ps(ret.v, retv);
    return ret;
}
示例#7
0
void VertexFuncPosAttrib2(
    const VERTEXINPUT &in,
    VERTEXOUTPUT &out)
{
    const FLOAT *pWorldMat = (FLOAT *)in.pConstants;

    Mat4Vec3Multiply(out.pVertices, pWorldMat, (FLOAT *)in.pAttribs);

    _mm_store_ps(&out.pAttribs[0], _mm_load_ps(out.pVertices));
}
示例#8
0
void AlignedSseMult(float* d, float const* a, float const* b)
{
	for(int i = 0; i < gNumFloats; i += 4)
	{
		__m128 v1 = _mm_load_ps(&a[i]);
		__m128 v2 = _mm_load_ps(&b[i]);
		__m128 r = _mm_mul_ps(v1, v2);
		_mm_store_ps(&d[i], r);
	}
}
inline __m128 _mm_cosf(__m128 x)
{
	float X[4];
	_mm_store_ps(X,x);
	X[0] = cosf(X[0]);
	X[1] = cosf(X[1]);
	X[2] = cosf(X[2]);
	X[3] = cosf(X[3]);
	return _mm_load_ps(X);
}
示例#10
0
文件: test.cpp 项目: bebekifis/dpm
void test()
{
    const int length = 4;
    float product[128*4]  __attribute__ ((aligned(16)));
    __m128 x = _mm_set_ps(1.0f,2.0f,3.0f,4.0f);
    __m128 y = _mm_set_ps(1.0f,2.0f,3.0f,4.0f);
    __m128 z = _mm_add_ps(x,y);
    _mm_store_ps(product,z);
    fprintf(stderr, "%f %f %f %f\n", product[0], product[1], product[2], product[3]);
}
示例#11
0
void experienceNet::normalPDF_sse(float* result, const float* _partitions, float _mean, float _stdDev)
{
	/* 
	CODE ADAPTED FROM boost/math/normal.hpp 

	RealType exponent = x - mean;
	exponent *= -exponent;
	exponent /= 2 * sd * sd;

	result = exp(exponent);
	result /= sd * sqrt(2 * constants::pi<RealType>());

	return result;
	*/
	const __m128& partitions = *(__m128*)_partitions;
	__m128 exponent, tmp, mean, sd;

	/* CODE ADAPTED FROM http://fastcpp.blogspot.com/2011/03/changing-sign-of-float-values-using-sse.html */
	static const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

	static const __m128 twos = _mm_set_ps1(2.0f);
	static const __m128 sqrt_pi_2_s = _mm_set_ps1(sqrt(2.0 * M_PI));

	// store mean and sd:
	mean = _mm_load_ps1(&_mean);
	sd = _mm_load_ps1(&_stdDev);

	// exponent = x - mean
	exponent = _mm_sub_ps(partitions, mean);
	// exponent *= -exponent;
	tmp = _mm_xor_ps(exponent, signmask);
	exponent = _mm_mul_ps(exponent, tmp);
	// exponent /= 2 * sd * sd;
	tmp = _mm_mul_ps(sd, sd);
	tmp = _mm_mul_ps(tmp, twos);
	exponent = _mm_div_ps(exponent, tmp);
	// exponent = exp(exponent);
	exponent = _mm_exp_ps(exponent);
	// exponent /= sd * sqrt(2 * pi)
	tmp = _mm_mul_ps(sd, sqrt_pi_2_s);
	tmp = _mm_div_ps(exponent, tmp);


#ifndef NDEBUG
	const float* _result = (float*)&tmp;
	boost::math::normal_distribution<float> cNormal(_mean, _stdDev);
	assert(fastabs(_result[0] - boost::math::pdf(cNormal, _partitions[0])) < 0.001f);
	assert(fastabs(_result[1] - boost::math::pdf(cNormal, _partitions[1])) < 0.001f);
	assert(fastabs(_result[2] - boost::math::pdf(cNormal, _partitions[2])) < 0.001f);
	assert(fastabs(_result[3] - boost::math::pdf(cNormal, _partitions[3])) < 0.001f);
#endif

	// return result:
	_mm_store_ps(result, tmp);
};
示例#12
0
// use MMX/SSE extensions (unrolled)
void dotprod_rrrf_execute_sse4u(dotprod_rrrf _q,
                                float *      _x,
                                float *      _y)
{
    __m128 v0, v1, v2, v3;
    __m128 h0, h1, h2, h3;
    __m128 s0, s1, s2, s3;
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register

    // t = 4*(floor(_n/16))
    unsigned int r = (_q->n >> 4) << 2;

    //
    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into register (unaligned)
        v0 = _mm_loadu_ps(&_x[4*i+ 0]);
        v1 = _mm_loadu_ps(&_x[4*i+ 4]);
        v2 = _mm_loadu_ps(&_x[4*i+ 8]);
        v3 = _mm_loadu_ps(&_x[4*i+12]);

        // load coefficients into register (aligned)
        h0 = _mm_load_ps(&_q->h[4*i+ 0]);
        h1 = _mm_load_ps(&_q->h[4*i+ 4]);
        h2 = _mm_load_ps(&_q->h[4*i+ 8]);
        h3 = _mm_load_ps(&_q->h[4*i+12]);

        // compute dot products
        s0 = _mm_dp_ps(v0, h0, 0xffffffff);
        s1 = _mm_dp_ps(v1, h1, 0xffffffff);
        s2 = _mm_dp_ps(v2, h2, 0xffffffff);
        s3 = _mm_dp_ps(v3, h3, 0xffffffff);
        
        // parallel addition
        // FIXME: these additions are by far the limiting factor
        sum = _mm_add_ps( sum, s0 );
        sum = _mm_add_ps( sum, s1 );
        sum = _mm_add_ps( sum, s2 );
        sum = _mm_add_ps( sum, s3 );
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

    // unload packed array
    _mm_store_ps(w, sum);
    float total = w[0];

    // cleanup
    for (i=4*r; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
示例#13
0
static size_t sort_set (vox_dot set[], size_t n, size_t offset, int subspace, const vox_dot center)
{
    size_t i, counter = offset;
    __v4sf center_, boundary_dot, dot;
    center_ = _mm_load_ps (center);
    boundary_dot = _mm_load_ps (set[offset]);

    for (i=offset; i<n; i++)
    {
        dot = _mm_load_ps (set[i]);
        if (get_subspace_idx_simd (center_, dot) == subspace)
        {
            _mm_store_ps (set[counter], dot);
            _mm_store_ps (set[i], boundary_dot);
            boundary_dot = _mm_load_ps (set[++counter]);
        }
    }

    return counter;
}
示例#14
0
文件: auction-sse.c 项目: kjrz/porr
void dist_price_sse(float *dist, float *prices, float *dist_price) {
    __m128 asse, bsse, csse;
    int i;

    for (i = 0; i < GRAPHSIZE; i += SSE_FOUR) {
        asse = _mm_load_ps(&dist[i]);
        bsse = _mm_load_ps(&prices[i]);
        csse = _mm_add_ps (asse, bsse);
        _mm_store_ps(&dist_price[i], csse);
    }
}
示例#15
0
// use MMX/SSE extensions
void dotprod_crcf_execute_mmx(dotprod_crcf _q,
                              float complex * _x,
                              float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // first cut: ...
    __m128 v;   // input vector
    __m128 h;   // coefficients vector
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps();  // load zeros into sum register

    // t = 4*(floor(_n/4))
    unsigned int t = (n >> 2) << 2;

    //
    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        v = _mm_loadu_ps(&x[i]);

        // load coefficients into register (aligned)
        h = _mm_load_ps(&_q->h[i]);

        // compute multiplication
        s = _mm_mul_ps(v, h);

        // accumulate
        sum = _mm_add_ps(sum, s);
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

    // unload packed array
    _mm_store_ps(w, sum);

    // add in-phase and quadrature components
    w[0] += w[2];
    w[1] += w[3];

    // cleanup (note: n _must_ be even)
    for (; i<n; i+=2) {
        w[0] += x[i  ] * _q->h[i  ];
        w[1] += x[i+1] * _q->h[i+1];
    }

    // set return value
    *_y = w[0] + _Complex_I*w[1];
}
示例#16
0
// use MMX/SSE extensions
void dotprod_rrrf_execute_mmx(dotprod_rrrf _q,
                              float *      _x,
                              float *      _y)
{
    // first cut: ...
    __m128 v;   // input vector
    __m128 h;   // coefficients vector
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register

    // t = 4*(floor(_n/4))
    unsigned int t = (_q->n >> 2) << 2;

    //
    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        v = _mm_loadu_ps(&_x[i]);

        // load coefficients into register (aligned)
        h = _mm_load_ps(&_q->h[i]);

        // compute multiplication
        s = _mm_mul_ps(v, h);
       
        // parallel addition
        sum = _mm_add_ps( sum, s );
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

#if HAVE_PMMINTRIN_H
    // fold down into single value
    __m128 z = _mm_setzero_ps();
    sum = _mm_hadd_ps(sum, z);
    sum = _mm_hadd_ps(sum, z);
   
    // unload single (lower value)
    _mm_store_ss(w, sum);
    float total = w[0];
#else
    // unload packed array
    _mm_store_ps(w, sum);
    float total = w[0] + w[1] + w[2] + w[3];
#endif

    // cleanup
    for (; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
示例#17
0
__m128 _mm_parteentera_ps(__m128 a){//calcula parte  para evtar problemas en la implementacion de la funcion _mm_pow_ps
    float f[4] __attribute__((aligned(16)));
    _mm_store_ps(f,a);
    
    f[0]=(int) f[0];
    f[1]=(int) f[1];
    f[2]=(int) f[2];
    f[3]=(int) f[3];

    return _mm_load_ps(&f[0]);
}
示例#18
0
void dct4x4_1d_llm_fwd_sse(float* s, float* d)//8add, 4 mul
{
	const __m128 c2 = _mm_set1_ps(1.30656f);//cos(CV_PI*2/16.0)*sqrt(2);
	const __m128 c6 = _mm_set1_ps(0.541196);//cos(CV_PI*6/16.0)*sqrt(2);

	__m128 s0 = _mm_load_ps(s); s += 4;
	__m128 s1 = _mm_load_ps(s); s += 4;
	__m128 s2 = _mm_load_ps(s); s += 4;
	__m128 s3 = _mm_load_ps(s);

	__m128 p03 = _mm_add_ps(s0, s3);
	__m128 p12 = _mm_add_ps(s1, s2);
	__m128 m03 = _mm_sub_ps(s0, s3);
	__m128 m12 = _mm_sub_ps(s1, s2);

	_mm_store_ps(d, _mm_add_ps(p03, p12));
	_mm_store_ps(d + 4, _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12)));
	_mm_store_ps(d + 8, _mm_sub_ps(p03, p12));
	_mm_store_ps(d + 12, _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12)));
}
示例#19
0
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
	__m128 ms0 = _mm_load_ps(src);
	__m128 ms1 = _mm_load_ps(src + 4);
	const __m128 mm = _mm_set1_ps(0.5f);
	__m128 a = _mm_add_ps(ms0, ms1);
	__m128 b = _mm_sub_ps(ms0, ms1);

	__m128 t1 = _mm_unpacklo_ps(a, b);
	__m128 t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);

	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth);
	ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk);
#ifdef _KEEP_00_COEF_
	ms0 = _mm_blend_ps(ms0, a, 1);
#endif
	msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth);
	ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk);

	a = _mm_add_ps(ms0, ms1);
	b = _mm_sub_ps(ms0, ms1);

	t1 = _mm_unpacklo_ps(a, b);
	t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	_mm_store_ps(dest, a);
	_mm_store_ps(dest + 4, b);
}
示例#20
0
void	matrix_CpAAt_float (float* C,const float* A,size_t n,size_t p)
{
    size_t i,j,k;
    size_t q = n / 8;
    size_t r = n % 8;

    for (k=0;k<p;k++) {
        float* pC = C;
        for (j=0;j<n;j++) {
            __m128 w = _mm_load1_ps (A+j+k*n);
            const float* pA = A+k*n;
            if (ALGEBRA_IS_ALIGNED(pA) && ALGEBRA_IS_ALIGNED(pC)) {
                for (i=0;i<q;i++) {
                    __m128 i1 = _mm_load_ps(pA);
                    __m128 i2 = _mm_load_ps(pA+4);
                    __m128 o1 = _mm_load_ps(pC);
                    __m128 o2 = _mm_load_ps(pC+4);
                    _mm_store_ps(pC+0,_mm_add_ps(o1,_mm_mul_ps(i1,w)));
                    _mm_store_ps(pC+4,_mm_add_ps(o2,_mm_mul_ps(i2,w)));
                    pA += 8;
                    pC += 8;
                }
            }
            else {
                for (i=0;i<q;i++) {
                    __m128 i1 = _mm_loadu_ps(pA);
                    __m128 i2 = _mm_loadu_ps(pA+4);
                    __m128 o1 = _mm_loadu_ps(pC);
                    __m128 o2 = _mm_loadu_ps(pC+4);
                    _mm_storeu_ps(pC+0,_mm_add_ps(o1,_mm_mul_ps(i1,w)));
                    _mm_storeu_ps(pC+4,_mm_add_ps(o2,_mm_mul_ps(i2,w)));
                    pA += 8;
                    pC += 8;
                }
            }
            for (i=0;i<r;i++) {
                (*pC++) += A[j+k*n]*(*pA++);
            }
        }
    }
}
static void double_sample(stage_t * p, fifo_t * output_fifo)
{
  sox_sample_t * output;
  int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
  rate_shared_t const * s = p->shared;
  dft_filter_t const * f = &s->half_band[p->which];
  int const overlap = f->num_taps - 1;
#ifdef SSE_
  const float * const coeff = f->coefs;
  sox_sample_t tmp;
  __m128 coef, outp, sign;
  sign = SIGN;
#endif

  while (p->rem + p->tuple * num_in >= f->dft_length) {
    div_t divd = div(f->dft_length - overlap - p->rem + p->tuple - 1, p->tuple);
    sox_sample_t const * input = fifo_read_ptr(&p->fifo);
    fifo_read(&p->fifo, divd.quot, NULL);
    num_in -= divd.quot;

    output = fifo_reserve_aligned(output_fifo, f->dft_length);
    fifo_trim_by(output_fifo, overlap);
    memset(output, 0, f->dft_length * sizeof(*output));
    for (j = 0, i = p->rem; i < f->dft_length; ++j, i += p->tuple)
      output[i] = input[j];
    p->rem = p->tuple - 1 - divd.rem;

    ff_rdft_x(f->dft_length, 1, output, f->tmp_buf);
#ifdef SSE_
    output[0] *= coeff[0];
    output[1] *= coeff[1];
    tmp = output[2];
    output[2] = coeff[2] * tmp - coeff[3] * output[3];
    output[3] = coeff[3] * tmp + coeff[2] * output[3];

    for (i = 4; i < f->dft_length; i += 4)
    {
      outp = _mm_load_ps(output+i);
      coef = _mm_load_ps(coeff+i);
      _mm_store_ps(output+i, ZMUL2(outp, coef, sign));
    }
#else
    output[0] *= f->coefs[0];
    output[1] *= f->coefs[1];
    for (i = 2; i < f->dft_length; i += 2) {
      sox_sample_t tmp = output[i];
      output[i  ] = f->coefs[i  ] * tmp - f->coefs[i+1] * output[i+1];
      output[i+1] = f->coefs[i+1] * tmp + f->coefs[i  ] * output[i+1];
    }
#endif
    ff_rdft_x(f->dft_length, -1, output, f->tmp_buf);
  }
}
示例#22
0
void GLVSPosColor(
    const VERTEXINPUT &in,
    VERTEXOUTPUT &out)
{
    // Multiply pos by mvp
    const FLOAT *pMVP = (FLOAT *)in.pConstants;
    Mat4Vec3Multiply(out.pVertices, pMVP, (FLOAT *)&in.pAttribs[SHADER_INPUT_SLOT_POSITION]);

    // Pass color through
    __m128 vColor = _mm_load_ps(&in.pAttribs[SHADER_INPUT_SLOT_COLOR0]);
    _mm_store_ps(&out.pAttribs[0], vColor);
}
示例#23
0
void experienceNet::frequencyAdd_sse(float* cResult, const float* cVal, const float* addedVal)
{
	// increase value:
	const __m128* addedVals = (__m128*)addedVal;
	const __m128* cVals = (__m128*)cVal;

	// result = cVal + addedVal
	__m128 result = _mm_add_ps(*cVals, *addedVals);

	// return result:
	_mm_store_ps(cResult, result);
}
示例#24
0
void dct4x4_1d_llm_inv_sse(float* s, float* d)
{
	const __m128 c2 = _mm_set1_ps(1.30656f);//cos(CV_PI*2/16.0)*sqrt(2);
	const __m128 c6 = _mm_set1_ps(0.541196);//cos(CV_PI*6/16.0)*sqrt(2);

	__m128 s0 = _mm_load_ps(s); s += 4;
	__m128 s1 = _mm_load_ps(s); s += 4;
	__m128 s2 = _mm_load_ps(s); s += 4;
	__m128 s3 = _mm_load_ps(s);

	__m128 t10 = _mm_add_ps(s0, s2);
	__m128 t12 = _mm_sub_ps(s0, s2);

	__m128 t0 = _mm_add_ps(_mm_mul_ps(c2, s1), _mm_mul_ps(c6, s3));
	__m128 t2 = _mm_sub_ps(_mm_mul_ps(c6, s1), _mm_mul_ps(c2, s3));

	_mm_store_ps(d, _mm_add_ps(t10, t0));
	_mm_store_ps(d + 4, _mm_add_ps(t12, t2));
	_mm_store_ps(d + 8, _mm_sub_ps(t12, t2));
	_mm_store_ps(d + 12, _mm_sub_ps(t10, t0));
}
示例#25
0
void VertexFuncConstColor(
    const VERTEXINPUT &in,
    VERTEXOUTPUT &out)
{
    const FLOAT *pWorldMat = (FLOAT *)in.pConstants;
    const FLOAT *pMatColor = (FLOAT *)in.pConstants + CONST_OFFSET_MAT_COLOR;

    Mat4Vec3Multiply(out.pVertices, pWorldMat, (FLOAT *)in.pAttribs);

    __m128 matColor = _mm_load_ps(pMatColor);
    _mm_store_ps(&out.pAttribs[0], matColor);
}
示例#26
0
void fDCT4x4_32f_and_threshold_and_iDCT4x4_32f(float* s, float threshold)
{
	dct4x4_1d_llm_fwd_sse_and_transpose(s, s);
#ifdef _KEEP_00_COEF_
	fDCT2D4x4_and_threshold_keep00_32f(s, s, 4 * threshold);
#else
	fDCT2D8x4_and_threshold_32f(s, s,4*threshold);
#endif
	//ommiting transform
	//transpose4x4(s);
	dct4x4_1d_llm_inv_sse_and_transpose(s, s);//transpose4x4(s);
	dct4x4_1d_llm_inv_sse(s, s);
	//ommiting transform
	//transpose4x4(s);

	__m128 c = _mm_set1_ps(0.06250f);
	_mm_store_ps(s, _mm_mul_ps(_mm_load_ps(s), c));
	_mm_store_ps(s + 4, _mm_mul_ps(_mm_load_ps(s + 4), c));
	_mm_store_ps(s + 8, _mm_mul_ps(_mm_load_ps(s + 8), c));
	_mm_store_ps(s + 12, _mm_mul_ps(_mm_load_ps(s + 12), c));
}
示例#27
0
void	ProxyRwSse2 <SplFmt_FLOAT>::write_flt_partial (const Ptr::Type &ptr, const __m128 &src0, const __m128 &src1, const __m128i &/*mask_lsb*/, const __m128i &/*sign_bit*/, const __m128 &/*offset*/, int len)
{
	if (len >= 4)
	{
		_mm_store_ps (ptr, src0);
		fstb::ToolsSse2::store_ps_partial (ptr + 4, src1, len - 4);
	}
	else
	{
		fstb::ToolsSse2::store_ps_partial (ptr    , src0, len    );
	}
}
示例#28
0
void 
nv_vector_normalize_L2(nv_matrix_t *v, int vm)
{
#if NV_ENABLE_SSE2
	{
		const int i_lp = (v->n & 0xfffffffc);
		__m128 x, u;
		NV_ALIGNED(float, mm[4], 16);
		int i;
		float dp;
		
		u = _mm_setzero_ps();
		for (i = 0; i < i_lp; i += 4) {
			x = _mm_load_ps(&NV_MAT_V(v, vm, i));
			u = _mm_add_ps(u, _mm_mul_ps(x, x));
		}
		_mm_store_ps(mm, u);
		dp = mm[0] + mm[1] + mm[2] + mm[3];
		for (i = i_lp; i < v->n; ++i) {
			dp += NV_MAT_V(v, vm, i) * NV_MAT_V(v, vm, i);
		}
		if (dp > 0.0f) {
			x = _mm_set1_ps(1.0f / sqrtf(dp));
			for (i = 0; i < i_lp; i += 4) {
				_mm_store_ps(&NV_MAT_V(v, vm, i),
							 _mm_mul_ps(*(const __m128*)&NV_MAT_V(v, vm, i), x));
			}
			for (i = i_lp; i < v->n; ++i) {
				NV_MAT_V(v, vm, i) *= dp;
			}
		}
	}
#else
	float norm = nv_vector_norm(v, vm);
	if (norm > 0.0f) {
		float scale = 1.0f / norm;
		nv_vector_muls(v, vm, v, vm, scale);
	}
#endif
}
示例#29
0
void fDCT2D4x4_and_threshold_keep00_32f(float* s, float* d, float thresh)
{
	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);
	const __m128 zeros = _mm_setzero_ps();
	const __m128 c2 = _mm_set1_ps(1.30656f);//cos(CV_PI*2/16.0)*sqrt(2);
	const __m128 c6 = _mm_set1_ps(0.541196);//cos(CV_PI*6/16.0)*sqrt(2);

	__m128 s0 = _mm_load_ps(s); s += 4;
	__m128 s1 = _mm_load_ps(s); s += 4;
	__m128 s2 = _mm_load_ps(s); s += 4;
	__m128 s3 = _mm_load_ps(s);

	__m128 p03 = _mm_add_ps(s0, s3);
	__m128 p12 = _mm_add_ps(s1, s2);
	__m128 m03 = _mm_sub_ps(s0, s3);
	__m128 m12 = _mm_sub_ps(s1, s2);

	__m128 v = _mm_add_ps(p03, p12);
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	// keep 00 coef.
	__m128 v2 = _mm_blendv_ps(zeros, v, msk);
	v2 = _mm_blend_ps(v2, v, 1);
	_mm_store_ps(d, v2);

	v = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 4, v);

	v = _mm_sub_ps(p03, p12);
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 8, v);

	v = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));
	msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
	v = _mm_blendv_ps(zeros, v, msk);
	_mm_store_ps(d + 12, v);
}
void OptimizedSelfAdjointMatrix6x6f::rankUpdate(const Eigen::Matrix<float, 6, 1>& u, const float& alpha)
{
  __m128 s = _mm_set1_ps(alpha);
  __m128 v1234 = _mm_loadu_ps(u.data());
  __m128 v56xx = _mm_loadu_ps(u.data() + 4);

  __m128 v1212 = _mm_movelh_ps(v1234, v1234);
  __m128 v3434 = _mm_movehl_ps(v1234, v1234);
  __m128 v5656 = _mm_movelh_ps(v56xx, v56xx);

  __m128 v1122 = _mm_mul_ps(s, _mm_unpacklo_ps(v1212, v1212));

  _mm_store_ps(data + 0, _mm_add_ps(_mm_load_ps(data + 0), _mm_mul_ps(v1122, v1212)));
  _mm_store_ps(data + 4, _mm_add_ps(_mm_load_ps(data + 4), _mm_mul_ps(v1122, v3434)));
  _mm_store_ps(data + 8, _mm_add_ps(_mm_load_ps(data + 8), _mm_mul_ps(v1122, v5656)));

  __m128 v3344 = _mm_mul_ps(s, _mm_unpacklo_ps(v3434, v3434));

  _mm_store_ps(data + 12, _mm_add_ps(_mm_load_ps(data + 12), _mm_mul_ps(v3344, v3434)));
  _mm_store_ps(data + 16, _mm_add_ps(_mm_load_ps(data + 16), _mm_mul_ps(v3344, v5656)));

  __m128 v5566 = _mm_mul_ps(s, _mm_unpacklo_ps(v5656, v5656));

  _mm_store_ps(data + 20, _mm_add_ps(_mm_load_ps(data + 20), _mm_mul_ps(v5566, v5656)));
}