void sumSse(MATRIX_TYPE** first, MATRIX_TYPE** second, MATRIX_TYPE** result, size_t size)
{
    for (size_t i = 0; i < size; i++) {
        for (size_t j = 0; j < size; j += 4) {
            __m128 matFirst = _mm_load_ps(&first[i][j]);
            __m128 matSecond = _mm_load_ps(&second[i][j]);
            _mm_store_ps(&result[i][j], _mm_add_ps(matFirst, matSecond));
        }
    }
}
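/* A minimal usage sketch (not from the original source), assuming MATRIX_TYPE
   is float: sumSse requires `size` to be a multiple of 4 and every row to be
   16-byte aligned, since _mm_load_ps/_mm_store_ps fault on unaligned
   addresses. _mm_malloc/_mm_free from <xmmintrin.h> provide that alignment. */
void sumSseExample(void)
{
    const size_t size = 8;
    float* a[8]; float* b[8]; float* c[8];
    for (size_t i = 0; i < size; i++) {
        a[i] = (float*)_mm_malloc(size * sizeof(float), 16);
        b[i] = (float*)_mm_malloc(size * sizeof(float), 16);
        c[i] = (float*)_mm_malloc(size * sizeof(float), 16);
        for (size_t j = 0; j < size; j++) {
            a[i][j] = 1.0f;
            b[i][j] = 2.0f;
        }
    }
    sumSse(a, b, c, size); /* every c[i][j] is now 3.0f */
    for (size_t i = 0; i < size; i++) {
        _mm_free(a[i]); _mm_free(b[i]); _mm_free(c[i]);
    }
}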
void shz::math::matrix<shz::math::f32, 4, 4>::add(const shz::math::f32* left, const shz::math::f32 value, shz::math::f32* target)
{
    __m128 b = _mm_load1_ps(&value);
    for (size_t i = 0; i < 4; ++i) {
        __m128 a = _mm_load_ps(left);
        __m128 r = _mm_add_ps(a, b);
        _mm_store_ps(target, r);
        left += 4;
        target += 4;
    }
}
void Detect32f(const HidHaarCascade & hid, size_t offset, const __m128 & norm, __m128i & result)
{
    typedef HidHaarCascade Hid;
    const float * leaves = hid.leaves.data();
    const Hid::Node * node = hid.nodes.data();
    const Hid::Stage * stages = hid.stages.data();
    for (int i = 0, n = (int)hid.stages.size(); i < n; ++i)
    {
        const Hid::Stage & stage = stages[i];
        if (stage.canSkip)
            continue;
        const Hid::Node * end = node + stage.ntrees;
        __m128 stageSum = _mm_setzero_ps();
        if (stage.hasThree)
        {
            for (; node < end; ++node, leaves += 2)
            {
                const Hid::Feature & feature = hid.features[node->featureIdx];
                __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset));
                if (feature.rect[2].p0)
                    sum = _mm_add_ps(sum, WeightedSum32f(feature.rect[2], offset));
                StageSum32f(leaves, node->threshold, sum, norm, stageSum);
            }
        }
        else
        {
            for (; node < end; ++node, leaves += 2)
            {
                const Hid::Feature & feature = hid.features[node->featureIdx];
                __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset));
                StageSum32f(leaves, node->threshold, sum, norm, stageSum);
            }
        }
        // Clear the lanes whose stage sum fell below the stage threshold.
        result = _mm_andnot_si128(_mm_castps_si128(_mm_cmpgt_ps(_mm_set1_ps(stage.threshold), stageSum)), result);
        int resultCount = ResultCount(result);
        if (resultCount == 0)
        {
            return;
        }
        else if (resultCount == 1)
        {
            // Only one candidate position survives: finish it with the scalar detector.
            uint32_t SIMD_ALIGNED(16) _result[4];
            float SIMD_ALIGNED(16) _norm[4];
            _mm_store_si128((__m128i*)_result, result);
            _mm_store_ps(_norm, norm);
            for (int j = 0; j < 4; ++j)
            {
                if (_result[j])
                {
                    _result[j] = Base::Detect32f(hid, offset + j, i + 1, _norm[j]) > 0 ? 1 : 0;
                    break;
                }
            }
            result = _mm_load_si128((__m128i*)_result);
            return;
        }
    }
}
SIMDValue SIMDUtils::FromSimdBits(const SIMDValue value)
{
    X86SIMDValue x86Result;
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
    _mm_store_ps(x86Result.f32, v.m128_value);
    return X86SIMDValue::ToSIMDValue(x86Result);
}
SIMDValue SIMDInt32x4Operation::OpFromFloat32x4Bits(const SIMDValue& value)
{
    X86SIMDValue x86Result;
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
    _mm_store_ps(x86Result.simdValue.f32, v.m128_value);
    return X86SIMDValue::ToSIMDValue(x86Result);
}
Point3D Point3D::operator-(const Vector3D& v) const
{
    Point3D ret;
    __m128 thisv = _mm_load_ps(this->v);
    __m128 vv = _mm_load_ps(v.v);
    __m128 retv = _mm_sub_ps(thisv, vv);
    _mm_store_ps(ret.v, retv);
    return ret;
}
void VertexFuncPosAttrib2(
    const VERTEXINPUT &in,
    VERTEXOUTPUT &out)
{
    const FLOAT *pWorldMat = (FLOAT *)in.pConstants;
    Mat4Vec3Multiply(out.pVertices, pWorldMat, (FLOAT *)in.pAttribs);
    _mm_store_ps(&out.pAttribs[0], _mm_load_ps(out.pVertices));
}
void AlignedSseMult(float* d, float const* a, float const* b)
{
    for (int i = 0; i < gNumFloats; i += 4) {
        __m128 v1 = _mm_load_ps(&a[i]);
        __m128 v2 = _mm_load_ps(&b[i]);
        __m128 r = _mm_mul_ps(v1, v2);
        _mm_store_ps(&d[i], r);
    }
}
inline __m128 _mm_cosf(__m128 x)
{
    // The scratch buffer must be 16-byte aligned for _mm_store_ps/_mm_load_ps.
    float X[4] __attribute__((aligned(16)));
    _mm_store_ps(X, x);
    X[0] = cosf(X[0]);
    X[1] = cosf(X[1]);
    X[2] = cosf(X[2]);
    X[3] = cosf(X[3]);
    return _mm_load_ps(X);
}
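/* Note: SSE provides no packed cosine instruction, so a scalar round trip
   like the one above is a common fallback; fully vectorized polynomial
   approximations (e.g. the sse_mathfun routines) trade accuracy for speed. */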
void test()
{
    const int length = 4;
    float product[128*4] __attribute__((aligned(16)));
    __m128 x = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 y = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 z = _mm_add_ps(x, y);
    _mm_store_ps(product, z);
    fprintf(stderr, "%f %f %f %f\n", product[0], product[1], product[2], product[3]);
}
void experienceNet::normalPDF_sse(float* result, const float* _partitions, float _mean, float _stdDev)
{
    /* CODE ADAPTED FROM boost/math/normal.hpp

       RealType exponent = x - mean;
       exponent *= -exponent;
       exponent /= 2 * sd * sd;
       result = exp(exponent);
       result /= sd * sqrt(2 * constants::pi<RealType>());
       return result;
    */
    const __m128& partitions = *(__m128*)_partitions;
    __m128 exponent, tmp, mean, sd;

    /* CODE ADAPTED FROM http://fastcpp.blogspot.com/2011/03/changing-sign-of-float-values-using-sse.html */
    static const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
    static const __m128 twos = _mm_set_ps1(2.0f);
    static const __m128 sqrt_pi_2_s = _mm_set_ps1(sqrt(2.0 * M_PI));

    // store mean and sd:
    mean = _mm_load_ps1(&_mean);
    sd = _mm_load_ps1(&_stdDev);

    // exponent = x - mean
    exponent = _mm_sub_ps(partitions, mean);
    // exponent *= -exponent;
    tmp = _mm_xor_ps(exponent, signmask);
    exponent = _mm_mul_ps(exponent, tmp);
    // exponent /= 2 * sd * sd;
    tmp = _mm_mul_ps(sd, sd);
    tmp = _mm_mul_ps(tmp, twos);
    exponent = _mm_div_ps(exponent, tmp);
    // exponent = exp(exponent);
    // (_mm_exp_ps is a vendor/library routine, not a baseline SSE intrinsic)
    exponent = _mm_exp_ps(exponent);
    // exponent /= sd * sqrt(2 * pi)
    tmp = _mm_mul_ps(sd, sqrt_pi_2_s);
    tmp = _mm_div_ps(exponent, tmp);

#ifndef NDEBUG
    const float* _result = (float*)&tmp;
    boost::math::normal_distribution<float> cNormal(_mean, _stdDev);
    assert(fastabs(_result[0] - boost::math::pdf(cNormal, _partitions[0])) < 0.001f);
    assert(fastabs(_result[1] - boost::math::pdf(cNormal, _partitions[1])) < 0.001f);
    assert(fastabs(_result[2] - boost::math::pdf(cNormal, _partitions[2])) < 0.001f);
    assert(fastabs(_result[3] - boost::math::pdf(cNormal, _partitions[3])) < 0.001f);
#endif

    // return result:
    _mm_store_ps(result, tmp);
}
// use MMX/SSE extensions (unrolled)
void dotprod_rrrf_execute_sse4u(dotprod_rrrf _q, float * _x, float * _y)
{
    __m128 v0, v1, v2, v3;
    __m128 h0, h1, h2, h3;
    __m128 s0, s1, s2, s3;
    __m128 sum = _mm_setzero_ps();  // load zeros into sum register

    // t = 4*(floor(_n/16))
    unsigned int r = (_q->n >> 4) << 2;

    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into register (unaligned)
        v0 = _mm_loadu_ps(&_x[4*i+ 0]);
        v1 = _mm_loadu_ps(&_x[4*i+ 4]);
        v2 = _mm_loadu_ps(&_x[4*i+ 8]);
        v3 = _mm_loadu_ps(&_x[4*i+12]);

        // load coefficients into register (aligned)
        h0 = _mm_load_ps(&_q->h[4*i+ 0]);
        h1 = _mm_load_ps(&_q->h[4*i+ 4]);
        h2 = _mm_load_ps(&_q->h[4*i+ 8]);
        h3 = _mm_load_ps(&_q->h[4*i+12]);

        // compute dot products (mask is an 8-bit immediate)
        s0 = _mm_dp_ps(v0, h0, 0xff);
        s1 = _mm_dp_ps(v1, h1, 0xff);
        s2 = _mm_dp_ps(v2, h2, 0xff);
        s3 = _mm_dp_ps(v3, h3, 0xff);

        // parallel addition
        // FIXME: these additions are by far the limiting factor
        sum = _mm_add_ps( sum, s0 );
        sum = _mm_add_ps( sum, s1 );
        sum = _mm_add_ps( sum, s2 );
        sum = _mm_add_ps( sum, s3 );
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

    // unload packed array
    _mm_store_ps(w, sum);
    float total = w[0];

    // cleanup
    for (i=4*r; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
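/* The third argument of _mm_dp_ps is an 8-bit immediate: the high nibble
   selects which input lanes enter the dot product and the low nibble selects
   which output lanes receive the sum, so 0xff multiplies all four lanes and
   broadcasts the result to every lane of the destination. */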
static size_t sort_set (vox_dot set[], size_t n, size_t offset, int subspace, const vox_dot center)
{
    size_t i, counter = offset;
    __v4sf center_, boundary_dot, dot;

    center_ = _mm_load_ps (center);
    boundary_dot = _mm_load_ps (set[offset]);
    for (i=offset; i<n; i++)
    {
        dot = _mm_load_ps (set[i]);
        if (get_subspace_idx_simd (center_, dot) == subspace)
        {
            _mm_store_ps (set[counter], dot);
            _mm_store_ps (set[i], boundary_dot);
            boundary_dot = _mm_load_ps (set[++counter]);
        }
    }
    return counter;
}
void dist_price_sse(float *dist, float *prices, float *dist_price)
{
    __m128 asse, bsse, csse;
    int i;

    for (i = 0; i < GRAPHSIZE; i += SSE_FOUR) {
        asse = _mm_load_ps(&dist[i]);
        bsse = _mm_load_ps(&prices[i]);
        csse = _mm_add_ps (asse, bsse);
        _mm_store_ps(&dist_price[i], csse);
    }
}
// use MMX/SSE extensions
void dotprod_crcf_execute_mmx(dotprod_crcf _q, float complex * _x, float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // first cut: ...
    __m128 v;   // input vector
    __m128 h;   // coefficients vector
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register

    // t = 4*(floor(_n/4))
    unsigned int t = (n >> 2) << 2;

    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        v = _mm_loadu_ps(&x[i]);

        // load coefficients into register (aligned)
        h = _mm_load_ps(&_q->h[i]);

        // compute multiplication
        s = _mm_mul_ps(v, h);

        // accumulate
        sum = _mm_add_ps(sum, s);
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

    // unload packed array
    _mm_store_ps(w, sum);

    // add in-phase and quadrature components
    w[0] += w[2];
    w[1] += w[3];

    // cleanup (note: n _must_ be even)
    for (; i<n; i+=2) {
        w[0] += x[i  ] * _q->h[i  ];
        w[1] += x[i+1] * _q->h[i+1];
    }

    // set return value
    *_y = w[0] + _Complex_I*w[1];
}
// use MMX/SSE extensions
void dotprod_rrrf_execute_mmx(dotprod_rrrf _q, float * _x, float * _y)
{
    // first cut: ...
    __m128 v;   // input vector
    __m128 h;   // coefficients vector
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register

    // t = 4*(floor(_n/4))
    unsigned int t = (_q->n >> 2) << 2;

    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        v = _mm_loadu_ps(&_x[i]);

        // load coefficients into register (aligned)
        h = _mm_load_ps(&_q->h[i]);

        // compute multiplication
        s = _mm_mul_ps(v, h);

        // parallel addition
        sum = _mm_add_ps( sum, s );
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

#if HAVE_PMMINTRIN_H
    // fold down into single value
    __m128 z = _mm_setzero_ps();
    sum = _mm_hadd_ps(sum, z);
    sum = _mm_hadd_ps(sum, z);

    // unload single (lower value)
    _mm_store_ss(w, sum);
    float total = w[0];
#else
    // unload packed array
    _mm_store_ps(w, sum);
    float total = w[0] + w[1] + w[2] + w[3];
#endif

    // cleanup
    for (; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
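/* _mm_hadd_ps is an SSE3 instruction declared in <pmmintrin.h>, hence the
   HAVE_PMMINTRIN_H guard above: two horizontal adds fold the four partial
   sums into the low lane, which _mm_store_ss then writes out on its own. */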
// Computes the integer part of each lane, to avoid problems in the
// implementation of the _mm_pow_ps function.
__m128 _mm_parteentera_ps(__m128 a)
{
    float f[4] __attribute__((aligned(16)));
    _mm_store_ps(f, a);
    f[0] = (int) f[0];
    f[1] = (int) f[1];
    f[2] = (int) f[2];
    f[3] = (int) f[3];
    return _mm_load_ps(&f[0]);
}
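/* A hedged alternative sketch (not from the original source): SSE2 can take
   the integer part entirely in registers, avoiding the round trip through
   memory. Like the casts above, it truncates toward zero and is only valid
   while each lane fits in a 32-bit int. Requires <emmintrin.h>. */
__m128 _mm_parteentera_ps_sse2(__m128 a)
{
    __m128i t = _mm_cvttps_epi32(a); // truncate each lane toward zero
    return _mm_cvtepi32_ps(t);       // convert back to single precision
}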
void dct4x4_1d_llm_fwd_sse(float* s, float* d) // 8 add, 4 mul
{
    const __m128 c2 = _mm_set1_ps(1.30656f);  // cos(CV_PI*2/16.0)*sqrt(2)
    const __m128 c6 = _mm_set1_ps(0.541196f); // cos(CV_PI*6/16.0)*sqrt(2)

    __m128 s0 = _mm_load_ps(s); s += 4;
    __m128 s1 = _mm_load_ps(s); s += 4;
    __m128 s2 = _mm_load_ps(s); s += 4;
    __m128 s3 = _mm_load_ps(s);

    __m128 p03 = _mm_add_ps(s0, s3);
    __m128 p12 = _mm_add_ps(s1, s2);
    __m128 m03 = _mm_sub_ps(s0, s3);
    __m128 m12 = _mm_sub_ps(s1, s2);

    _mm_store_ps(d, _mm_add_ps(p03, p12));
    _mm_store_ps(d + 4, _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12)));
    _mm_store_ps(d + 8, _mm_sub_ps(p03, p12));
    _mm_store_ps(d + 12, _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12)));
}
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
    __m128 ms0 = _mm_load_ps(src);
    __m128 ms1 = _mm_load_ps(src + 4);
    const __m128 mm = _mm_set1_ps(0.5f);

    // forward 2x2 transform
    __m128 a = _mm_add_ps(ms0, ms1);
    __m128 b = _mm_sub_ps(ms0, ms1);
    __m128 t1 = _mm_unpacklo_ps(a, b);
    __m128 t2 = _mm_unpackhi_ps(a, b);
    ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
    ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));
    a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
    b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

    // zero out coefficients whose magnitude does not exceed the threshold
    const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
    const __m128 mth = _mm_set1_ps(thresh);
    __m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth);
    ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk);
#ifdef _KEEP_00_COEF_
    ms0 = _mm_blend_ps(ms0, a, 1);
#endif
    msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth);
    ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk);

    // inverse 2x2 transform
    a = _mm_add_ps(ms0, ms1);
    b = _mm_sub_ps(ms0, ms1);
    t1 = _mm_unpacklo_ps(a, b);
    t2 = _mm_unpackhi_ps(a, b);
    ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
    ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));
    a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
    b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

    _mm_store_ps(dest, a);
    _mm_store_ps(dest + 4, b);
}
void matrix_CpAAt_float (float* C, const float* A, size_t n, size_t p)
{
    size_t i, j, k;
    size_t q = n / 8;
    size_t r = n % 8;

    for (k=0; k<p; k++) {
        float* pC = C;
        for (j=0; j<n; j++) {
            __m128 w = _mm_load1_ps (A+j+k*n);
            const float* pA = A+k*n;
            if (ALGEBRA_IS_ALIGNED(pA) && ALGEBRA_IS_ALIGNED(pC)) {
                for (i=0; i<q; i++) {
                    __m128 i1 = _mm_load_ps(pA);
                    __m128 i2 = _mm_load_ps(pA+4);
                    __m128 o1 = _mm_load_ps(pC);
                    __m128 o2 = _mm_load_ps(pC+4);
                    _mm_store_ps(pC+0, _mm_add_ps(o1, _mm_mul_ps(i1, w)));
                    _mm_store_ps(pC+4, _mm_add_ps(o2, _mm_mul_ps(i2, w)));
                    pA += 8;
                    pC += 8;
                }
            }
            else {
                for (i=0; i<q; i++) {
                    __m128 i1 = _mm_loadu_ps(pA);
                    __m128 i2 = _mm_loadu_ps(pA+4);
                    __m128 o1 = _mm_loadu_ps(pC);
                    __m128 o2 = _mm_loadu_ps(pC+4);
                    _mm_storeu_ps(pC+0, _mm_add_ps(o1, _mm_mul_ps(i1, w)));
                    _mm_storeu_ps(pC+4, _mm_add_ps(o2, _mm_mul_ps(i2, w)));
                    pA += 8;
                    pC += 8;
                }
            }
            for (i=0; i<r; i++) {
                (*pC++) += A[j+k*n] * (*pA++);
            }
        }
    }
}
static void double_sample(stage_t * p, fifo_t * output_fifo)
{
    sox_sample_t * output;
    int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
    rate_shared_t const * s = p->shared;
    dft_filter_t const * f = &s->half_band[p->which];
    int const overlap = f->num_taps - 1;
#ifdef SSE_
    const float * const coeff = f->coefs;
    sox_sample_t tmp;
    __m128 coef, outp, sign;
    sign = SIGN;
#endif

    while (p->rem + p->tuple * num_in >= f->dft_length) {
        div_t divd = div(f->dft_length - overlap - p->rem + p->tuple - 1, p->tuple);
        sox_sample_t const * input = fifo_read_ptr(&p->fifo);
        fifo_read(&p->fifo, divd.quot, NULL);
        num_in -= divd.quot;

        output = fifo_reserve_aligned(output_fifo, f->dft_length);
        fifo_trim_by(output_fifo, overlap);
        memset(output, 0, f->dft_length * sizeof(*output));
        for (j = 0, i = p->rem; i < f->dft_length; ++j, i += p->tuple)
            output[i] = input[j];
        p->rem = p->tuple - 1 - divd.rem;

        ff_rdft_x(f->dft_length, 1, output, f->tmp_buf);
#ifdef SSE_
        output[0] *= coeff[0];
        output[1] *= coeff[1];
        tmp = output[2];
        output[2] = coeff[2] * tmp - coeff[3] * output[3];
        output[3] = coeff[3] * tmp + coeff[2] * output[3];
        for (i = 4; i < f->dft_length; i += 4) {
            outp = _mm_load_ps(output+i);
            coef = _mm_load_ps(coeff+i);
            _mm_store_ps(output+i, ZMUL2(outp, coef, sign));
        }
#else
        output[0] *= f->coefs[0];
        output[1] *= f->coefs[1];
        for (i = 2; i < f->dft_length; i += 2) {
            sox_sample_t tmp = output[i];
            output[i  ] = f->coefs[i  ] * tmp - f->coefs[i+1] * output[i+1];
            output[i+1] = f->coefs[i+1] * tmp + f->coefs[i  ] * output[i+1];
        }
#endif
        ff_rdft_x(f->dft_length, -1, output, f->tmp_buf);
    }
}
void GLVSPosColor(
    const VERTEXINPUT &in,
    VERTEXOUTPUT &out)
{
    // Multiply pos by mvp
    const FLOAT *pMVP = (FLOAT *)in.pConstants;
    Mat4Vec3Multiply(out.pVertices, pMVP, (FLOAT *)&in.pAttribs[SHADER_INPUT_SLOT_POSITION]);

    // Pass color through
    __m128 vColor = _mm_load_ps(&in.pAttribs[SHADER_INPUT_SLOT_COLOR0]);
    _mm_store_ps(&out.pAttribs[0], vColor);
}
void experienceNet::frequencyAdd_sse(float* cResult, const float* cVal, const float* addedVal)
{
    // increase value:
    const __m128* addedVals = (__m128*)addedVal;
    const __m128* cVals = (__m128*)cVal;

    // result = cVal + addedVal
    __m128 result = _mm_add_ps(*cVals, *addedVals);

    // return result:
    _mm_store_ps(cResult, result);
}
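/* Note: dereferencing a float* through an __m128* pointer, as above,
   compiles to an aligned load, so cVal and addedVal must be 16-byte aligned;
   the closing _mm_store_ps imposes the same requirement on cResult. */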
void dct4x4_1d_llm_inv_sse(float* s, float* d)
{
    const __m128 c2 = _mm_set1_ps(1.30656f);  // cos(CV_PI*2/16.0)*sqrt(2)
    const __m128 c6 = _mm_set1_ps(0.541196f); // cos(CV_PI*6/16.0)*sqrt(2)

    __m128 s0 = _mm_load_ps(s); s += 4;
    __m128 s1 = _mm_load_ps(s); s += 4;
    __m128 s2 = _mm_load_ps(s); s += 4;
    __m128 s3 = _mm_load_ps(s);

    __m128 t10 = _mm_add_ps(s0, s2);
    __m128 t12 = _mm_sub_ps(s0, s2);
    __m128 t0 = _mm_add_ps(_mm_mul_ps(c2, s1), _mm_mul_ps(c6, s3));
    __m128 t2 = _mm_sub_ps(_mm_mul_ps(c6, s1), _mm_mul_ps(c2, s3));

    _mm_store_ps(d, _mm_add_ps(t10, t0));
    _mm_store_ps(d + 4, _mm_add_ps(t12, t2));
    _mm_store_ps(d + 8, _mm_sub_ps(t12, t2));
    _mm_store_ps(d + 12, _mm_sub_ps(t10, t0));
}
void VertexFuncConstColor(
    const VERTEXINPUT &in,
    VERTEXOUTPUT &out)
{
    const FLOAT *pWorldMat = (FLOAT *)in.pConstants;
    const FLOAT *pMatColor = (FLOAT *)in.pConstants + CONST_OFFSET_MAT_COLOR;

    Mat4Vec3Multiply(out.pVertices, pWorldMat, (FLOAT *)in.pAttribs);

    __m128 matColor = _mm_load_ps(pMatColor);
    _mm_store_ps(&out.pAttribs[0], matColor);
}
void fDCT4x4_32f_and_threshold_and_iDCT4x4_32f(float* s, float threshold)
{
    dct4x4_1d_llm_fwd_sse_and_transpose(s, s);
#ifdef _KEEP_00_COEF_
    fDCT2D4x4_and_threshold_keep00_32f(s, s, 4 * threshold);
#else
    fDCT2D8x4_and_threshold_32f(s, s, 4 * threshold);
#endif
    // omitting transform: transpose4x4(s);
    dct4x4_1d_llm_inv_sse_and_transpose(s, s); // transpose4x4(s);
    dct4x4_1d_llm_inv_sse(s, s);
    // omitting transform: transpose4x4(s);
    __m128 c = _mm_set1_ps(0.06250f);
    _mm_store_ps(s, _mm_mul_ps(_mm_load_ps(s), c));
    _mm_store_ps(s + 4, _mm_mul_ps(_mm_load_ps(s + 4), c));
    _mm_store_ps(s + 8, _mm_mul_ps(_mm_load_ps(s + 8), c));
    _mm_store_ps(s + 12, _mm_mul_ps(_mm_load_ps(s + 12), c));
}
void ProxyRwSse2 <SplFmt_FLOAT>::write_flt_partial (const Ptr::Type &ptr, const __m128 &src0, const __m128 &src1, const __m128i &/*mask_lsb*/, const __m128i &/*sign_bit*/, const __m128 &/*offset*/, int len)
{
    if (len >= 4) {
        _mm_store_ps (ptr, src0);
        fstb::ToolsSse2::store_ps_partial (ptr + 4, src1, len - 4);
    }
    else {
        fstb::ToolsSse2::store_ps_partial (ptr, src0, len);
    }
}
void nv_vector_normalize_L2(nv_matrix_t *v, int vm)
{
#if NV_ENABLE_SSE2
    {
        const int i_lp = (v->n & 0xfffffffc);
        __m128 x, u;
        NV_ALIGNED(float, mm[4], 16);
        int i;
        float dp;

        // accumulate the squared norm four lanes at a time
        u = _mm_setzero_ps();
        for (i = 0; i < i_lp; i += 4) {
            x = _mm_load_ps(&NV_MAT_V(v, vm, i));
            u = _mm_add_ps(u, _mm_mul_ps(x, x));
        }
        _mm_store_ps(mm, u);
        dp = mm[0] + mm[1] + mm[2] + mm[3];
        for (i = i_lp; i < v->n; ++i) {
            dp += NV_MAT_V(v, vm, i) * NV_MAT_V(v, vm, i);
        }
        if (dp > 0.0f) {
            float scale = 1.0f / sqrtf(dp);
            x = _mm_set1_ps(scale);
            for (i = 0; i < i_lp; i += 4) {
                _mm_store_ps(&NV_MAT_V(v, vm, i),
                             _mm_mul_ps(*(const __m128*)&NV_MAT_V(v, vm, i), x));
            }
            // scale the scalar tail by the same factor
            for (i = i_lp; i < v->n; ++i) {
                NV_MAT_V(v, vm, i) *= scale;
            }
        }
    }
#else
    float norm = nv_vector_norm(v, vm);
    if (norm > 0.0f) {
        float scale = 1.0f / norm;
        nv_vector_muls(v, vm, v, vm, scale);
    }
#endif
}
void fDCT2D4x4_and_threshold_keep00_32f(float* s, float* d, float thresh)
{
    const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
    const __m128 mth = _mm_set1_ps(thresh);
    const __m128 zeros = _mm_setzero_ps();
    const __m128 c2 = _mm_set1_ps(1.30656f);  // cos(CV_PI*2/16.0)*sqrt(2)
    const __m128 c6 = _mm_set1_ps(0.541196f); // cos(CV_PI*6/16.0)*sqrt(2)

    __m128 s0 = _mm_load_ps(s); s += 4;
    __m128 s1 = _mm_load_ps(s); s += 4;
    __m128 s2 = _mm_load_ps(s); s += 4;
    __m128 s3 = _mm_load_ps(s);

    __m128 p03 = _mm_add_ps(s0, s3);
    __m128 p12 = _mm_add_ps(s1, s2);
    __m128 m03 = _mm_sub_ps(s0, s3);
    __m128 m12 = _mm_sub_ps(s1, s2);

    __m128 v = _mm_add_ps(p03, p12);
    __m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    // keep 00 coef.
    __m128 v2 = _mm_blendv_ps(zeros, v, msk);
    v2 = _mm_blend_ps(v2, v, 1);
    _mm_store_ps(d, v2);

    v = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(d + 4, v);

    v = _mm_sub_ps(p03, p12);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(d + 8, v);

    v = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(d + 12, v);
}
void OptimizedSelfAdjointMatrix6x6f::rankUpdate(const Eigen::Matrix<float, 6, 1>& u, const float& alpha)
{
    // Accumulates the upper triangle of alpha * u * u^T, stored as 2x2 blocks.
    __m128 s = _mm_set1_ps(alpha);
    __m128 v1234 = _mm_loadu_ps(u.data());
    __m128 v56xx = _mm_loadu_ps(u.data() + 4);

    __m128 v1212 = _mm_movelh_ps(v1234, v1234);
    __m128 v3434 = _mm_movehl_ps(v1234, v1234);
    __m128 v5656 = _mm_movelh_ps(v56xx, v56xx);

    __m128 v1122 = _mm_mul_ps(s, _mm_unpacklo_ps(v1212, v1212));

    _mm_store_ps(data + 0, _mm_add_ps(_mm_load_ps(data + 0), _mm_mul_ps(v1122, v1212)));
    _mm_store_ps(data + 4, _mm_add_ps(_mm_load_ps(data + 4), _mm_mul_ps(v1122, v3434)));
    _mm_store_ps(data + 8, _mm_add_ps(_mm_load_ps(data + 8), _mm_mul_ps(v1122, v5656)));

    __m128 v3344 = _mm_mul_ps(s, _mm_unpacklo_ps(v3434, v3434));

    _mm_store_ps(data + 12, _mm_add_ps(_mm_load_ps(data + 12), _mm_mul_ps(v3344, v3434)));
    _mm_store_ps(data + 16, _mm_add_ps(_mm_load_ps(data + 16), _mm_mul_ps(v3344, v5656)));

    __m128 v5566 = _mm_mul_ps(s, _mm_unpacklo_ps(v5656, v5656));

    _mm_store_ps(data + 20, _mm_add_ps(_mm_load_ps(data + 20), _mm_mul_ps(v5566, v5656)));
}