void SubVectorsSIMD(float* c, const float* a, const float* b, std::size_t n)
{
    std::size_t i = 0;
    for (; i < ROUND_DOWN(n, 4); i += 4)
    {
        __m128 ma = _mm_loadu_ps(a + i);
        __m128 mb = _mm_loadu_ps(b + i);
        __m128 mc = _mm_sub_ps(ma, mb);
        _mm_storeu_ps(c + i, mc);
    }
    for (; i < n; i++)
    {
        c[i] = a[i] - b[i];
    }
}
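/* ROUND_DOWN is not defined in the snippet above. A minimal sketch of a
 * definition consistent with its use (assumes the step is a power of two,
 * which holds for the step of 4 used here): */
#define ROUND_DOWN(x, s) ((x) & ~((std::size_t)(s) - 1))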
void dct4x4_1d_llm_inv_sse(float* s, float* d)
{
    const __m128 c2 = _mm_set1_ps(1.30656f);  // cos(CV_PI*2/16.0)*sqrt(2)
    const __m128 c6 = _mm_set1_ps(0.541196f); // cos(CV_PI*6/16.0)*sqrt(2)

    __m128 s0 = _mm_load_ps(s); s += 4;
    __m128 s1 = _mm_load_ps(s); s += 4;
    __m128 s2 = _mm_load_ps(s); s += 4;
    __m128 s3 = _mm_load_ps(s);

    __m128 t10 = _mm_add_ps(s0, s2);
    __m128 t12 = _mm_sub_ps(s0, s2);
    __m128 t0 = _mm_add_ps(_mm_mul_ps(c2, s1), _mm_mul_ps(c6, s3));
    __m128 t2 = _mm_sub_ps(_mm_mul_ps(c6, s1), _mm_mul_ps(c2, s3));

    _mm_store_ps(d, _mm_add_ps(t10, t0));
    _mm_store_ps(d + 4, _mm_add_ps(t12, t2));
    _mm_store_ps(d + 8, _mm_sub_ps(t12, t2));
    _mm_store_ps(d + 12, _mm_sub_ps(t10, t0));
}
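/* For clarity, a scalar reference of the same 4-point inverse butterfly; each
 * __m128 above carries one such element for four columns at once. This sketch
 * (and its name) is ours, not part of the original source. */
void idct4_1d_scalar(const float s[4], float d[4])
{
    const float c2 = 1.30656f;  /* cos(2*pi/16) * sqrt(2) */
    const float c6 = 0.541196f; /* cos(6*pi/16) * sqrt(2) */
    float t10 = s[0] + s[2];    /* even part */
    float t12 = s[0] - s[2];
    float t0 = c2 * s[1] + c6 * s[3]; /* odd part */
    float t2 = c6 * s[1] - c2 * s[3];
    d[0] = t10 + t0;
    d[1] = t12 + t2;
    d[2] = t12 - t2;
    d[3] = t10 - t0;
}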
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;

        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;
        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }
        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}
/** Compute an approximate sine (SSE version, four sines a call).
 * This function behaves correctly for the range [-pi, pi] only.
 * It has the following properties:
 * <ul>
 * <li>It has exact values for 0, pi/2, pi, -pi/2, -pi</li>
 * <li>It has matching derivatives to sine for these same points</li>
 * <li>Its relative error is within about 1%</li>
 * <li>Its computational cost is 5 mults + 3 adds + 2 abs</li>
 * </ul>
 * @param t Radian parameter
 * @return The approximate sine of each lane of t
 */
static inline __m128 sinf_fast_sse(__m128 t)
{
    static const __m128 a = {4.f/(M_PI*M_PI), 4.f/(M_PI*M_PI), 4.f/(M_PI*M_PI), 4.f/(M_PI*M_PI)};
    static const __m128 p = {0.225f, 0.225f, 0.225f, 0.225f};
    static const __m128 pi = {M_PI, M_PI, M_PI, M_PI};

    // m4 = a*t*(M_PI - fabsf(t));   (_mm_abs_ps is a sign-bit-clearing helper, not a standard intrinsic)
    __m128 m1 = _mm_abs_ps(t);
    __m128 m2 = _mm_sub_ps(pi, m1);
    __m128 m3 = _mm_mul_ps(t, m2);
    __m128 m4 = _mm_mul_ps(a, m3);

    // p*(m4*fabsf(m4) - m4) + m4;
    __m128 n1 = _mm_abs_ps(m4);
    __m128 n2 = _mm_mul_ps(m4, n1);
    __m128 n3 = _mm_sub_ps(n2, m4);
    __m128 n4 = _mm_mul_ps(p, n3);
    return _mm_add_ps(n4, m4);
}
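/* The vector code implements the well-known parabola approximation
 * sin(t) ~ (4/pi^2) * t * (pi - |t|), refined once by y <- p*(y*|y| - y) + y.
 * A scalar sketch of the same computation (ours, for illustration only): */
#include <math.h>

static inline float sinf_fast_scalar(float t) /* valid for t in [-pi, pi] */
{
    const float a = 4.0f / (float)(M_PI * M_PI);
    const float p = 0.225f;
    float y = a * t * ((float)M_PI - fabsf(t)); /* parabola, exact at 0, +-pi/2, +-pi */
    return p * (y * fabsf(y) - y) + y;          /* refinement step */
}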
ArrayFloat MathlibSSE2::Cos4( ArrayFloat x )
{
    // Map an arbitrary angle x to the range [-pi; +pi] without using division.
    // Code taken from MSDN's HLSL trick. Architectures with fused multiply-add
    // (e.g. NEON) can replace the add, the sub, and the two muls with two mads.
    ArrayFloat integralPart;

    x = _mm_add_ps( _mm_mul_ps( x, ONE_DIV_2PI ), HALF );
    x = Modf4( x, integralPart );
    x = _mm_sub_ps( _mm_mul_ps( x, TWO_PI ), PI );

    return cos_ps( x );
}
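/* The range reduction above is x <- frac(x/(2*pi) + 1/2) * 2*pi - pi, with
 * Modf4 supplying the fractional part. A scalar sketch under that reading
 * (ours, not part of the original source): */
#include <math.h>

static inline float reduce_to_pi(float x)
{
    float t = x * (float)(1.0 / (2.0 * M_PI)) + 0.5f;
    t -= floorf(t);                       /* fractional part, like Modf4 */
    return t * (float)(2.0 * M_PI) - (float)M_PI;
}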
void fDCT2D4x4_and_threshold_keep00_32f(float* s, float* d, float thresh)
{
    const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
    const __m128 mth = _mm_set1_ps(thresh);
    const __m128 zeros = _mm_setzero_ps();
    const __m128 c2 = _mm_set1_ps(1.30656f);  // cos(CV_PI*2/16.0)*sqrt(2)
    const __m128 c6 = _mm_set1_ps(0.541196f); // cos(CV_PI*6/16.0)*sqrt(2)

    __m128 s0 = _mm_load_ps(s); s += 4;
    __m128 s1 = _mm_load_ps(s); s += 4;
    __m128 s2 = _mm_load_ps(s); s += 4;
    __m128 s3 = _mm_load_ps(s);

    __m128 p03 = _mm_add_ps(s0, s3);
    __m128 p12 = _mm_add_ps(s1, s2);
    __m128 m03 = _mm_sub_ps(s0, s3);
    __m128 m12 = _mm_sub_ps(s1, s2);

    __m128 v = _mm_add_ps(p03, p12);
    __m128 msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    // keep 00 coef.
    __m128 v2 = _mm_blendv_ps(zeros, v, msk);
    v2 = _mm_blend_ps(v2, v, 1);
    _mm_store_ps(d, v2);

    v = _mm_add_ps(_mm_mul_ps(c2, m03), _mm_mul_ps(c6, m12));
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(d + 4, v);

    v = _mm_sub_ps(p03, p12);
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(d + 8, v);

    v = _mm_sub_ps(_mm_mul_ps(c6, m03), _mm_mul_ps(c2, m12));
    msk = _mm_cmpgt_ps(_mm_and_ps(v, *(const __m128*)v32f_absmask), mth);
    v = _mm_blendv_ps(zeros, v, msk);
    _mm_store_ps(d + 12, v);
}
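/* Scalar view of the hard threshold applied above: zero every coefficient
 * whose magnitude does not exceed thresh. The _mm_blend_ps(v2, v, 1) step
 * re-inserts lane 0 so the DC coefficient d[0] always survives.
 * This helper is a sketch of ours, not part of the original source. */
#include <math.h>

static inline float hard_threshold(float v, float thresh)
{
    return (fabsf(v) > thresh) ? v : 0.0f;
}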
// ----------------------------------------------------------
// Name: matrix::Determinant
// Desc: Return the matrix determinant. A = det[M].
// ----------------------------------------------------------
float matrix::Determinant()
{
#ifdef _M_IX86
    __m128 Va, Vb, Vc;
    __m128 r1, r2, r3, t1, t2, sum;
    F32vec4 Det;

    // First, let's calculate the first four minterms of the first line
    t1 = _L4;
    t2 = _mm_ror_ps(_L3, 1);
    Vc = _mm_mul_ps(t2, _mm_ror_ps(t1, 0)); // V3'·V4
    Va = _mm_mul_ps(t2, _mm_ror_ps(t1, 2)); // V3'·V4"
    Vb = _mm_mul_ps(t2, _mm_ror_ps(t1, 3)); // V3'·V4^

    r1 = _mm_sub_ps(_mm_ror_ps(Va, 1), _mm_ror_ps(Vc, 2)); // V3"·V4^ - V3^·V4"
    r2 = _mm_sub_ps(_mm_ror_ps(Vb, 2), _mm_ror_ps(Vb, 0)); // V3^·V4' - V3'·V4^
    r3 = _mm_sub_ps(_mm_ror_ps(Va, 0), _mm_ror_ps(Vc, 1)); // V3'·V4" - V3"·V4'

    Va = _mm_ror_ps(_L2, 1);
    sum = _mm_mul_ps(Va, r1);
    Vb = _mm_ror_ps(Va, 1);
    sum = _mm_add_ps(sum, _mm_mul_ps(Vb, r2));
    Vc = _mm_ror_ps(Vb, 1);
    sum = _mm_add_ps(sum, _mm_mul_ps(Vc, r3));

    // Now we can calculate the determinant:
    Det = _mm_mul_ps(sum, _L1);
    Det = _mm_add_ps(Det, _mm_movehl_ps(Det, Det));
    Det = _mm_sub_ss(Det, _mm_shuffle_ps(Det, Det, 1));
    return Det[0];
#else
    // TODO
    return 0.0f;
#endif // _M_IX86
}
void btSequentialImpulseConstraintSolver::resolveSplitPenetrationSIMD(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c)
{
#ifdef USE_SIMD
    if (!c.m_rhsPenetration)
        return;

    gNumSplitImpulseRecoveries++;

    __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedPushImpulse);
    __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
    __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
    __m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhsPenetration), _mm_mul_ps(_mm_set1_ps(c.m_appliedPushImpulse), _mm_set1_ps(c.m_cfm)));
    __m128 deltaVel1Dotn = _mm_add_ps(_vmathVfDot3(c.m_contactNormal.mVec128, body1.m_pushVelocity.mVec128), _vmathVfDot3(c.m_relpos1CrossNormal.mVec128, body1.m_turnVelocity.mVec128));
    __m128 deltaVel2Dotn = _mm_sub_ps(_vmathVfDot3(c.m_relpos2CrossNormal.mVec128, body2.m_turnVelocity.mVec128), _vmathVfDot3((c.m_contactNormal).mVec128, body2.m_pushVelocity.mVec128));
    deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
    deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv)));

    btSimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse);
    btSimdScalar resultLowerLess, resultUpperLess;
    resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1);
    resultUpperLess = _mm_cmplt_ps(sum, upperLimit1);

    __m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp);
    deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse));
    c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum));

    __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128, body1.m_invMass.mVec128);
    __m128 linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128, body2.m_invMass.mVec128);
    __m128 impulseMagnitude = deltaImpulse;
    body1.m_pushVelocity.mVec128 = _mm_add_ps(body1.m_pushVelocity.mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude));
    body1.m_turnVelocity.mVec128 = _mm_add_ps(body1.m_turnVelocity.mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude));
    body2.m_pushVelocity.mVec128 = _mm_sub_ps(body2.m_pushVelocity.mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude));
    body2.m_turnVelocity.mVec128 = _mm_add_ps(body2.m_turnVelocity.mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude));
#else
    resolveSplitPenetrationImpulseCacheFriendly(body1, body2, c);
#endif
}
void btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btRigidBody& body1, btRigidBody& body2, const btSolverConstraint& c)
{
#ifdef USE_SIMD
    __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
    __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
    __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
    __m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse), _mm_set1_ps(c.m_cfm)));
    __m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128));
    __m128 deltaVel2Dotn = _mm_sub_ps(btSimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128), btSimdDot3((c.m_contactNormal).mVec128, body2.internalGetDeltaLinearVelocity().mVec128));
    deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
    deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv)));

    btSimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse);
    btSimdScalar resultLowerLess, resultUpperLess;
    resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1);
    resultUpperLess = _mm_cmplt_ps(sum, upperLimit1);

    __m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp);
    deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse));
    c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum));

    __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128, body1.internalGetInvMass().mVec128);
    __m128 linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128, body2.internalGetInvMass().mVec128);
    __m128 impulseMagnitude = deltaImpulse;
    body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude));
    body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude));
    body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude));
    body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude));
#else
    resolveSingleConstraintRowLowerLimit(body1, body2, c);
#endif
}
SIMDValue SIMDUint32x4Operation::OpFromFloat32x4(const SIMDValue& value, bool& throws)
{
    X86SIMDValue x86Result = { 0 };
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
    X86SIMDValue temp, temp2;
    X86SIMDValue two_31_f4, two_31_i4;
    int mask = 0;

    // any lanes < 0 ?
    temp.m128_value = _mm_cmplt_ps(v.m128_value, X86_ALL_ZEROS.m128_value);
    mask = _mm_movemask_ps(temp.m128_value);
    // negative values are out of range; the caller should throw a RangeError
    if (mask)
    {
        throws = true;
        return X86SIMDValue::ToSIMDValue(x86Result);
    }

    // CVTTPS2DQ does a range check over the signed range [-2^31, 2^31-1], so it
    // will fail to convert values >= 2^31. To fix this, subtract 2^31 from
    // values >= 2^31, do CVTTPS2DQ, then add 2^31 back.
    _mm_store_ps(two_31_f4.simdValue.f32, X86_TWO_31_F4.m128_value);
    // any lanes >= 2^31 ?
    temp.m128_value = _mm_cmpge_ps(v.m128_value, two_31_f4.m128_value);
    // two_31_f4 has f32(2^31) for lanes >= 2^31, 0 otherwise
    two_31_f4.m128_value = _mm_and_ps(two_31_f4.m128_value, temp.m128_value);
    // subtract 2^31 from lanes >= 2^31, leave others unchanged
    v.m128_value = _mm_sub_ps(v.m128_value, two_31_f4.m128_value);

    // CVTTPS2DQ
    x86Result.m128i_value = _mm_cvttps_epi32(v.m128_value);

    // check if any value is out of range (i.e. >= 2^31, meaning originally >= 2^32 before the adjustment)
    temp2.m128i_value = _mm_cmpeq_epi32(x86Result.m128i_value, X86_NEG_MASK_F4.m128i_value); // any value == 0x80000000 ?
    mask = _mm_movemask_ps(temp2.m128_value);
    if (mask)
    {
        throws = true;
        return X86SIMDValue::ToSIMDValue(x86Result);
    }

    // range checks passed; add 2^31 back to the adjusted values.
    // Use the first bit from the 2^31 float mask (0x4f000...0 << 1) ANDed with the
    // 2^31 int mask (0x8000..0), which clears the bit for lanes that weren't adjusted.
    _mm_store_ps(two_31_i4.simdValue.f32, X86_TWO_31_I4.m128_value);
    two_31_f4.m128i_value = _mm_slli_epi32(two_31_f4.m128i_value, 1);
    two_31_i4.m128i_value = _mm_and_si128(two_31_i4.m128i_value, two_31_f4.m128i_value);

    // add 2^31 back to the adjusted values
    // Note: at this point all values are in [0, 2^31-1], so adding 2^31 cannot overflow.
    x86Result.m128i_value = _mm_add_epi32(x86Result.m128i_value, two_31_i4.m128i_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
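/* Scalar outline of the same range-adjusted conversion (a sketch of ours;
 * the vector code above detects the >= 2^32 case after conversion instead
 * of up front). Returns 1 on success, 0 if the value is out of range. */
#include <stdint.h>

static int f32_to_u32_scalar(float f, uint32_t *out)
{
    if (f < 0.0f) return 0;                  /* negative: out of range */
    if (f >= 4294967296.0f) return 0;        /* >= 2^32: out of range */
    if (f >= 2147483648.0f)                  /* >= 2^31: bias down, convert, bias back */
        *out = (uint32_t)(f - 2147483648.0f) + 0x80000000u;
    else
        *out = (uint32_t)f;
    return 1;
}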
INLINE const SVec4 &SVec4::operator-=(const SVec4 &rhs)
{
#ifdef USE_SSE
    m_128 = _mm_sub_ps( m_128, rhs.m_128 );
#else
    m_x -= rhs.X();
    m_y -= rhs.Y();
    m_z -= rhs.Z();
    m_w -= rhs.W();
#endif
    return *this;
}
INLINE SVec4 SVec4::operator-(const SVec4 &rhs) const
{
#ifdef USE_SSE
    return SVec4( _mm_sub_ps( m_128, rhs.m_128 ) );
#else
    return { m_x - rhs.X(), m_y - rhs.Y(), m_z - rhs.Z(), m_w - rhs.W() };
#endif
}
float interpolate_tricubic_SSE(float* tex, float3 coord, uint3 volumeExtent)
{
    // transform the coordinate from [0, extent] to [-0.5, extent-0.5]
    const __m128 coord_grid = _mm_sub_ps(_mm_set_ps(coord.x, coord.y, coord.z, 0.5f), _mm_set1_ps(0.5f)); // coord_grid = coord - 0.5f;
    __m128 indexF = _mm_cvtepi32_ps(_mm_cvttps_epi32(coord_grid)); // indexF = floor(coord_grid);
    const __m128 fraction = _mm_sub_ps(coord_grid, indexF);        // fraction = coord_grid - indexF;
    // clamp between 1 and volumeExtent-3
    indexF = _mm_max_ps(indexF, _mm_set1_ps(1.0f));
    indexF = _mm_min_ps(indexF, _mm_cvtepi32_ps(_mm_sub_epi32(_mm_set_epi32(volumeExtent.x, volumeExtent.y, volumeExtent.z, 4), _mm_set1_epi32(3))));

    // note that x, y, z are located in registers 3, 2, 1
    __m128 bspline_x = bsplineSSE(fraction.m128_f32[3]);
    __m128 bspline_y = bsplineSSE(fraction.m128_f32[2]);
    __m128 bspline_z = bsplineSSE(fraction.m128_f32[1]);

    // load the data
    __m128 m0[16];
    __m128i index = _mm_sub_epi32(_mm_cvttps_epi32(indexF), _mm_set1_epi32(1)); // index = indexF - 1
    const float* p0 = tex + (index.m128i_i32[1] * volumeExtent.y + index.m128i_i32[2]) * volumeExtent.x + index.m128i_i32[3];
    const size_t slice = volumeExtent.x * volumeExtent.y;
    for (int z = 0, i = 0; z < 4; z++)
    {
        const float* p1 = p0 + z * slice;
        for (int y = 0; y < 4; y++, i++)
        {
            m0[i] = _mm_set_ps(p1[0], p1[1], p1[2], p1[3]);
            p1 += volumeExtent.x;
        }
    }

    // convolution
    __m128 m1[4] = {
        convolute_loop(bspline_x, m0),
        convolute_loop(bspline_x, m0 + 4),
        convolute_loop(bspline_x, m0 + 8),
        convolute_loop(bspline_x, m0 + 12)};
    return dot_product(bspline_z, convolute_loop(bspline_y, m1));
}
inline wg_v4sf Recognizer::local_distance4(float *s, float *t0, float *t1, float *t2, float *t3)
{
    wg_v4sf v_, v0, v1, v2, v3;
    v_.v = _mm_set_ps1(-0.0);
    v0.v = _mm_sub_ps(((wg_v4sf *)t0)->v, ((wg_v4sf *)s)->v);
    v0.v = _mm_andnot_ps(v_.v, v0.v); // absolute value
    v1.v = _mm_sub_ps(((wg_v4sf *)t1)->v, ((wg_v4sf *)s)->v);
    v1.v = _mm_andnot_ps(v_.v, v1.v); // absolute value
    v2.v = _mm_sub_ps(((wg_v4sf *)t2)->v, ((wg_v4sf *)s)->v);
    v2.v = _mm_andnot_ps(v_.v, v2.v); // absolute value
    v3.v = _mm_sub_ps(((wg_v4sf *)t3)->v, ((wg_v4sf *)s)->v);
    v3.v = _mm_andnot_ps(v_.v, v3.v); // absolute value
    // convert row vectors to column vectors
    _MM_TRANSPOSE4_PS(v0.v, v1.v, v2.v, v3.v);
    v3.v = _mm_add_ps(v3.v, v2.v);
    v3.v = _mm_add_ps(v3.v, v1.v);
    v3.v = _mm_add_ps(v3.v, v0.v);
    return v3;
}
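/* The absolute-value idiom above relies on -0.0f carrying only the sign bit
 * (0x80000000), so andnot-ing it away yields |x|. A scalar sketch of ours,
 * for illustration: */
#include <stdint.h>
#include <string.h>

static inline float abs_via_signbit(float x)
{
    uint32_t u;
    memcpy(&u, &x, sizeof u);
    u &= 0x7fffffffu;             /* clear the sign bit */
    memcpy(&x, &u, sizeof x);
    return x;
}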
void operator()(CONTAINER *target, const CONTAINER& oldSelf, const CONTAINER& neighbor)
{
    __m128 forceOffset = _mm_set1_ps(FORCE_OFFSET);

    // caution: each j iteration re-reads velocities from oldSelf and rewrites
    // the full target arrays, so the parallel-for over j makes those stores race.
#ifndef NO_OMP
#pragma omp parallel for schedule(static)
#endif
    for (int j = 0; j < CONTAINER_SIZE; ++j) {
        __m128 neighborPosX = _mm_set1_ps(neighbor.posX[j]);
        __m128 neighborPosY = _mm_set1_ps(neighbor.posY[j]);
        __m128 neighborPosZ = _mm_set1_ps(neighbor.posZ[j]);

        for (int i = 0; i < CONTAINER_SIZE; i += 4) {
            __m128 oldSelfPosX = _mm_load_ps(oldSelf.posX + i);
            __m128 oldSelfPosY = _mm_load_ps(oldSelf.posY + i);
            __m128 oldSelfPosZ = _mm_load_ps(oldSelf.posZ + i);
            __m128 myVelX = _mm_load_ps(oldSelf.velX + i);
            __m128 myVelY = _mm_load_ps(oldSelf.velY + i);
            __m128 myVelZ = _mm_load_ps(oldSelf.velZ + i);

            __m128 deltaX = _mm_sub_ps(oldSelfPosX, neighborPosX);
            __m128 deltaY = _mm_sub_ps(oldSelfPosY, neighborPosY);
            __m128 deltaZ = _mm_sub_ps(oldSelfPosZ, neighborPosZ);

            __m128 dist2 = _mm_add_ps(forceOffset, _mm_mul_ps(deltaX, deltaX));
            dist2 = _mm_add_ps(dist2, _mm_mul_ps(deltaY, deltaY));
            dist2 = _mm_add_ps(dist2, _mm_mul_ps(deltaZ, deltaZ));

            __m128 force = _mm_rsqrt_ps(dist2);

            myVelX = _mm_add_ps(myVelX, _mm_mul_ps(force, deltaX));
            myVelY = _mm_add_ps(myVelY, _mm_mul_ps(force, deltaY));
            myVelZ = _mm_add_ps(myVelZ, _mm_mul_ps(force, deltaZ));

            _mm_store_ps(target->velX + i, myVelX);
            _mm_store_ps(target->velY + i, myVelY);
            _mm_store_ps(target->velZ + i, myVelZ);
        }
    }
}
float filter(float in, Filter *c)
{
    float *a = c->a;
    float *b = c->b;
    int n = c->n;
    int nmax = c->nmax;

    // shift the x and y delay lines by one sample and push the new input
    float *x = c->x + nmax;
    float *y = c->y + nmax;
    float *xend = c->x;
    while (x != xend) {
        *(x) = *(x-1);
        *(y) = *(y-1);
        x--;
        y--;
    }
    *(x) = in;
    *(c->a) = 0.0; // zero a[0] so the not-yet-computed y[0] drops out of the sum

    // work[i] = b[i]*x[i] - a[i]*y[i], four taps at a time
    __m128 *mwork = (__m128*)c->work;
    __m128 *mworkend = (__m128*)c->workend;
    __m128 xb;
    __m128 ya;
    __m128* mb = (__m128*)b;
    __m128* ma = (__m128*)a;
    __m128* mx = (__m128*)x;
    __m128* my = (__m128*)y;
    while (mwork <= mworkend) {
        xb = _mm_mul_ps(*mb, *mx);
        ya = _mm_mul_ps(*ma, *my);
        *(mwork) = _mm_sub_ps(xb, ya);
        mwork++; mb++; ma++; mx++; my++;
    }

    // horizontal sum of the work buffer gives the new output sample
    float out = 0.0;
    float *work = c->work;
    float *workend = c->workend;
    while (work <= workend) {
        out += *(work);
        work++;
    }
    *(c->y) = out;
    *(c->a) = 1.0; // restore a[0]
    return out;
}
BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    const __m128 one = _mm_set_ss(1.f);
    __m128 minPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), one));
    __m128 maxPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), one));
    __m128 centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
    __m128 halfSize = _mm_sub_ps(centerPoint, minPt);

    __m128 m0 = _mm_loadu_ps(&transform.m00_);
    __m128 m1 = _mm_loadu_ps(&transform.m10_);
    __m128 m2 = _mm_loadu_ps(&transform.m20_);

    __m128 r0 = _mm_mul_ps(m0, centerPoint);
    __m128 r1 = _mm_mul_ps(m1, centerPoint);
    __m128 t0 = _mm_add_ps(_mm_unpacklo_ps(r0, r1), _mm_unpackhi_ps(r0, r1));
    __m128 r2 = _mm_mul_ps(m2, centerPoint);
    const __m128 zero = _mm_setzero_ps();
    __m128 t2 = _mm_add_ps(_mm_unpacklo_ps(r2, zero), _mm_unpackhi_ps(r2, zero));
    __m128 newCenter = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));

    const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 x = _mm_and_ps(absMask, _mm_mul_ps(m0, halfSize));
    __m128 y = _mm_and_ps(absMask, _mm_mul_ps(m1, halfSize));
    __m128 z = _mm_and_ps(absMask, _mm_mul_ps(m2, halfSize));
    t0 = _mm_add_ps(_mm_unpacklo_ps(x, y), _mm_unpackhi_ps(x, y));
    t2 = _mm_add_ps(_mm_unpacklo_ps(z, zero), _mm_unpackhi_ps(z, zero));
    __m128 newDir = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));

    return BoundingBox(_mm_sub_ps(newCenter, newDir), _mm_add_ps(newCenter, newDir));
#else
    Vector3 newCenter = transform * Center();
    Vector3 oldEdge = Size() * 0.5f;
    Vector3 newEdge = Vector3(
        Abs(transform.m00_) * oldEdge.x_ + Abs(transform.m01_) * oldEdge.y_ + Abs(transform.m02_) * oldEdge.z_,
        Abs(transform.m10_) * oldEdge.x_ + Abs(transform.m11_) * oldEdge.y_ + Abs(transform.m12_) * oldEdge.z_,
        Abs(transform.m20_) * oldEdge.x_ + Abs(transform.m21_) * oldEdge.y_ + Abs(transform.m22_) * oldEdge.z_
    );

    return BoundingBox(newCenter - newEdge, newCenter + newEdge);
#endif
}
/**
 * Return two random floats from 0 to 1 in positions 0 and 2.
 */
__m128 random_number2()
{
    // advance the multiplicative-congruential state (two 32x32->64 products)
    mirand2 = _mm_mul_epu32(mirand2, _mm_set1_epi32(16807));
    __m128i tmp;
    tmp = _mm_and_si128(mirand2, _mm_set1_epi32(0x007fffff)); // keep 23 mantissa bits
    tmp = _mm_or_si128(tmp, _mm_set1_epi32(0x3f800000));      // exponent of 1.0f -> [1,2)
    __m128 ret = _mm_castsi128_ps(tmp);
    ret = _mm_sub_ps(ret, _mm_set1_ps(1));                    // shift to [0,1)
    return ret;
}
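/* Scalar sketch of the bit trick above (ours, for illustration): graft 23
 * random mantissa bits onto the exponent of 1.0f (0x3f800000) to get a float
 * in [1,2), then subtract 1 to land in [0,1). */
#include <stdint.h>
#include <string.h>

static inline float bits_to_unit_float(uint32_t r)
{
    uint32_t u = (r & 0x007fffffu) | 0x3f800000u;
    float f;
    memcpy(&f, &u, sizeof f);
    return f - 1.0f;
}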
void sub_mad(register float *dst, register float *src1, register float *src2, float c0, int w)
{
    register int j = 0;
#if CV_SSE
    if (CPU_SUPPORT_SSE1)
    {
        __m128 a, b, c;
        __m128 cnst = _mm_set_ps1(c0);
        for (; j < w - 3; j += 4)
        {
            a = _mm_loadu_ps(src1 + j);
            b = _mm_loadu_ps(src2 + j);
            b = _mm_mul_ps(b, a);
            c = _mm_loadu_ps(dst + j);
            c = _mm_sub_ps(c, cnst);
            c = _mm_sub_ps(c, b);
            _mm_storeu_ps(dst + j, c);
        }
    }
#endif
    for (; j < w; j++)
        dst[j] -= src1[j] * src2[j] + c0;
}
void convertCSSE(int num, uint8_t *in, float *out)
{
    int i;
    __m128 sub = _mm_set1_ps(128.0);
    __m128 mul = _mm_set1_ps(1/128.0);
    for (i = 0; i < num; i += 4)
    {
        __m128i val = _mm_loadu_si128((__m128i *)(in + i)); // load 16 bytes, use the low 4
        __m128i ints = _mm_cvtepu8_epi32(val);              // SSE4.1: zero-extend u8 -> i32
        __m128 cvtd = _mm_cvtepi32_ps(ints);
        __m128 res = _mm_mul_ps(_mm_sub_ps(cvtd, sub), mul);
        _mm_storeu_ps(out + i, res);
    }
}
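/* Scalar equivalent of the conversion above (a sketch of ours): map bytes
 * [0,255] to roughly [-1, 1) via (v - 128) / 128. */
#include <stdint.h>

void convertC(int num, const uint8_t *in, float *out)
{
    for (int i = 0; i < num; i++)
        out[i] = (in[i] - 128.0f) * (1.0f / 128.0f);
}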
// Apply the saturation component to the pixel's values
inline void ApplySaturation(__m128& pix, const __m128 saturation)
{
    // Compute luma: dot product of pixel values and the luma weights
    __m128 luma = _mm_mul_ps(pix, LumaWeights);

    // luma = [ x+y , y+x , z+w , w+z ]
    luma = _mm_add_ps(luma, _mm_shuffle_ps(luma, luma, _MM_SHUFFLE(2,3,0,1)));

    // luma = [ x+y+z+w , y+x+w+z , z+w+x+y , w+z+y+x ]
    luma = _mm_add_ps(luma, _mm_shuffle_ps(luma, luma, _MM_SHUFFLE(1,0,3,2)));

    // Apply saturation
    pix = _mm_add_ps(luma, _mm_mul_ps(saturation, _mm_sub_ps(pix, luma)));
}
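/* Scalar view of the final blend above (a sketch of ours): each channel is
 * linearly interpolated between its luma (saturation 0) and its original
 * value (saturation 1); values above 1 oversaturate. */
static inline float saturate_channel(float v, float luma, float sat)
{
    return luma + sat * (v - luma);
}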
__inline __m128 vec_recip3(__m128 v)
{
#ifdef USE_MANUAL_CALLSTACK
    call_stack.enter("vec_recip3()");
#endif
    // obtain estimate
    __m128 estimate = _mm_rcp_ps( v );
    // one round of Newton-Raphson
    __m128 rv = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(ONE, _mm_mul_ps(estimate, v)), estimate), estimate);
#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif
    return rv;
}
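/* Scalar sketch of the Newton-Raphson step above (ours, for illustration):
 * given an estimate e ~ 1/v, one iteration e' = e + e*(1 - v*e) = e*(2 - v*e)
 * roughly doubles the ~12-bit accuracy of _mm_rcp_ps. */
static inline float recip_refine(float v, float e)
{
    return e + e * (1.0f - v * e);
}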
/**
 * Identify bends in the chain, where the kappa angle (virtual bond angle from
 * c-alpha i-2, to i, to i+2) is greater than 70 degrees
 * dssp-2.2.0/structure.cpp:1729
 */
static std::vector<int> calculate_bends(const float* xyz, const int* ca_indices, const int* chain_ids, const int n_residues, std::vector<int>& skip)
{
    __m128 prev_ca, this_ca, next_ca, u_prime, v_prime, u, v;
    float kappa;
    std::vector<int> is_bend(n_residues, 0);
    for (int i = 2; i < n_residues - 2; i++)
    {
        if (chain_ids[i-2] == chain_ids[i+2] && !skip[i-2] && !skip[i] && !skip[i+2])
        {
            prev_ca = load_float3(xyz + 3*ca_indices[i-2]);
            this_ca = load_float3(xyz + 3*ca_indices[i]);
            next_ca = load_float3(xyz + 3*ca_indices[i+2]);
            u_prime = _mm_sub_ps(prev_ca, this_ca);
            v_prime = _mm_sub_ps(this_ca, next_ca);

            /* normalize the vectors u_prime and v_prime */
            u = _mm_div_ps(u_prime, _mm_sqrt_ps(_mm_dp_ps2(u_prime, u_prime, 0x7F)));
            v = _mm_div_ps(v_prime, _mm_sqrt_ps(_mm_dp_ps2(v_prime, v_prime, 0x7F)));

            /* compute the arccos of the dot product. this gives the angle */
            kappa = (float) acos(CLIP(_mm_cvtss_f32(_mm_dp_ps2(u, v, 0x71)), -1, 1));
            is_bend[i] = kappa > (70 * (M_PI / 180.0));
        }
    }
    return is_bend;
}
/*
====================
idMD5Mesh::CalculateBounds
====================
*/
void idMD5Mesh::CalculateBounds( const idJointMat* entJoints, idBounds& bounds ) const
{
    __m128 minX = vector_float_posInfinity;
    __m128 minY = vector_float_posInfinity;
    __m128 minZ = vector_float_posInfinity;
    __m128 maxX = vector_float_negInfinity;
    __m128 maxY = vector_float_negInfinity;
    __m128 maxZ = vector_float_negInfinity;

    for( int i = 0; i < numMeshJoints; i++ )
    {
        const idJointMat& joint = entJoints[meshJoints[i]];
        __m128 x = _mm_load_ps( joint.ToFloatPtr() + 0 * 4 );
        __m128 y = _mm_load_ps( joint.ToFloatPtr() + 1 * 4 );
        __m128 z = _mm_load_ps( joint.ToFloatPtr() + 2 * 4 );
        minX = _mm_min_ps( minX, x );
        minY = _mm_min_ps( minY, y );
        minZ = _mm_min_ps( minZ, z );
        maxX = _mm_max_ps( maxX, x );
        maxY = _mm_max_ps( maxY, y );
        maxZ = _mm_max_ps( maxZ, z );
    }

    __m128 expand = _mm_splat_ps( _mm_load_ss( &maxJointVertDist ), 0 );

    minX = _mm_sub_ps( minX, expand );
    minY = _mm_sub_ps( minY, expand );
    minZ = _mm_sub_ps( minZ, expand );
    maxX = _mm_add_ps( maxX, expand );
    maxY = _mm_add_ps( maxY, expand );
    maxZ = _mm_add_ps( maxZ, expand );

    _mm_store_ss( bounds.ToFloatPtr() + 0, _mm_splat_ps( minX, 3 ) );
    _mm_store_ss( bounds.ToFloatPtr() + 1, _mm_splat_ps( minY, 3 ) );
    _mm_store_ss( bounds.ToFloatPtr() + 2, _mm_splat_ps( minZ, 3 ) );
    _mm_store_ss( bounds.ToFloatPtr() + 3, _mm_splat_ps( maxX, 3 ) );
    _mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
    _mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );
}
void nv_vector_sub(nv_matrix_t *vec0, int m0, const nv_matrix_t *vec1, int m1, const nv_matrix_t *vec2, int m2)
{
    NV_ASSERT(vec1->n == vec2->n);
    NV_ASSERT(vec2->n == vec0->n);
#if NV_ENABLE_AVX
    {
        __m256 x;
        int n;
        int pk_lp = (vec1->n & 0xfffffff8);

        for (n = 0; n < pk_lp; n += 8) {
            x = _mm256_load_ps(&NV_MAT_V(vec1, m1, n));
            x = _mm256_sub_ps(x, *(const __m256 *)&NV_MAT_V(vec2, m2, n));
            _mm256_store_ps(&NV_MAT_V(vec0, m0, n), x);
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) - NV_MAT_V(vec2, m2, n);
        }
    }
#elif NV_ENABLE_SSE2
    {
        int n;
        int pk_lp = (vec1->n & 0xfffffffc);

#ifdef _OPENMP
//#pragma omp parallel for
#endif
        for (n = 0; n < pk_lp; n += 4) {
            __m128 x = _mm_load_ps(&NV_MAT_V(vec1, m1, n));
            x = _mm_sub_ps(x, *(const __m128 *)&NV_MAT_V(vec2, m2, n));
            _mm_store_ps(&NV_MAT_V(vec0, m0, n), x);
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) - NV_MAT_V(vec2, m2, n);
        }
    }
#else
    {
        int n;
        for (n = 0; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) - NV_MAT_V(vec2, m2, n);
        }
    }
#endif
}
static void FilterFarSSE2(int num_partitions,
                          int x_fft_buf_block_pos,
                          float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
                          float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
                          float y_fft[2][PART_LEN1])
{
    int i;
    for (i = 0; i < num_partitions; i++) {
        int j;
        int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
        int pos = i * PART_LEN1;
        // Check for wrap
        if (i + x_fft_buf_block_pos >= num_partitions) {
            xPos -= num_partitions * (PART_LEN1);
        }

        // vectorized code (four at once)
        for (j = 0; j + 3 < PART_LEN1; j += 4) {
            const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]);
            const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]);
            const __m128 h_fft_buf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]);
            const __m128 h_fft_buf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]);
            const __m128 y_fft_re = _mm_loadu_ps(&y_fft[0][j]);
            const __m128 y_fft_im = _mm_loadu_ps(&y_fft[1][j]);
            const __m128 a = _mm_mul_ps(x_fft_buf_re, h_fft_buf_re);
            const __m128 b = _mm_mul_ps(x_fft_buf_im, h_fft_buf_im);
            const __m128 c = _mm_mul_ps(x_fft_buf_re, h_fft_buf_im);
            const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re);
            const __m128 e = _mm_sub_ps(a, b);
            const __m128 f = _mm_add_ps(c, d);
            const __m128 g = _mm_add_ps(y_fft_re, e);
            const __m128 h = _mm_add_ps(y_fft_im, f);
            _mm_storeu_ps(&y_fft[0][j], g);
            _mm_storeu_ps(&y_fft[1][j], h);
        }
        // scalar code for the remaining items.
        for (; j < PART_LEN1; j++) {
            y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                                 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
            y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
                                 h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
        }
    }
}
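/* The inner loop above is a complex multiply-accumulate per frequency bin.
 * Scalar reference (a sketch of ours, matching what MulRe/MulIm compute):
 * (xr + i*xi) * (hr + i*hi) = (xr*hr - xi*hi) + i*(xr*hi + xi*hr). */
static inline void cmac(float xr, float xi, float hr, float hi,
                        float *yr, float *yi)
{
    *yr += xr * hr - xi * hi;
    *yi += xr * hi + xi * hr;
}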
static inline __m128 lab_f_inv_m(const __m128 x)
{
    const __m128 epsilon = _mm_set1_ps(0.20689655172413796f); // cbrtf(216.0f/24389.0f);
    const __m128 kappa_rcp_x16 = _mm_set1_ps(16.0f * 27.0f / 24389.0f);
    const __m128 kappa_rcp_x116 = _mm_set1_ps(116.0f * 27.0f / 24389.0f);

    // x > epsilon
    const __m128 res_big = _mm_mul_ps(_mm_mul_ps(x, x), x);
    // x <= epsilon
    const __m128 res_small = _mm_sub_ps(_mm_mul_ps(kappa_rcp_x116, x), kappa_rcp_x16);

    // blend results according to whether each component is > epsilon or not
    const __m128 mask = _mm_cmpgt_ps(x, epsilon);
    return _mm_or_ps(_mm_and_ps(mask, res_big), _mm_andnot_ps(mask, res_small));
}
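/* Scalar sketch of the branch being vectorized above (ours, for clarity):
 * the CIE Lab f^-1 cubes its input above epsilon and otherwise uses the
 * linear segment (116*x - 16) / kappa, with kappa = 24389/27. */
static inline float lab_f_inv_scalar(float x)
{
    const float epsilon = 0.20689655172413796f; /* cbrtf(216.0f/24389.0f) */
    const float kappa_rcp = 27.0f / 24389.0f;   /* 1/kappa */
    return (x > epsilon) ? x * x * x : (116.0f * x - 16.0f) * kappa_rcp;
}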
static void negative_f32_sse_unroll2(float *dest, float *src1, int n)
{
    /* Initial operations to align the destination pointer */
    for (; ((long)dest & 15) && (n > 0); n--) {
        *dest++ = -(*src1++);
    }
    for (; n >= 8; n -= 8) {
        __m128 xmm0, xmm1;
        xmm0 = _mm_setzero_ps();
        xmm1 = _mm_loadu_ps(src1);
        xmm0 = _mm_sub_ps(xmm0, xmm1);
        _mm_store_ps(dest, xmm0);
        xmm0 = _mm_setzero_ps();
        xmm1 = _mm_loadu_ps(src1 + 4);
        xmm0 = _mm_sub_ps(xmm0, xmm1);
        _mm_store_ps(dest + 4, xmm0);
        dest += 8;
        src1 += 8;
    }
    for (; n > 0; n--) {
        *dest++ = -(*src1++);
    }
}
/**
 * SSE implementation of \c cnsFormula (subroutine of cnsResponse).
 * \c scale, \c gaussI2 and \c regVar are 32bit floats (gaussI2 as A and B).
 * \c sobelX, \c sobelY, \c gaussI are signed short.
 * \c result is a packed vector of signed 8bit numbers with the x and y component
 * alternating and \c offset (unsigned char) added.
 */
ALWAYSINLINE static void cnsFormula(__m128i& result, __m128i sobelX, __m128i sobelY, __m128i& gaussI,
                                    const __m128& gaussI2A, const __m128& gaussI2B,
                                    const __m128& scale, const __m128& regVar, __m128i offset)
{
    __m128 gaussIA = _mm_cvtepi32_ps(_mm_unpacklo_epi16(gaussI, _mm_setzero_si128()));
    __m128 gaussIB = _mm_cvtepi32_ps(_mm_unpackhi_epi16(gaussI, _mm_setzero_si128()));

    __m128 factorA = _mm_add_ps(_mm_sub_ps(gaussI2A, _mm_mul_ps(gaussIA, gaussIA)), regVar); // gaussI2-gaussI^2+regVar
    __m128 factorB = _mm_add_ps(_mm_sub_ps(gaussI2B, _mm_mul_ps(gaussIB, gaussIB)), regVar);
    factorA = _mm_mul_ps(_mm_rsqrt_ps(factorA), scale); // scale/sqrt(gaussI2-gaussI^2+regVar)
    factorB = _mm_mul_ps(_mm_rsqrt_ps(factorB), scale);

    // (2^-11)*sobelX*(scale/sqrt(gaussI2-gaussI^2+regVar))
    __m128i factor = _mm_packs_epi32(_mm_cvtps_epi32(factorA), _mm_cvtps_epi32(factorB));
    __m128i resultXepi16 = _mm_mulhi_epi16(_mm_slli_epi16(sobelX, 5), factor);
    __m128i resultYepi16 = _mm_mulhi_epi16(_mm_slli_epi16(sobelY, 5), factor);

    // Convert to 8bit and interleave X and Y.
    // The second argument of packs duplicates values to higher bytes, but these
    // are ignored later; unpacklo interleaves X and Y.
    __m128i resultepi8 = _mm_unpacklo_epi8(_mm_packs_epi16(resultXepi16, resultXepi16), _mm_packs_epi16(resultYepi16, resultYepi16));

    result = _mm_add_epi8(resultepi8, offset); // add offset, switching to epu8
}
il_vec4 il_vec4_sub(const il_vec4 a, const il_vec4 b, il_vec4 vec)
{
    if (!vec) {
        vec = il_vec4_new();
    }
#ifdef IL_SSE
    _mm_store_ps(vec, _mm_sub_ps(_mm_load_ps(a), _mm_load_ps(b)));
#else
    vec[0] = a[0] - b[0];
    vec[1] = a[1] - b[1];
    vec[2] = a[2] - b[2];
    vec[3] = a[3] - b[3];
#endif
    return vec;
}