/* Scalar sin/cos computed with an SSE2 cephes-style kernel.
   Requires #include <emmintrin.h> and #include <xmmintrin.h>; assumes a `pi` constant is in scope. */
static void SinCos(const float rad, float &sin, float &cos)
{
    const __m128 _ps_fopi = _mm_set1_ps(4.0f / pi);

    const __m128 _ps_0p5 = _mm_set1_ps(0.5f);
    const __m128 _ps_1 = _mm_set1_ps(1.0f);

    /* Extended-precision decomposition of pi/4; DP1 must be exactly representable
       (-0.78515625f = -201/256), otherwise the range reduction loses accuracy. */
    const __m128 _ps_dp1 = _mm_set1_ps(-0.78515625f);
    const __m128 _ps_dp2 = _mm_set1_ps(-2.4187564849853515625e-4f);
    const __m128 _ps_dp3 = _mm_set1_ps(-3.77489497744594108e-8f);

    const __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891e-4f);
    const __m128 _ps_sincof_p1 = _mm_set1_ps(8.3321608736e-3f);
    const __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611e-1f);
    const __m128 _ps_coscof_p0 = _mm_set1_ps(2.443315711809948e-5f);
    const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765e-3f);
    const __m128 _ps_coscof_p2 = _mm_set1_ps(4.166664568298827e-2f);

    const __m128i _pi32_1 = _mm_set1_epi32(1);
    const __m128i _pi32_i1 = _mm_set1_epi32(~1);
    const __m128i _pi32_2 = _mm_set1_epi32(2);
    const __m128i _pi32_4 = _mm_set1_epi32(4);

    const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
    const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

    __m128 mm1, mm2;
    __m128i mmi0, mmi2, mmi4;
    __m128 x, y, z;
    __m128 y1, y2;

    __m128 a = _mm_set1_ps(rad);

    x = _mm_and_ps(a, _mask_sign_inv);        /* |rad| */
    y = _mm_mul_ps(x, _ps_fopi);              /* scale by 4/pi */

    mmi2 = _mm_cvttps_epi32(y);               /* j = trunc(y); truncation keeps the reduced argument inside [-pi/4, pi/4] */
    mmi2 = _mm_add_epi32(mmi2, _pi32_1);      /* j = (j + 1) & ~1 */
    mmi2 = _mm_and_si128(mmi2, _pi32_i1);
    y = _mm_cvtepi32_ps(mmi2);
    mmi4 = mmi2;

    mmi0 = _mm_and_si128(mmi2, _pi32_4);
    mmi0 = _mm_slli_epi32(mmi0, 29);
    __m128 swap_sign_bit_sin = _mm_castsi128_ps(mmi0);

    mmi2 = _mm_and_si128(mmi2, _pi32_2);
    mmi2 = _mm_cmpeq_epi32(mmi2, _mm_setzero_si128());
    __m128 poly_mask = _mm_castsi128_ps(mmi2);

    x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp1));
    x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp2));
    x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp3));

    mmi4 = _mm_sub_epi32(mmi4, _pi32_2);
    mmi4 = _mm_andnot_si128(mmi4, _pi32_4);
    mmi4 = _mm_slli_epi32(mmi4, 29);
    __m128 sign_bit_cos = _mm_castsi128_ps(mmi4);

    __m128 sign_bit_sin = _mm_xor_ps(_mm_and_ps(a, _mask_sign_raw), swap_sign_bit_sin);

    z = _mm_mul_ps(x, x);

    y1 = _mm_mul_ps(_ps_coscof_p0, z);
    y1 = _mm_add_ps(y1, _ps_coscof_p1);
    y1 = _mm_mul_ps(y1, z);
    y1 = _mm_add_ps(y1, _ps_coscof_p2);
    y1 = _mm_mul_ps(y1, z);
    y1 = _mm_mul_ps(y1, z);
    y1 = _mm_sub_ps(y1, _mm_mul_ps(z, _ps_0p5));
    y1 = _mm_add_ps(y1, _ps_1);

    y2 = _mm_mul_ps(_ps_sincof_p0, z);
    y2 = _mm_add_ps(y2, _ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, _ps_sincof_p2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    __m128 sin1y = _mm_andnot_ps(poly_mask, y1);
    __m128 sin2y = _mm_and_ps(poly_mask, y2);

    mm1 = _mm_add_ps(sin1y, sin2y);
    mm2 = _mm_add_ps(_mm_sub_ps(y1, sin1y), _mm_sub_ps(y2, sin2y));

    sin = _mm_cvtss_f32(_mm_xor_ps(mm1, sign_bit_sin));
    cos = _mm_cvtss_f32(_mm_xor_ps(mm2, sign_bit_cos));
}
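/* A quick sanity harness for the kernel above: compare each output against libm.
   This is a sketch (assumes SinCos as defined here plus <cmath>/<cstdio>), not part
   of the original routine. */
#include <cmath>
#include <cstdio>

static void SinCosSelfTest()
{
    for (float r = -6.0f; r <= 6.0f; r += 0.25f) {
        float s, c;
        SinCos(r, s, c);
        std::printf("%+6.2f  sin err %.2e  cos err %.2e\n",
                    r, s - std::sin(r), c - std::cos(r));
    }
}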
/* motion templates */
CV_IMPL void
cvUpdateMotionHistory( const void* silhouette, void* mhimg,
                       double timestamp, double mhi_duration )
{
    CvMat silhstub, *silh = cvGetMat(silhouette, &silhstub);
    CvMat mhistub, *mhi = cvGetMat(mhimg, &mhistub);

    if( !CV_IS_MASK_ARR( silh ))
        CV_Error( CV_StsBadMask, "" );

    if( CV_MAT_TYPE( mhi->type ) != CV_32FC1 )
        CV_Error( CV_StsUnsupportedFormat, "" );

    if( !CV_ARE_SIZES_EQ( mhi, silh ))
        CV_Error( CV_StsUnmatchedSizes, "" );

    CvSize size = cvGetMatSize( mhi );
    int mhi_step = mhi->step;
    int silh_step = silh->step;

    if( CV_IS_MAT_CONT( mhi->type & silh->type ))
    {
        size.width *= size.height;
        mhi_step = silh_step = CV_STUB_STEP;
        size.height = 1;
    }

    float ts = (float)timestamp;
    float delbound = (float)(timestamp - mhi_duration);
    int x, y;
#if CV_SSE2
    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
#endif

    for( y = 0; y < size.height; y++ )
    {
        const uchar* silhData = silh->data.ptr + silh->step*y;
        float* mhiData = (float*)(mhi->data.ptr + mhi->step*y);
        x = 0;

#if CV_SSE2
        if( useSIMD )
        {
            __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound);
            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i z = _mm_setzero_si128();
                __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z);
                __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)),
                       s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z));
                __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4);
                __m128 fz = _mm_setzero_ps();

                v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4));
                v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4));

                __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz));
                __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz));

                v0 = _mm_xor_ps(v0, m0);
                v1 = _mm_xor_ps(v1, m1);

                _mm_storeu_ps(mhiData + x, v0);
                _mm_storeu_ps(mhiData + x + 4, v1);
            }
        }
#endif

        for( ; x < size.width; x++ )
        {
            float val = mhiData[x];
            val = silhData[x] ? ts : val < delbound ? 0 : val;
            mhiData[x] = val;
        }
    }
}
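/* The SIMD loop above never branches: `v & cmpge(v, db4)` zeroes lanes that have
   aged out, and the XOR/AND pair then overwrites lanes whose silhouette byte is
   nonzero with the new timestamp, using the select identity r = a ^ ((a ^ b) & mask).
   A minimal standalone demonstration of that identity (hypothetical values): */
#include <emmintrin.h>
#include <cstdio>

/* Branchless select: returns mask ? b : a (mask lanes must be all-ones or all-zero). */
static inline __m128 select_ps(__m128 mask, __m128 b, __m128 a)
{
    return _mm_xor_ps(a, _mm_and_ps(mask, _mm_xor_ps(a, b)));
}

int main()
{
    __m128 a = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
    __m128 b = _mm_set1_ps(9.f);
    __m128 mask = _mm_cmpgt_ps(a, _mm_set1_ps(2.5f));   /* selects lanes 2 and 3 */
    float r[4];
    _mm_storeu_ps(r, select_ps(mask, b, a));
    std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); /* 1 2 9 9 */
}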
void decomp_gamma3_minus( spinor_array src, halfspinor_array dst)
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  __m128 xmm6;
  __m128 xmm7;

  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);
  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);

  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);
  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);
  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);
  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);
#endif

  /* sub */
  xmm0 = _mm_sub_ps(xmm0, xmm3);
  xmm1 = _mm_sub_ps(xmm1, xmm4);
  xmm2 = _mm_sub_ps(xmm2, xmm5);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
}
/* Description: This routine performs an inverse FFT to real data.
 *              This code is for floating point data.
 *
 * Note: Output is BIT-REVERSED! so you must use the BitReversed to
 *       get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
 *                                 wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
 *       Input is in normal order, interleaved (real,imaginary) complex data
 *       You must call InitializeFFT(fftlen) first to initialize some buffers!
 *
 * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
 * - this can be done because both values will always be real only
 * - this allows us to not have to allocate an extra complex value for the Fs/2 bin
 *
 * Note: The scaling on this is done according to the standard FFT definition,
 *       so a unit amplitude DC signal will output an amplitude of (N)
 *       (Older revisions would progressively scale the input, so the output
 *       values would be similar in amplitude to the input values, which is
 *       good when using fixed point arithmetic)
 */
void InverseRealFFTf4x(fft_type *buffer, HFFT h)
{
   __m128 *localBuffer = (__m128 *)buffer;

   __m128 *A, *B;
   fft_type *sptr;
   __m128 *endptr1, *endptr2;
   int br1Index, br1Value;
   __m128 HRplus, HRminus, HIplus, HIminus;
   __m128 v1, v2, sin, cos;
   fft_type iToRad = 2 * M_PI / (2 * h->Points);
   int ButterfliesPerGroup = h->Points / 2;

   /* Massage input to get the input for a real output sequence. */
   A = localBuffer + 2;
   B = localBuffer + h->Points * 2 - 2;
   br1Index = 1; // h->BitReversed + 1;
   int iSinCosCalIndex = 0;
   while (A < B)
   {
      v4sfu sin4_2, cos4_2;
      if (useBitReverseTable)
      {
         br1Value = h->BitReversed[br1Index];
      }
      else
      {
         br1Value = SmallReverseBits(br1Index, h->pow2Bits);
      }
      if (useSinCosTable)
      {
         sin = _mm_set1_ps(h->SinTable[br1Value]);
         cos = _mm_set1_ps(h->SinTable[br1Value + 1]);
      }
      else
      {
         if (!iSinCosCalIndex)
         {
            v4sfu vx;
            for (int i = 0; i < 4; i++)
               vx.m128_f32[i] = ((float)(br1Index + i)) * iToRad;
            sincos_ps(&vx, &sin4_2, &cos4_2);
            sin = _mm_set1_ps(-sin4_2.m128_f32[0]);
            cos = _mm_set1_ps(-cos4_2.m128_f32[0]);
            iSinCosCalIndex++;
         }
         else
         {
            sin = _mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
            cos = _mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
            if (iSinCosCalIndex == 3)
               iSinCosCalIndex = 0;
            else
               iSinCosCalIndex++;
         }
      }

      HRminus = _mm_sub_ps(*A, *B);
      HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
      HIminus = _mm_sub_ps(*(A + 1), *(B + 1));
      HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B + 1), _mm_set1_ps(2.0)));
      v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
      *B = _mm_sub_ps(*A, v1);
      *(A + 1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2), _mm_set1_ps(0.5));
      *(B + 1) = _mm_sub_ps(*(A + 1), HIminus);

      A = &A[2];
      B = &B[-2];
      br1Index++;
   }

   /* Handle center bin (just need conjugate) */
   // negate sse style
   *(A + 1) = _mm_xor_ps(*(A + 1), _mm_set1_ps(-0.f));

   /* Handle DC bin separately - this ignores any Fs/2 component
      buffer[1]=buffer[0]=buffer[0]/2; */
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
   v2 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
   localBuffer[0] = v1;
   localBuffer[1] = v2;

   /*
    *  Butterfly:
    *     Ain-----Aout
    *         \ /
    *         / \
    *     Bin-----Bout
    */
   endptr1 = localBuffer + h->Points * 2;

   while (ButterfliesPerGroup > 0)
   {
      A = localBuffer;
      B = localBuffer + ButterfliesPerGroup * 2;
      sptr = h->SinTable;
      int iSinCosIndex = 0;
      int iSinCosCalIndex = 0;
      while (A < endptr1)
      {
         v4sfu sin4_2, cos4_2;
         if (useSinCosTable)
         {
            sin = _mm_set1_ps(*(sptr++));
            cos = _mm_set1_ps(*(sptr++));
         }
         else
         {
            if (!iSinCosCalIndex)
            {
               v4sfu vx;
               for (int i = 0; i < 4; i++)
                  vx.m128_f32[i] = ((fft_type)SmallReverseBits(iSinCosIndex + i, h->pow2Bits - 1)) * iToRad;
               sincos_ps(&vx, &sin4_2, &cos4_2);
               sin = _mm_set1_ps(-sin4_2.m128_f32[0]);
               cos = _mm_set1_ps(-cos4_2.m128_f32[0]);
               iSinCosCalIndex++;
            }
            else
            {
               sin = _mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
               cos = _mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
               if (iSinCosCalIndex == 3)
                  iSinCosCalIndex = 0;
               else
                  iSinCosCalIndex++;
            }
            iSinCosIndex++;
         }

         endptr2 = B;
         while (A < endptr2)
         {
            v1 = _mm_sub_ps(_mm_mul_ps(*B, cos), _mm_mul_ps(*(B + 1), sin));
            v2 = _mm_add_ps(_mm_mul_ps(*B, sin), _mm_mul_ps(*(B + 1), cos));
            *B = _mm_mul_ps(_mm_add_ps(*A, v1), _mm_set1_ps(0.5));
            *(A++) = _mm_sub_ps(*(B++), v1);
            *B = _mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
            *(A++) = _mm_sub_ps(*(B++), v2);
         }
         A = B;
         B = &B[ButterfliesPerGroup * 2];
      }
      ButterfliesPerGroup >>= 1;
   }
}
/*
 * Bitwise NOT operation for reals
 */
inline __m128 not_ps(const __m128 x)
{
    static const __m128i mask = _mm_set1_epi32(~0);
    return _mm_xor_ps(CAST_INT_TO_REAL_V(mask), x);
}
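/* There is no NOT instruction for float vectors, so XOR against an all-ones mask
   is the idiom: every bit of every lane is flipped. A self-contained sketch,
   assuming CAST_INT_TO_REAL_V is a wrapper around _mm_castsi128_ps: */
#include <emmintrin.h>

static inline __m128 not_ps_sketch(const __m128 x)
{
    const __m128 all_ones = _mm_castsi128_ps(_mm_set1_epi32(~0));
    return _mm_xor_ps(all_ones, x);   /* flips all 32 bits in each lane */
}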
inline vec4 operator!(vec4 a)
{
    const unsigned i = 0xFFFFFFFF;
    return _mm_xor_ps(_mm_set1_ps(*(float*)&i), a);
}
RETf XOR(const __m128 x, const __m128 y) { return _mm_xor_ps(x, y); }
inline GPR_t si_xori( GPR_t RA, int64_t IMM ) { return _mm_xor_ps( RA, _mm_castsi128_ps( _mm_set1_epi32((int32_t)IMM) ) ); }
inline GPR_t si_xor( GPR_t RA, GPR_t RB ) { return _mm_xor_ps( RA, RB ); }
void sincos_ps(__m128 x, __m128 *s, __m128 *c)
{
    __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
    __m128i emm0, emm2, emm4;

    sign_bit_sin = x;
    x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_sign_mask));
    sign_bit_sin = _mm_and_ps(sign_bit_sin, *reinterpret_cast<const __m128*>(_pi_sign_mask));

    y = _mm_mul_ps(x, *_ps_cephes_FOPI);

    emm2 = _mm_cvttps_epi32(y);
    emm2 = _mm_add_epi32(emm2, *_pi_1);
    emm2 = _mm_and_si128(emm2, *_pi_inv1);
    y = _mm_cvtepi32_ps(emm2);
    emm4 = emm2;

    emm0 = _mm_and_si128(emm2, *_pi_4);
    emm0 = _mm_slli_epi32(emm0, 29);
    __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

    emm2 = _mm_and_si128(emm2, *_pi_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    __m128 poly_mask = _mm_castsi128_ps(emm2);

    xmm1 = *_ps_minus_cephes_DP1;
    xmm2 = *_ps_minus_cephes_DP2;
    xmm3 = *_ps_minus_cephes_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    emm4 = _mm_sub_epi32(emm4, *_pi_2);
    emm4 = _mm_andnot_si128(emm4, *_pi_4);
    emm4 = _mm_slli_epi32(emm4, 29);
    __m128 sign_bit_cos = _mm_castsi128_ps(emm4);

    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    __m128 z = _mm_mul_ps(x, x);
    y = *_ps_coscof_p0;
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *_ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *_ps_coscof_p2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    __m128 tmp = _mm_mul_ps(z, *_ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *_ps_1);

    __m128 y2 = *_ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *_ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *_ps_sincof_p2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    xmm3 = poly_mask;
    __m128 ysin2 = _mm_and_ps(xmm3, y2);
    __m128 ysin1 = _mm_andnot_ps(xmm3, y);
    y2 = _mm_sub_ps(y2, ysin2);
    y = _mm_sub_ps(y, ysin1);
    xmm1 = _mm_add_ps(ysin1, ysin2);
    xmm2 = _mm_add_ps(y, y2);

    *s = _mm_xor_ps(xmm1, sign_bit_sin);
    *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
inline GPR_t si_xorhi( GPR_t RA, int64_t IMM ) { return _mm_xor_ps( RA, _mm_castsi128_ps( _mm_set1_epi16((int16_t)IMM) ) ); }
/* Zero a register with the classic `xorps reg,reg` idiom; x ^ x is all-zero bits
   regardless of the register's prior contents. The volatile qualifier keeps the
   compiler from flagging (or optimizing away) the deliberately uninitialized read. */
static inline __m128 gen_zero(void)
{
    volatile __m128 x;
    return _mm_xor_ps(x, x);
}
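/* The portable spelling of the same thing is _mm_setzero_ps(), which compilers
   already lower to an xorps of a register with itself -- a sketch: */
#include <xmmintrin.h>

static inline __m128 gen_zero_portable(void)
{
    return _mm_setzero_ps();   /* emits xorps xmm, xmm */
}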
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of
   them.. it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr)
{
   __m128 x = *((__m128 *)xptr), *s = (__m128 *)sptr, *c = (__m128 *)cptr;
   __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
   __m128i emm0, emm2, emm4;
#else
   __m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
   sign_bit_sin = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);
   emm4 = emm2;
   /* get the swap sign flag for the sine */
   emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
   /* get the polynom selection mask for the sine*/
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm3 = _mm_movehl_ps(xmm3, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm3);
   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
   y = _mm_cvtpi32x2_ps(mm2, mm3);
   mm4 = mm2;
   mm5 = mm3;
   /* get the swap sign flag for the sine */
   mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   __m128 swap_sign_bit_sin;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
   /* get the polynom selection mask for the sine */
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 poly_mask;
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
   emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
   emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
   emm4 = _mm_slli_epi32(emm4, 29);
   __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
   /* get the sign flag for the cosine */
   mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
   mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
   mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
   mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
   mm4 = _mm_slli_pi32(mm4, 29);
   mm5 = _mm_slli_pi32(mm5, 29);
   __m128 sign_bit_cos;
   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
   _mm_empty(); /* good-bye mmx */
#endif

   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

   /* Evaluate the first polynom (0 <= x <= Pi/4) */
   __m128 z = _mm_mul_ps(x, x);
   y = *(__m128*)_ps_coscof_p0;
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynoms */
   xmm3 = poly_mask;
   __m128 ysin2 = _mm_and_ps(xmm3, y2);
   __m128 ysin1 = _mm_andnot_ps(xmm3, y);
   y2 = _mm_sub_ps(y2, ysin2);
   y = _mm_sub_ps(y, ysin1);
   xmm1 = _mm_add_ps(ysin1, ysin2);
   xmm2 = _mm_add_ps(y, y2);

   /* update the sign */
   *s = _mm_xor_ps(xmm1, sign_bit_sin);
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
/* almost the same as sin_ps */
__m128 cos_ps(v4sfu *xPtr)
{ // any x
   __m128 x = *((__m128 *)xPtr);
   __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
   __m128i emm0, emm2;
#else
   __m64 mm0, mm1, mm2, mm3;
#endif
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in mm0 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);
   emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);
   /* get the swap sign flag */
   emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynom selection mask */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 sign_bit = _mm_castsi128_ps(emm0);
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm0:mm1 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);
   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
   y = _mm_cvtpi32x2_ps(mm2, mm3);
   mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);
   /* get the swap sign flag in mm0:mm1 and the
      polynom selection mask in mm2:mm3 */
   mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   _mm_empty(); /* good-bye mmx */
#endif

   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

   /* Evaluate the first polynom (0 <= x <= Pi/4) */
   y = *(__m128*)_ps_coscof_p0;
   __m128 z = _mm_mul_ps(x, x);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynoms */
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y, y2);

   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);

   return y;
}
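/* A spot check for cos_ps (a sketch; relies on the v4sfu union exposing its four
   floats as m128_f32, as the surrounding routines do): */
#include <cmath>
#include <cstdio>

static void cos_ps_check(void)
{
   v4sfu x, r;
   for (int i = 0; i < 4; i++)
      x.m128_f32[i] = 0.5f * (float)(i + 1);
   *(__m128 *)&r = cos_ps(&x);
   for (int i = 0; i < 4; i++)
      std::printf("cos(%g) err = %g\n", x.m128_f32[i],
                  r.m128_f32[i] - std::cos(x.m128_f32[i]));
}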
inline Quaternion conjugate(const Quaternion& q)
{
    return Quaternion(vec4(_mm_xor_ps(q.val.xmm, _mm_setr_ps(-0.f, -0.f, -0.f, 0.f))));
}
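/* XOR with -0.0f flips only the sign bit, so the mask above negates the x, y, z
   lanes and leaves w untouched -- exactly the quaternion conjugate. A standalone
   sketch of the idiom (hypothetical data, not tied to the Quaternion type): */
#include <xmmintrin.h>
#include <cstdio>

int main()
{
    __m128 q = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);            /* (x, y, z, w) */
    __m128 conj = _mm_xor_ps(q, _mm_setr_ps(-0.f, -0.f, -0.f, 0.f));
    float r[4];
    _mm_storeu_ps(r, conj);
    std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  /* -1 -2 -3 4 */
}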
inline GPR_t si_xorbi( GPR_t RA, int64_t IMM ) { return _mm_xor_ps( RA, _mm_castsi128_ps( _mm_set1_epi8((uint8_t)IMM) ) ); }
inline vec4 operator^(vec4 a, vec4 b) { return _mm_xor_ps(a, b); }
real ls_rmsd2_aligned_T_g(const int nrealatoms, const int npaddedatoms, const int rowstride,
                          const float* aT, const float* bT, const real G_a, const real G_b)
{
    /* Structure setup for this function:
     *
     *   structures are stored axis major, possibly with extra padding to ensure you
     *   meet two constraints:
     *       - the number of elements in a row must be a multiple of 4
     *       - the first element in each row must be aligned to a 16 byte boundary
     *
     *   note that if you meet the second condition for the first row, and meet the
     *   first condition, the alignment will automatically be satisfied for every row.
     *
     *   the layout in memory for a structure of 7 atoms would look like this:
     *
     *       x0 x1 x2 x3 x4 x5 x6 0
     *       y0 y1 y2 y3 y4 y5 y6 0
     *       z0 z1 z2 z3 z4 z5 z6 0
     *
     *   if your structure has a number of atoms that is not a multiple of 4, you must
     *   pad it out to a multiple of 4 using zeros (using anything other than zero will
     *   make the calculation go wrong).
     *
     * arguments:
     *   nrealatoms:   the *actual* number of atoms in the structure
     *
     *   npaddedatoms: the number of atoms in the structure including padding atoms;
     *                 should equal nrealatoms rounded up to the next multiple of 4
     *
     *   rowstride:    the offset in elements between rows in the arrays. will prob
     *                 be equal to npaddedatoms, but you might use something else if
     *                 (for example) you were subsetting the structure
     *
     *   aT:           pointer to start of first structure (A). should be aligned to
     *                 a 16-byte boundary
     *
     *   bT:           pointer to start of second structure (B). should be aligned to
     *                 a 16-byte boundary
     *
     *   G_a:          trace of A'A
     *
     *   G_b:          trace of B'B
     */
    /* printf ("# theo rmsd: nreal = %d npadded = %d rowstride = %d Ga = %f Gb = %f\n", */
    /*         nrealatoms, npaddedatoms, rowstride, G_a, G_b); */

    int nIndex;
    // Will have 3 garbage elements at the end
    float M[12] __attribute__ ((aligned (16)));

    const float* aTx = aT;
    const float* aTy = aT + rowstride;
    const float* aTz = aT + 2*rowstride;
    const float* bTx = bT;
    const float* bTy = bT + rowstride;
    const float* bTz = bT + 2*rowstride;

    // npaddedatoms must be a multiple of 4
    int niters = npaddedatoms >> 2;

    __m128 xx,xy,xz,yx,yy,yz,zx,zy,zz;
    __m128 ax,ay,az,b;
    __m128 t0,t1,t2;

    // Prologue
    xx = _mm_xor_ps(xx,xx);
    xy = _mm_xor_ps(xy,xy);
    xz = _mm_xor_ps(xz,xz);
    yx = _mm_xor_ps(yx,yx);
    yy = _mm_xor_ps(yy,yy);
    yz = _mm_xor_ps(yz,yz);
    zx = _mm_xor_ps(zx,zx);
    zy = _mm_xor_ps(zy,zy);
    zz = _mm_xor_ps(zz,zz);

    for (int k = 0; k < niters; k++)
    {
        ax = _mm_load_ps(aTx);
        ay = _mm_load_ps(aTy);
        az = _mm_load_ps(aTz);

        b = _mm_load_ps(bTx);
        t0 = ax;
        t1 = ay;
        t2 = az;
        t0 = _mm_mul_ps(t0,b);
        t1 = _mm_mul_ps(t1,b);
        t2 = _mm_mul_ps(t2,b);
        xx = _mm_add_ps(xx,t0);
        yx = _mm_add_ps(yx,t1);
        zx = _mm_add_ps(zx,t2);

        b = _mm_load_ps(bTy);
        t0 = ax;
        t1 = ay;
        t2 = az;
        t0 = _mm_mul_ps(t0,b);
        t1 = _mm_mul_ps(t1,b);
        t2 = _mm_mul_ps(t2,b);
        xy = _mm_add_ps(xy,t0);
        yy = _mm_add_ps(yy,t1);
        zy = _mm_add_ps(zy,t2);

        b = _mm_load_ps(bTz);
        ax = _mm_mul_ps(ax,b);
        ay = _mm_mul_ps(ay,b);
        az = _mm_mul_ps(az,b);
        xz = _mm_add_ps(xz,ax);
        yz = _mm_add_ps(yz,ay);
        zz = _mm_add_ps(zz,az);

        aTx += 4;
        aTy += 4;
        aTz += 4;
        bTx += 4;
        bTy += 4;
        bTz += 4;
    }

    // Epilogue - reduce 4 wide vectors to one wide
    /* xmm07 = xx0 xx1 xx2 xx3
       xmm08 = xy0 xy1 xy2 xy3
       xmm09 = xz0 xz1 xz2 xz3
       xmm10 = yx0 yx1 yx2 yx3
       xmm11 = yy0 yy1 yy2 yy3
       xmm12 = yz0 yz1 yz2 yz3
       xmm13 = zx0 zx1 zx2 zx3
       xmm14 = zy0 zy1 zy2 zy3
       xmm15 = zz0 zz1 zz2 zz3

       haddps xmm07 xmm08   xmm07 = xx0+1 xx2+3 xy0+1 xy2+3
       haddps xmm09 xmm10   xmm09 = xz0+1 xz2+3 yx0+1 yx2+3
       haddps xmm11 xmm12   xmm11 = yy0+1 yy2+3 yz0+1 yz2+3
       haddps xmm13 xmm14   xmm13 = zx0+1 zx2+3 zy0+1 zy2+3
       haddps xmm15 xmm14   xmm15 = zz0+1 zz2+3 zy0+1 zy2+3

       haddps xmm07 xmm09   xmm07 = xx0123 xy0123 xz0123 yx0123
       haddps xmm11 xmm13   xmm11 = yy0123 yz0123 zx0123 zy0123
       haddps xmm15 xmm09   xmm15 = zz0123 zy0123 xz0123 yx0123 */
#ifdef __SSE3__
    xx = _mm_hadd_ps(xx,xy);
    xz = _mm_hadd_ps(xz,yx);
    yy = _mm_hadd_ps(yy,yz);
    zx = _mm_hadd_ps(zx,zy);
    zz = _mm_hadd_ps(zz,zy);

    xx = _mm_hadd_ps(xx,xz);
    yy = _mm_hadd_ps(yy,zx);
    zz = _mm_hadd_ps(zz,xz);
#else
    // Emulate horizontal adds using UNPCKLPS/UNPCKHPS
    t0 = xx;
    t1 = xx;
    t0 = _mm_unpacklo_ps(t0,xz);   // = xx0 xz0 xx1 xz1
    t1 = _mm_unpackhi_ps(t1,xz);   // = xx2 xz2 xx3 xz3
    t0 = _mm_add_ps(t0,t1);        // = xx02 xz02 xx13 xz13

    t1 = xy;
    t2 = xy;
    t1 = _mm_unpacklo_ps(t1,yx);   // = xy0 yx0 xy1 yx1
    t2 = _mm_unpackhi_ps(t2,yx);   // = xy2 yx2 xy3 yx3
    t1 = _mm_add_ps(t1,t2);        // = xy02 yx02 xy13 yx13

    xx = t0;
    xx = _mm_unpacklo_ps(xx,t1);   // = xx02 xy02 xz02 yx02
    t0 = _mm_unpackhi_ps(t0,t1);   // = xx13 xy13 xz13 yx13
    xx = _mm_add_ps(xx,t0);        // = xx0123 xy0123 xz0123 yx0123

    t0 = yy;
    t1 = yy;
    t0 = _mm_unpacklo_ps(t0,zx);   // = yy0 zx0 yy1 zx1
    t1 = _mm_unpackhi_ps(t1,zx);   // = yy2 zx2 yy3 zx3
    t0 = _mm_add_ps(t0,t1);        // = yy02 zx02 yy13 zx13

    t1 = yz;
    t2 = yz;
    t1 = _mm_unpacklo_ps(t1,zy);   // = yz0 zy0 yz1 zy1
    t2 = _mm_unpackhi_ps(t2,zy);   // = yz2 zy2 yz3 zy3
    t1 = _mm_add_ps(t1,t2);        // = yz02 zy02 yz13 zy13

    yy = t0;
    yy = _mm_unpacklo_ps(yy,t1);   // = yy02 yz02 zx02 zy02
    t0 = _mm_unpackhi_ps(t0,t1);   // = yy13 yz13 zx13 zy13
    yy = _mm_add_ps(yy,t0);        // = yy0123 yz0123 zx0123 zy0123

    t1 = _mm_movehl_ps(t1,zz);     // = zz2 zz3 - -
    zz = _mm_add_ps(zz,t1);        // = zz02 zz13 - -
    t1 = _mm_shuffle_ps(zz,zz,_MM_SHUFFLE(1,1,1,1)); // = zz13 zz13 zz13 zz13
    zz = _mm_add_ps(zz,t1);        // = zz0123 zz1133 - -
#endif

    _mm_store_ps(M  , xx);
    _mm_store_ps(M+4, yy);
    _mm_store_ps(M+8, zz);

    return rmsd2FromMandG(M,G_a,G_b,nrealatoms);
}
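/* The epilogue above folds nine 4-wide accumulators into scalars. The core move,
   reducing a single __m128 to one sum, looks like this in isolation (a sketch;
   SSE3 for the hadd path): */
#include <pmmintrin.h>

static inline float hsum_ps(__m128 v)
{
    v = _mm_hadd_ps(v, v);   /* (a+b, c+d, a+b, c+d) */
    v = _mm_hadd_ps(v, v);   /* (a+b+c+d, ...)       */
    return _mm_cvtss_f32(v);
}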
__declspec(dllexport) DiffResult __stdcall diff_img(Image left, Image right, DiffOptions options)
{
    if (options.ignoreColor) {
        makeGreyscale(left);
        makeGreyscale(right);
    }

    float* imgMem = (float*)_aligned_malloc(left.width * left.height * sizeof(float) * 4, 16);
    int colorOffset = left.width * left.height;
    Image diff = { left.width, left.height, left.stride,
                   imgMem, imgMem + colorOffset, imgMem + colorOffset * 2, imgMem + colorOffset * 3 };

    float* drp = diff.r;  float* dgp = diff.g;  float* dbp = diff.b;  float* dap = diff.a;
    float* lrp = left.r;  float* lgp = left.g;  float* lbp = left.b;  float* lap = left.a;
    float* rrp = right.r; float* rgp = right.g; float* rbp = right.b; float* rap = right.a;

    Color error = ConvertToFloat(options.errorColor);
    auto er = _mm_set_ps1(error.r);
    auto eg = _mm_set_ps1(error.g);
    auto eb = _mm_set_ps1(error.b);
    auto ea = _mm_set_ps1(error.a);

    auto tolerance = _mm_set_ps1(options.tolerance);
    auto overlayTransparency = _mm_set_ps1(options.overlayTransparency);

    OverlayType overlayType = options.overlayType;
    byte weightByDiffPercentage = options.weightByDiffPercentage;

    auto diffPixelCount = _mm_set_epi32(0, 0, 0, 0);
    auto onei = _mm_set1_epi32(1);
    auto one = _mm_set1_ps(1);
    auto zero = _mm_set1_ps(0);

    for (int y = 0; y < left.height; y++) {
        for (int x = 0; x < left.width; x += 4) {
            auto lr = _mm_load_ps(lrp);
            auto lg = _mm_load_ps(lgp);
            auto lb = _mm_load_ps(lbp);
            auto la = _mm_load_ps(lap);

            auto rr = _mm_load_ps(rrp);
            auto rg = _mm_load_ps(rgp);
            auto rb = _mm_load_ps(rbp);
            auto ra = _mm_load_ps(rap);

            auto rdiff = _mm_sub_ps(rr, lr);
            auto gdiff = _mm_sub_ps(rg, lg);
            auto bdiff = _mm_sub_ps(rb, lb);
            auto adiff = _mm_sub_ps(ra, la);

            auto distance = _mm_mul_ps(rdiff, rdiff);
            distance = _mm_add_ps(distance, _mm_mul_ps(gdiff, gdiff));
            distance = _mm_add_ps(distance, _mm_mul_ps(bdiff, bdiff));
            distance = _mm_add_ps(distance, _mm_mul_ps(adiff, adiff));
            distance = _mm_sqrt_ps(distance);

            auto t = overlayTransparency;
            if (weightByDiffPercentage) {
                t = _mm_mul_ps(t, distance);
            }

            auto isdiff = _mm_cmpgt_ps(distance, tolerance);

            t = _mm_min_ps(one, _mm_max_ps(zero, t));

            auto mlr = rr;
            auto mlg = rg;
            auto mlb = rb;
            auto mla = ra;
            if (overlayType == OverlayType::Movement) {
                mlr = _mm_mul_ps(mlr, er);
                mlg = _mm_mul_ps(mlg, eg);
                mlb = _mm_mul_ps(mlb, eb);
                mla = _mm_mul_ps(mla, ea);
            }

            auto oneMinusT = _mm_sub_ps(one, t);
            auto mixedR = _mm_add_ps(_mm_mul_ps(mlr, oneMinusT), _mm_mul_ps(er, t));
            auto mixedG = _mm_add_ps(_mm_mul_ps(mlg, oneMinusT), _mm_mul_ps(eg, t));
            auto mixedB = _mm_add_ps(_mm_mul_ps(mlb, oneMinusT), _mm_mul_ps(eb, t));
            auto mixedA = one;
            if (overlayType != OverlayType::Movement) {
                mixedA = _mm_add_ps(_mm_mul_ps(mla, oneMinusT), _mm_mul_ps(ea, t));
            }

            // (((b ^ a) & mask)^a)
            auto dr = _mm_xor_ps(lr, _mm_and_ps(isdiff, _mm_xor_ps(mixedR, lr)));
            auto dg = _mm_xor_ps(lg, _mm_and_ps(isdiff, _mm_xor_ps(mixedG, lg)));
            auto db = _mm_xor_ps(lb, _mm_and_ps(isdiff, _mm_xor_ps(mixedB, lb)));
            auto da = _mm_xor_ps(la, _mm_and_ps(isdiff, _mm_xor_ps(mixedA, la)));

            diffPixelCount = _mm_xor_si128(diffPixelCount,
                _mm_and_si128(_mm_castps_si128(isdiff),
                    _mm_xor_si128(_mm_add_epi32(diffPixelCount, onei), diffPixelCount)));

            _mm_store_ps(drp, dr);
            _mm_store_ps(dgp, dg);
            _mm_store_ps(dbp, db);
            _mm_store_ps(dap, da);

            drp += 4; dgp += 4; dbp += 4; dap += 4;
            lrp += 4; lgp += 4; lbp += 4; lap += 4;
            rrp += 4; rgp += 4; rbp += 4; rap += 4;
        }
    }

    int* pixelCounts = (int*)_aligned_malloc(4 * sizeof(int), 16);
    _mm_store_si128((__m128i*)pixelCounts, diffPixelCount);
    int totalCount = pixelCounts[0] + pixelCounts[1] + pixelCounts[2] + pixelCounts[3];
    _aligned_free(pixelCounts);

    return { diff, 1.0f - float(totalCount) / (left.height * left.width - left.height * left.stride) };
}
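/* The same ((b ^ a) & mask) ^ a select used on the pixel channels also drives the
   branchless counter above: per lane, count+1 is taken only where the compare mask
   is set. A minimal integer sketch of that step (hypothetical mask values): */
#include <emmintrin.h>
#include <cstdio>

int main()
{
    __m128i count = _mm_setzero_si128();
    __m128i mask  = _mm_setr_epi32(-1, 0, -1, 0);  /* "diff" in lanes 0 and 2 */
    __m128i inc   = _mm_add_epi32(count, _mm_set1_epi32(1));
    count = _mm_xor_si128(count, _mm_and_si128(mask, _mm_xor_si128(inc, count)));
    int c[4];
    _mm_storeu_si128((__m128i*)c, count);
    std::printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);  /* 1 0 1 0 */
}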
void intrin_sse_mult_su3_mat_vec(su3_matrixf *aa, su3_vectorf* bb, su3_vectorf* cc)
{
    /* XMM Variables */
    __m128 xmm2, xmm3, xmm0, xmm1, xmm6, xmm7, xmm4, xmm5;

    xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&((bb)->c[0]) );
    xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((bb)->c[1]) );
    xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&((bb)->c[2]) );

    xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x44 );
    xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );
    xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0x44 );

    xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) );
    xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].real) );
    xmm3 = _mm_shuffle_ps( xmm3, xmm7, 0x00 );
    xmm4 = _mm_load_ss((float *)&((aa)->e[0][1].real) );
    xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].real) );
    xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
    xmm3 = _mm_mul_ps( xmm3, xmm0 );
    xmm4 = _mm_mul_ps( xmm4, xmm1 );
    xmm3 = _mm_add_ps( xmm3, xmm4 );
    xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].real) );
    xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) );
    xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
    xmm5 = _mm_mul_ps( xmm5, xmm2 );
    xmm3 = _mm_add_ps( xmm3, xmm5 );

    xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
    xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].real) );
    xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) );
    xmm6 = _mm_shuffle_ps( xmm6, xmm7, 0x00 );
    xmm6 = _mm_mul_ps( xmm6, xmm1 );

    xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xB1 );
    xmm0 = _mm_xor_ps( xmm0, _sse_sgn13.xmm );
    xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x11 );
    xmm1 = _mm_xor_ps( xmm1, _sse_sgn13.xmm );
    xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB1 );
    xmm2 = _mm_xor_ps( xmm2, _sse_sgn13.xmm );

    xmm4 = _mm_load_ss((float *)&((aa)->e[0][0].imag) );
    xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) );
    xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
    xmm4 = _mm_mul_ps( xmm4, xmm0 );
    xmm3 = _mm_add_ps( xmm3, xmm4 );
    xmm5 = _mm_load_ss((float *)&((aa)->e[0][1].imag) );
    xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) );
    xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
    xmm5 = _mm_mul_ps( xmm5, xmm1 );
    xmm3 = _mm_add_ps( xmm3, xmm5 );
    xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].imag) );
    xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) );
    xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
    xmm5 = _mm_mul_ps( xmm5, xmm2 );
    xmm3 = _mm_add_ps( xmm3, xmm5 );

    _mm_storeu_ps((float *)&((cc)->c[0]), xmm3 );

    xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
    xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) );
    xmm5 = _mm_load_ss((float *)&((aa)->e[2][1].imag) );
    xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
    xmm5 = _mm_mul_ps( xmm5, xmm1 );
    xmm6 = _mm_add_ps( xmm6, xmm5 );

    xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB4 );
    xmm2 = _mm_xor_ps( xmm2, _sse_sgn4.xmm );
    xmm7 = _mm_loadl_pi(xmm7, (__m64 *)&((aa)->e[2][2]) );
    xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x05 );
    xmm7 = _mm_mul_ps( xmm7, xmm2 );
    xmm6 = _mm_add_ps( xmm6, xmm7 );

    xmm7 = xmm6;
    xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0xEE );
    xmm6 = _mm_add_ps( xmm6, xmm7 );

    _mm_storel_pi((__m64 *)&((cc)->c[2]), xmm6 );
}
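/* The shuffle-then-XOR pairs above implement the +/- pattern of complex
   multiplication: swapping re/im with 0xB1 and sign-flipping alternate lanes
   (what the _sse_sgn13 constant does) produces the cross terms. A standalone
   sketch of the same idea for two interleaved complex numbers (re0,im0,re1,im1): */
#include <xmmintrin.h>

static inline __m128 cmul_ps(__m128 a, __m128 b)
{
    __m128 re = _mm_shuffle_ps(a, a, 0xA0);   /* (re0, re0, re1, re1) */
    __m128 im = _mm_shuffle_ps(a, a, 0xF5);   /* (im0, im0, im1, im1) */
    __m128 bs = _mm_shuffle_ps(b, b, 0xB1);   /* b with re/im swapped */
    __m128 t  = _mm_mul_ps(im, bs);
    /* flip lanes 0 and 2: (im*im', im*re') becomes (-im*im', +im*re') */
    t = _mm_xor_ps(t, _mm_setr_ps(-0.f, 0.f, -0.f, 0.f));
    return _mm_add_ps(_mm_mul_ps(re, b), t);
}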
void RealFFTf4x(fft_type *buffer, HFFT h)
{
   __m128 *localBuffer = (__m128 *)buffer;

   __m128 *A, *B;
   fft_type *sptr;
   __m128 *endptr1, *endptr2;
   int br1Index, br2Index;
   int br1Value, br2Value;
   __m128 HRplus, HRminus, HIplus, HIminus;
   __m128 v1, v2, sin, cos;
   fft_type iToRad = 2 * M_PI / (2 * h->Points);
   int ButterfliesPerGroup = h->Points / 2;

   /*
    *  Butterfly:
    *     Ain-----Aout
    *         \ /
    *         / \
    *     Bin-----Bout
    */
   endptr1 = &localBuffer[h->Points * 2];

   while (ButterfliesPerGroup > 0)
   {
      A = localBuffer;
      B = &localBuffer[ButterfliesPerGroup * 2];
      sptr = h->SinTable;
      int iSinCosIndex = 0;
      int iSinCosCalIndex = 0;

      while (A < endptr1)
      {
         v4sfu sin4_2, cos4_2;
         if (useSinCosTable)
         {
            sin = _mm_set1_ps(*(sptr++));
            cos = _mm_set1_ps(*(sptr++));
         }
         else
         {
            if (!iSinCosCalIndex)
            {
               v4sfu vx;
               for (int i = 0; i < 4; i++)
                  vx.m128_f32[i] = ((fft_type)SmallReverseBits(iSinCosIndex + i, h->pow2Bits - 1)) * iToRad;
               sincos_ps(&vx, &sin4_2, &cos4_2);
               sin = _mm_set1_ps(-sin4_2.m128_f32[0]);
               cos = _mm_set1_ps(-cos4_2.m128_f32[0]);
               iSinCosCalIndex++;
            }
            else
            {
               sin = _mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
               cos = _mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
               if (iSinCosCalIndex == 3)
                  iSinCosCalIndex = 0;
               else
                  iSinCosCalIndex++;
            }
            iSinCosIndex++;
         }

         endptr2 = B;
         while (A < endptr2)
         {
            v1 = _mm_add_ps(_mm_mul_ps(*B, cos), _mm_mul_ps(*(B + 1), sin));
            v2 = _mm_sub_ps(_mm_mul_ps(*B, sin), _mm_mul_ps(*(B + 1), cos));
            *B = _mm_add_ps(*A, v1);
            __m128 temp128 = _mm_set1_ps(2.0);
            *(A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
            *B = _mm_sub_ps(*A, v2);
            *(A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
         }
         A = B;
         B = &B[ButterfliesPerGroup * 2];
      }
      ButterfliesPerGroup >>= 1;
   }

   /* Massage output to get the output for a real input sequence. */
   br1Index = 1;             // h->BitReversed + 1;
   br2Index = h->Points - 1; // h->BitReversed + h->Points - 1;

   int iSinCosCalIndex = 0;
   while (br1Index < br2Index)
   {
      v4sfu sin4_2, cos4_2;
      if (useBitReverseTable)
      {
         br1Value = h->BitReversed[br1Index];
         br2Value = h->BitReversed[br2Index];
      }
      else
      {
         br1Value = SmallReverseBits(br1Index, h->pow2Bits);
         br2Value = SmallReverseBits(br2Index, h->pow2Bits);
      }
      if (useSinCosTable)
      {
         sin = _mm_set1_ps(h->SinTable[br1Value]);
         cos = _mm_set1_ps(h->SinTable[br1Value + 1]);
      }
      else
      {
         if (!iSinCosCalIndex)
         {
            v4sfu vx;
            for (int i = 0; i < 4; i++)
               vx.m128_f32[i] = ((float)(br1Index + i)) * iToRad;
            sincos_ps(&vx, &sin4_2, &cos4_2);
            sin = _mm_set1_ps(-sin4_2.m128_f32[0]);
            cos = _mm_set1_ps(-cos4_2.m128_f32[0]);
            iSinCosCalIndex++;
         }
         else
         {
            sin = _mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
            cos = _mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
            if (iSinCosCalIndex == 3)
               iSinCosCalIndex = 0;
            else
               iSinCosCalIndex++;
         }
      }

      A = &localBuffer[br1Value];
      B = &localBuffer[br2Value];
      __m128 temp128 = _mm_set1_ps(2.0);
      HRplus = _mm_add_ps(HRminus = _mm_sub_ps(*A, *B), _mm_mul_ps(*B, temp128));
      HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A + 1), *(B + 1)), _mm_mul_ps(*(B + 1), temp128));
      v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      temp128 = _mm_set1_ps(0.5);
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
      *B = _mm_sub_ps(*A, v1);
      *(A + 1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
      *(B + 1) = _mm_sub_ps(*(A + 1), HIminus);

      br1Index++;
      br2Index--;
   }

   /* Handle the center bin (just need a conjugate) */
   if (useBitReverseTable)
      A = &localBuffer[h->BitReversed[br1Index] + 1];
   else
      A = &localBuffer[SmallReverseBits(br1Index, h->pow2Bits) + 1];
   // negate sse style
   *A = _mm_xor_ps(*A, _mm_set1_ps(-0.f));

   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1 = _mm_sub_ps(localBuffer[0], localBuffer[1]);
   localBuffer[0] = _mm_add_ps(localBuffer[0], localBuffer[1]);
   localBuffer[1] = v1;
}
__m128 t4(void) { return _mm_xor_ps (magic_a,magic_b); }
/*
 * Bitwise NOT operation for integers
 */
inline __m128i not_si128(const __m128i x)
{
    static const __m128i mask = _mm_set1_epi32(~0);
    return CAST_REAL_TO_INT_V(_mm_xor_ps(CAST_INT_TO_REAL_V(mask), CAST_INT_TO_REAL_V(x)));
}
/** N/stage point generic N stage butterfly (in place, 2 register) */
static void mdct_butterfly_generic_sse(MDCTContext *mdct, FLOAT *x, int points, int trigint)
{
    float *T;
    float *x1 = x + points      - 8;
    float *x2 = x + (points>>1) - 8;

    switch (trigint)
    {
    default:
        T = mdct->trig;
        do {
            __m128 XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6;
            XMM0 = _mm_load_ps(x1  );
            XMM1 = _mm_load_ps(x2  );
            XMM2 = _mm_load_ps(x1+4);
            XMM3 = _mm_load_ps(x2+4);
            XMM4 = XMM0;
            XMM5 = XMM2;
            XMM0 = _mm_sub_ps(XMM0, XMM1);
            XMM2 = _mm_sub_ps(XMM2, XMM3);
            XMM4 = _mm_add_ps(XMM4, XMM1);
            XMM5 = _mm_add_ps(XMM5, XMM3);
            XMM1 = XMM0;
            XMM3 = XMM2;
            _mm_store_ps(x1  , XMM4);
            _mm_store_ps(x1+4, XMM5);
            XMM0 = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(3,3,1,1));
            XMM1 = _mm_shuffle_ps(XMM1, XMM1, _MM_SHUFFLE(2,2,0,0));
            XMM2 = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(3,3,1,1));
            XMM3 = _mm_shuffle_ps(XMM3, XMM3, _MM_SHUFFLE(2,2,0,0));
            XMM4 = _mm_load_ps(T+trigint*3);
            XMM5 = _mm_load_ps(T+trigint*3);
            XMM6 = _mm_load_ps(T+trigint*2);
            XMM1 = _mm_xor_ps(XMM1, PCS_RNRN.v);
            XMM4 = _mm_shuffle_ps(XMM4, XMM6, _MM_SHUFFLE(0,1,0,1));
            XMM5 = _mm_shuffle_ps(XMM5, XMM6, _MM_SHUFFLE(1,0,1,0));
            XMM0 = _mm_mul_ps(XMM0, XMM4);
            XMM1 = _mm_mul_ps(XMM1, XMM5);
            XMM4 = _mm_load_ps(T+trigint  );
            XMM5 = _mm_load_ps(T+trigint  );
            XMM6 = _mm_load_ps(T          );
            XMM3 = _mm_xor_ps(XMM3, PCS_RNRN.v);
            XMM4 = _mm_shuffle_ps(XMM4, XMM6, _MM_SHUFFLE(0,1,0,1));
            XMM5 = _mm_shuffle_ps(XMM5, XMM6, _MM_SHUFFLE(1,0,1,0));
            XMM2 = _mm_mul_ps(XMM2, XMM4);
            XMM3 = _mm_mul_ps(XMM3, XMM5);
            XMM0 = _mm_add_ps(XMM0, XMM1);
            XMM2 = _mm_add_ps(XMM2, XMM3);
            _mm_store_ps(x2  , XMM0);
            _mm_store_ps(x2+4, XMM2);
            T  += trigint*4;
            x1 -= 8;
            x2 -= 8;
        } while (x2 >= x);
        return;
    case  8: T = mdct->trig_butterfly_generic8;  break;
    case 16: T = mdct->trig_butterfly_generic16; break;
    case 32: T = mdct->trig_butterfly_generic32; break;
    case 64: T = mdct->trig_butterfly_generic64; break;
    }

    _mm_prefetch((char*)T, _MM_HINT_NTA);
    do {
        __m128 XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7;
        _mm_prefetch((char*)(T+16), _MM_HINT_NTA);
        XMM0 = _mm_load_ps(x1  );
        XMM1 = _mm_load_ps(x2  );
        XMM2 = _mm_load_ps(x1+4);
        XMM3 = _mm_load_ps(x2+4);
        XMM4 = XMM0;
        XMM5 = XMM2;
        XMM0 = _mm_sub_ps(XMM0, XMM1);
        XMM2 = _mm_sub_ps(XMM2, XMM3);
        XMM4 = _mm_add_ps(XMM4, XMM1);
        XMM5 = _mm_add_ps(XMM5, XMM3);
        XMM1 = XMM0;
        XMM3 = XMM2;
        XMM0 = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(3,3,1,1));
        XMM1 = _mm_shuffle_ps(XMM1, XMM1, _MM_SHUFFLE(2,2,0,0));
        _mm_store_ps(x1  , XMM4);
        _mm_store_ps(x1+4, XMM5);
        XMM2 = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(3,3,1,1));
        XMM3 = _mm_shuffle_ps(XMM3, XMM3, _MM_SHUFFLE(2,2,0,0));
        XMM4 = _mm_load_ps(T   );
        XMM5 = _mm_load_ps(T+ 4);
        XMM6 = _mm_load_ps(T+ 8);
        XMM7 = _mm_load_ps(T+12);
        XMM0 = _mm_mul_ps(XMM0, XMM4);
        XMM1 = _mm_mul_ps(XMM1, XMM5);
        XMM2 = _mm_mul_ps(XMM2, XMM6);
        XMM3 = _mm_mul_ps(XMM3, XMM7);
        XMM0 = _mm_add_ps(XMM0, XMM1);
        XMM2 = _mm_add_ps(XMM2, XMM3);
        _mm_store_ps(x2  , XMM0);
        _mm_store_ps(x2+4, XMM2);
        T  += 16;
        x1 -= 8;
        x2 -= 8;
    } while (x2 >= x);
}
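/* PCS_RNRN above is a constant whose lanes carry the sign bit in alternating
   positions, so a single XOR negates every other packed value -- the standard
   trick for the +/- halves of a complex butterfly. A sketch of the idiom
   (assuming the mask holds {-0.f, 0.f, -0.f, 0.f}): */
#include <xmmintrin.h>

static inline __m128 negate_alternate_lanes(__m128 v)
{
    const __m128 rnrn = _mm_setr_ps(-0.f, 0.f, -0.f, 0.f);
    return _mm_xor_ps(v, rnrn);   /* (-v0, v1, -v2, v3) */
}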
void cv::updateMotionHistory( InputArray _silhouette, InputOutputArray _mhi,
                              double timestamp, double duration )
{
    CV_Assert( _silhouette.type() == CV_8UC1 && _mhi.type() == CV_32FC1 );
    CV_Assert( _silhouette.sameSize(_mhi) );

    float ts = (float)timestamp;
    float delbound = (float)(timestamp - duration);

    CV_OCL_RUN(_mhi.isUMat() && _mhi.dims() <= 2,
               ocl_updateMotionHistory(_silhouette, _mhi, ts, delbound))

    Mat silh = _silhouette.getMat(), mhi = _mhi.getMat();
    Size size = silh.size();
#ifdef HAVE_IPP
    int silhstep = (int)silh.step, mhistep = (int)mhi.step;
#endif

    if( silh.isContinuous() && mhi.isContinuous() )
    {
        size.width *= size.height;
        size.height = 1;
#ifdef HAVE_IPP
        silhstep = (int)silh.total();
        mhistep = (int)mhi.total() * sizeof(Ipp32f);
#endif
    }

#ifdef HAVE_IPP
    IppStatus status = ippiUpdateMotionHistory_8u32f_C1IR((const Ipp8u *)silh.data, silhstep,
                                                          (Ipp32f *)mhi.data, mhistep,
                                                          ippiSize(size.width, size.height),
                                                          (Ipp32f)timestamp, (Ipp32f)duration);
    if (status >= 0)
        return;
#endif

#if CV_SSE2
    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
#endif
    for(int y = 0; y < size.height; y++ )
    {
        const uchar* silhData = silh.ptr<uchar>(y);
        float* mhiData = mhi.ptr<float>(y);
        int x = 0;

#if CV_SSE2
        if( useSIMD )
        {
            __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound);
            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i z = _mm_setzero_si128();
                __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z);
                __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)),
                       s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z));
                __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4);
                __m128 fz = _mm_setzero_ps();

                v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4));
                v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4));

                __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz));
                __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz));

                v0 = _mm_xor_ps(v0, m0);
                v1 = _mm_xor_ps(v1, m1);

                _mm_storeu_ps(mhiData + x, v0);
                _mm_storeu_ps(mhiData + x + 4, v1);
            }
        }
#endif

        for( ; x < size.width; x++ )
        {
            float val = mhiData[x];
            val = silhData[x] ? ts : val < delbound ? 0 : val;
            mhiData[x] = val;
        }
    }
}
static void mdct_bitreverse_sse(MDCTContext *mdct, FLOAT *x)
{
    int n = mdct->n;
    int *bit = mdct->bitrev;
    float *w0 = x;
    float *w1 = x = w0 + (n>>1);
    float *T = mdct->trig_bitreverse;

    do {
        float *x0 = x + bit[0];
        float *x1 = x + bit[1];
        float *x2 = x + bit[2];
        float *x3 = x + bit[3];
        __m128 XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7;

        w1 -= 4;

        /* _mm_lddqu_ps is not a standard intrinsic; presumably a project-local
           wrapper for an unaligned load (cf. _mm_loadu_ps). */
        XMM0 = _mm_lddqu_ps(x0);
        XMM1 = _mm_lddqu_ps(x1);
        XMM4 = _mm_lddqu_ps(x2);
        XMM7 = _mm_lddqu_ps(x3);
        XMM2 = XMM0;
        XMM3 = XMM1;
        XMM5 = XMM0;
        XMM6 = XMM1;
        XMM0 = _mm_shuffle_ps(XMM0, XMM4, _MM_SHUFFLE(0,1,0,1));
        XMM1 = _mm_shuffle_ps(XMM1, XMM7, _MM_SHUFFLE(0,1,0,1));
        XMM2 = _mm_shuffle_ps(XMM2, XMM4, _MM_SHUFFLE(0,0,0,0));
        XMM3 = _mm_shuffle_ps(XMM3, XMM7, _MM_SHUFFLE(0,0,0,0));
        XMM5 = _mm_shuffle_ps(XMM5, XMM4, _MM_SHUFFLE(1,1,1,1));
        XMM6 = _mm_shuffle_ps(XMM6, XMM7, _MM_SHUFFLE(1,1,1,1));
        XMM4 = _mm_load_ps(T  );
        XMM7 = _mm_load_ps(T+4);
        XMM1 = _mm_xor_ps(XMM1, PCS_RNRN.v);
        XMM2 = _mm_add_ps(XMM2, XMM3);
        XMM5 = _mm_sub_ps(XMM5, XMM6);
        XMM0 = _mm_add_ps(XMM0, XMM1);
        XMM2 = _mm_mul_ps(XMM2, XMM4);
        XMM5 = _mm_mul_ps(XMM5, XMM7);
        XMM0 = _mm_mul_ps(XMM0, PFV_0P5.v);
        XMM2 = _mm_add_ps(XMM2, XMM5);
        XMM1 = XMM0;
        XMM3 = XMM2;
        XMM1 = _mm_xor_ps(XMM1, PCS_RNRN.v);
        XMM3 = _mm_xor_ps(XMM3, PCS_RNRN.v);
        XMM0 = _mm_add_ps(XMM0, XMM2);
        XMM1 = _mm_sub_ps(XMM1, XMM3);
        _mm_store_ps(w0, XMM0);
        _mm_storeh_pi((__m64*)(w1  ), XMM1);
        _mm_storel_pi((__m64*)(w1+2), XMM1);

        T   += 8;
        bit += 4;
        w0  += 4;
    } while (w0 < w1);
}
void decomp_gamma0_minus( spinor_array src, halfspinor_array dst)
{
  /* c <-> color, s <-> spin */

  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;

  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);
  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);

  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);
  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

  /* Swap the lower components and multiply by -i */
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0x1b);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0x1b);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0x1b);

  xmm9  = _mm_xor_ps(xmm6, signs24.vector);
  xmm10 = _mm_xor_ps(xmm7, signs24.vector);
  xmm11 = _mm_xor_ps(xmm8, signs24.vector);

  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
}
btVector3 btConvexShape::localGetSupportVertexWithoutMarginNonVirtual (const btVector3& localDir) const
{
    switch (m_shapeType)
    {
    case SPHERE_SHAPE_PROXYTYPE:
    {
        return btVector3(0,0,0);
    }
    case BOX_SHAPE_PROXYTYPE:
    {
        btBoxShape* convexShape = (btBoxShape*)this;
        const btVector3& halfExtents = convexShape->getImplicitShapeDimensions();

#if defined( __APPLE__ ) && (defined( BT_USE_SSE )||defined( BT_USE_NEON ))
    #if defined( BT_USE_SSE )
        return btVector3( _mm_xor_ps( _mm_and_ps( localDir.mVec128, (__m128){-0.0f, -0.0f, -0.0f, -0.0f }), halfExtents.mVec128 ));
    #elif defined( BT_USE_NEON )
        return btVector3( (float32x4_t) (((uint32x4_t) localDir.mVec128 & (uint32x4_t){ 0x80000000, 0x80000000, 0x80000000, 0x80000000}) ^ (uint32x4_t) halfExtents.mVec128 ));
    #else
        #error unknown vector arch
    #endif
#else
        return btVector3(btFsels(localDir.x(), halfExtents.x(), -halfExtents.x()),
                         btFsels(localDir.y(), halfExtents.y(), -halfExtents.y()),
                         btFsels(localDir.z(), halfExtents.z(), -halfExtents.z()));
#endif
    }
    case TRIANGLE_SHAPE_PROXYTYPE:
    {
        btTriangleShape* triangleShape = (btTriangleShape*)this;
        btVector3 dir(localDir.getX(),localDir.getY(),localDir.getZ());
        btVector3* vertices = &triangleShape->m_vertices1[0];
        btVector3 dots = dir.dot3(vertices[0], vertices[1], vertices[2]);
        btVector3 sup = vertices[dots.maxAxis()];
        return btVector3(sup.getX(),sup.getY(),sup.getZ());
    }
    case CYLINDER_SHAPE_PROXYTYPE:
    {
        btCylinderShape* cylShape = (btCylinderShape*)this;
        //mapping of halfextents/dimension onto radius/height depends on how cylinder local orientation is (upAxis)
        btVector3 halfExtents = cylShape->getImplicitShapeDimensions();
        btVector3 v(localDir.getX(),localDir.getY(),localDir.getZ());
        int cylinderUpAxis = cylShape->getUpAxis();
        int XX(1),YY(0),ZZ(2);

        switch (cylinderUpAxis)
        {
        case 0:
        {
            XX = 1;
            YY = 0;
            ZZ = 2;
        }
        break;
        case 1:
        {
            XX = 0;
            YY = 1;
            ZZ = 2;
        }
        break;
        case 2:
        {
            XX = 0;
            YY = 2;
            ZZ = 1;
        }
        break;
        default:
            btAssert(0);
        break;
        };

        btScalar radius = halfExtents[XX];
        btScalar halfHeight = halfExtents[cylinderUpAxis];

        btVector3 tmp;
        btScalar d;

        btScalar s = btSqrt(v[XX] * v[XX] + v[ZZ] * v[ZZ]);
        if (s != btScalar(0.0))
        {
            d = radius / s;
            tmp[XX] = v[XX] * d;
            tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
            tmp[ZZ] = v[ZZ] * d;
            return btVector3(tmp.getX(),tmp.getY(),tmp.getZ());
        }
        else
        {
            tmp[XX] = radius;
            tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
            tmp[ZZ] = btScalar(0.0);
            return btVector3(tmp.getX(),tmp.getY(),tmp.getZ());
        }
    }
    case CAPSULE_SHAPE_PROXYTYPE:
    {
        btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());

        btCapsuleShape* capsuleShape = (btCapsuleShape*)this;
        btScalar halfHeight = capsuleShape->getHalfHeight();
        int capsuleUpAxis = capsuleShape->getUpAxis();

        btScalar radius = capsuleShape->getRadius();
        btVector3 supVec(0,0,0);

        btScalar maxDot(btScalar(-BT_LARGE_FLOAT));

        btVector3 vec = vec0;
        btScalar lenSqr = vec.length2();
        if (lenSqr < btScalar(0.0001))
        {
            vec.setValue(1,0,0);
        }
        else
        {
            btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
            vec *= rlen;
        }
        btVector3 vtx;
        btScalar newDot;
        {
            btVector3 pos(0,0,0);
            pos[capsuleUpAxis] = halfHeight;

            //vtx = pos +vec*(radius);
            vtx = pos +vec*(radius) - vec * capsuleShape->getMarginNV();
            newDot = vec.dot(vtx);
            if (newDot > maxDot)
            {
                maxDot = newDot;
                supVec = vtx;
            }
        }
        {
            btVector3 pos(0,0,0);
            pos[capsuleUpAxis] = -halfHeight;

            //vtx = pos +vec*(radius);
            vtx = pos +vec*(radius) - vec * capsuleShape->getMarginNV();
            newDot = vec.dot(vtx);
            if (newDot > maxDot)
            {
                maxDot = newDot;
                supVec = vtx;
            }
        }
        return btVector3(supVec.getX(),supVec.getY(),supVec.getZ());
    }
    case CONVEX_POINT_CLOUD_SHAPE_PROXYTYPE:
    {
        btConvexPointCloudShape* convexPointCloudShape = (btConvexPointCloudShape*)this;
        btVector3* points = convexPointCloudShape->getUnscaledPoints ();
        int numPoints = convexPointCloudShape->getNumPoints ();
        return convexHullSupport (localDir, points, numPoints,convexPointCloudShape->getLocalScalingNV());
    }
    case CONVEX_HULL_SHAPE_PROXYTYPE:
    {
        btConvexHullShape* convexHullShape = (btConvexHullShape*)this;
        btVector3* points = convexHullShape->getUnscaledPoints();
        int numPoints = convexHullShape->getNumPoints ();
        return convexHullSupport (localDir, points, numPoints,convexHullShape->getLocalScalingNV());
    }
    default:
#ifndef __SPU__
        return this->localGetSupportingVertexWithoutMargin (localDir);
#else
        btAssert (0);
#endif
    }

    // should never reach here
    btAssert (0);
    return btVector3 (btScalar(0.0f), btScalar(0.0f), btScalar(0.0f));
}
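/* The BOX case above copies the sign bit of each direction component onto the
   corresponding half-extent: (dir & signmask) ^ extent. A standalone sketch of
   that sign-copy idiom (hypothetical values): */
#include <xmmintrin.h>
#include <cstdio>

int main()
{
    __m128 dir     = _mm_setr_ps(0.3f, -2.0f, 5.0f, 0.f);
    __m128 extents = _mm_setr_ps(1.f, 2.f, 3.f, 0.f);
    __m128 signs   = _mm_and_ps(dir, _mm_setr_ps(-0.f, -0.f, -0.f, -0.f)); /* sign bits of dir */
    float r[4];
    _mm_storeu_ps(r, _mm_xor_ps(signs, extents));
    std::printf("%g %g %g\n", r[0], r[1], r[2]);  /* 1 -2 3 */
}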
void decomp_gamma2_plus( spinor_array src, halfspinor_array dst)
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;

  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);
  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);

  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);
  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);
  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);
  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);
#endif

  /* Swap the lower components */
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0xb1);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0xb1);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0xb1);

  xmm9  = _mm_xor_ps(xmm6, signs14.vector);
  xmm10 = _mm_xor_ps(xmm7, signs14.vector);
  xmm11 = _mm_xor_ps(xmm8, signs14.vector);

  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
}
void fht_SSE2(FLOAT * fz, int n)
{
    const FLOAT *tri = costab;
    int k4;
    FLOAT *fi, *gi;
    FLOAT const *fn;

    n <<= 1;            /* to get BLKSIZE, because of 3DNow! ASM routine */
    fn = fz + n;
    k4 = 4;
    do {
        FLOAT s1, c1;
        int i, k1, k2, k3, kx;
        kx = k4 >> 1;
        k1 = k4;
        k2 = k4 << 1;
        k3 = k2 + k1;
        k4 = k2 << 1;
        fi = fz;
        gi = fi + kx;
        do {
            FLOAT f0, f1, f2, f3;
            f1 = fi[0] - fi[k1];
            f0 = fi[0] + fi[k1];
            f3 = fi[k2] - fi[k3];
            f2 = fi[k2] + fi[k3];
            fi[k2] = f0 - f2;
            fi[0] = f0 + f2;
            fi[k3] = f1 - f3;
            fi[k1] = f1 + f3;
            f1 = gi[0] - gi[k1];
            f0 = gi[0] + gi[k1];
            f3 = SQRT2 * gi[k3];
            f2 = SQRT2 * gi[k2];
            gi[k2] = f0 - f2;
            gi[0] = f0 + f2;
            gi[k3] = f1 - f3;
            gi[k1] = f1 + f3;
            gi += k4;
            fi += k4;
        } while (fi < fn);
        c1 = tri[0];
        s1 = tri[1];
        for (i = 1; i < kx; i++) {
            __m128 v_s2;
            __m128 v_c2;
            __m128 v_c1;
            __m128 v_s1;
            FLOAT c2, s2, s1_2 = s1 + s1;
            c2 = 1 - s1_2 * s1;
            s2 = s1_2 * c1;
            fi = fz + i;
            gi = fz + k1 - i;
            v_c1 = _mm_set_ps1(c1);
            v_s1 = _mm_set_ps1(s1);
            v_c2 = _mm_set_ps1(c2);
            v_s2 = _mm_set_ps1(s2);
            {
                static const vecfloat_union sign_mask = {{0x80000000, 0, 0, 0}};
                v_c1 = _mm_xor_ps(sign_mask._m128, v_c1); /* v_c1 := {-c1, +c1, +c1, +c1} */
            }
            {
                static const vecfloat_union sign_mask = {{0, 0x80000000, 0, 0}};
                v_s1 = _mm_xor_ps(sign_mask._m128, v_s1); /* v_s1 := {+s1, -s1, +s1, +s1} */
            }
            {
                static const vecfloat_union sign_mask = {{0, 0, 0x80000000, 0x80000000}};
                v_c2 = _mm_xor_ps(sign_mask._m128, v_c2); /* v_c2 := {+c2, +c2, -c2, -c2} */
            }
            do {
                __m128 p, q, r;
                q = _mm_setr_ps(fi[k1], fi[k3], gi[k1], gi[k3]);   /* Q := {fi_k1,fi_k3,gi_k1,gi_k3}*/
                p = _mm_mul_ps(_mm_set_ps1(s2), q);                /* P := s2 * Q */
                q = _mm_mul_ps(v_c2, q);                           /* Q := c2 * Q */
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1, 0, 3, 2)); /* Q := {-c2*gi_k1,-c2*gi_k3,c2*fi_k1,c2*fi_k3} */
                p = _mm_add_ps(p, q);

                r = _mm_setr_ps(gi[0], gi[k2], fi[0], fi[k2]);     /* R := {gi_0,gi_k2,fi_0,fi_k2} */
                q = _mm_sub_ps(r, p);                              /* Q := {gi_0-p0,gi_k2-p1,fi_0-p2,fi_k2-p3} */
                r = _mm_add_ps(r, p);                              /* R := {gi_0+p0,gi_k2+p1,fi_0+p2,fi_k2+p3} */
                p = _mm_shuffle_ps(q, r, _MM_SHUFFLE(2, 0, 2, 0)); /* P := {q0,q2,r0,r2} */
                p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 1, 2, 0)); /* P := {q0,r0,q2,r2} */
                q = _mm_shuffle_ps(q, r, _MM_SHUFFLE(3, 1, 3, 1)); /* Q := {q1,q3,r1,r3} */
                r = _mm_mul_ps(v_c1, q);
                q = _mm_mul_ps(v_s1, q);
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0, 1, 2, 3)); /* Q := {q3,q2,q1,q0} */
                q = _mm_add_ps(q, r);

                store4(_mm_sub_ps(p, q), &gi[k3], &gi[k2], &fi[k3], &fi[k2]);
                store4(_mm_add_ps(p, q), &gi[k1], &gi[0], &fi[k1], &fi[0]);

                gi += k4;
                fi += k4;
            } while (fi < fn);
            c2 = c1;
            c1 = c2 * tri[0] - s1 * tri[1];
            s1 = c2 * tri[1] + s1 * tri[0];
        }
        tri += 2;
    } while (k4 < n);
}
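/* fht_SSE2 builds per-lane sign masks through a float/int union so that a single
   XOR gives each lane its own +/- pattern. A compact sketch of that pattern
   (the union layout mirrors the vecfloat_union used above): */
#include <xmmintrin.h>

typedef union {
    unsigned int i[4];
    __m128 m;
} sign_union;

static inline __m128 flip_lane0(__m128 v)
{
    static const sign_union mask = {{0x80000000u, 0, 0, 0}};
    return _mm_xor_ps(mask.m, v);   /* (-v0, v1, v2, v3) */
}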