__m128 test_loadl_pi(__m128 x, void* y) {
  // CHECK: define {{.*}} @test_loadl_pi
  // CHECK: load <2 x float>* {{.*}}, align 1{{$}}
  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
  // CHECK: shufflevector {{.*}} <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  return _mm_loadl_pi(x,y);
}
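For reference, a minimal self-contained sketch of the `_mm_loadl_pi`/`_mm_storel_pi` semantics used throughout the snippets below: they load or store only the low 64 bits (two packed floats) and leave the upper two lanes of the register untouched. The buffers and values here are purely illustrative.

#include <stdio.h>
#include <xmmintrin.h>

int main(void)
{
    float upper[4] = { 1.f, 2.f, 3.f, 4.f };   /* illustrative data */
    float pair[2]  = { 10.f, 20.f };
    float out[4];

    __m128 v = _mm_loadu_ps(upper);                /* v = {1, 2, 3, 4} */
    v = _mm_loadl_pi(v, (const __m64 *)pair);      /* replace low two lanes only: v = {10, 20, 3, 4} */
    _mm_storeu_ps(out, v);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* prints: 10 20 3 4 */

    _mm_storel_pi((__m64 *)pair, _mm_loadu_ps(upper));        /* writes only two floats: pair = {1, 2} */
    printf("%g %g\n", pair[0], pair[1]);                      /* prints: 1 2 */
    return 0;
}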
void intrin_sse_add_su3_vector(su3_vectorf *aa, su3_vectorf *bb, su3_vectorf *cc)
{
  /* XMM Variables */
  __m128 xmm2, xmm3, xmm0, xmm1;

  xmm0 = _mm_loadu_ps((float *)&((aa)->c[0]) );
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((aa)->c[2]) );
  xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );
  xmm2 = _mm_loadu_ps((float *)&((bb)->c[0]) );
  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&((bb)->c[2]) );
  xmm3 = _mm_shuffle_ps( xmm3, xmm3, 0x44 );
  xmm0 = _mm_add_ps( xmm0, xmm2 );
  xmm1 = _mm_add_ps( xmm1, xmm3 );
  _mm_storeu_ps((float *)&((cc)->c[0]), xmm0 );
  _mm_storel_pi((__m64 *)&((cc)->c[2]), xmm1 );
}
void intrin_sse_scalar_mult_add_su3_vector(su3_vectorf* aa, su3_vectorf* bb, float cc, su3_vectorf* dd)
{
  /* XMM Variables */
  __m128 xmm2, xmm3, xmm0, xmm1, xmm4;

  xmm4 = _mm_load_ss((float *)&((cc)) );
  xmm4 = _mm_shuffle_ps( xmm4, xmm4, 0x00 );
  xmm0 = _mm_loadu_ps((float *)&((aa)->c[0]) );
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((aa)->c[2]) );
  xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );
  xmm2 = _mm_loadu_ps((float *)&((bb)->c[0]) );
  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&((bb)->c[2]) );
  xmm3 = _mm_shuffle_ps( xmm3, xmm3, 0x44 );
  xmm2 = _mm_mul_ps( xmm2, xmm4 );
  xmm3 = _mm_mul_ps( xmm3, xmm4 );
  xmm0 = _mm_add_ps( xmm0, xmm2 );
  xmm1 = _mm_add_ps( xmm1, xmm3 );
  _mm_storeu_ps((float *)&((dd)->c[0]), xmm0 );
  _mm_storel_pi((__m64 *)&((dd)->c[2]), xmm1 );
}
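For reference, the arithmetic the routine above performs, written as plain scalar C. This is only a sketch: it assumes su3_vectorf holds three complex elements c[0..2] with float real/imag members, mirroring the field names used for su3_matrixf further below.

/* Scalar sketch of intrin_sse_scalar_mult_add_su3_vector: dd = aa + cc*bb.
   Assumes su3_vectorf::c[i] exposes .real/.imag floats (hypothetical layout). */
static void scalar_mult_add_su3_vector_ref(const su3_vectorf *aa, const su3_vectorf *bb,
                                           float cc, su3_vectorf *dd)
{
    for (int i = 0; i < 3; i++) {
        dd->c[i].real = aa->c[i].real + cc * bb->c[i].real;
        dd->c[i].imag = aa->c[i].imag + cc * bb->c[i].imag;
    }
}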
BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    const __m128 one = _mm_set_ss(1.f);
    __m128 minPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), one));
    __m128 maxPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), one));
    __m128 centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
    __m128 halfSize = _mm_sub_ps(centerPoint, minPt);
    __m128 m0 = _mm_loadu_ps(&transform.m00_);
    __m128 m1 = _mm_loadu_ps(&transform.m10_);
    __m128 m2 = _mm_loadu_ps(&transform.m20_);

    __m128 r0 = _mm_mul_ps(m0, centerPoint);
    __m128 r1 = _mm_mul_ps(m1, centerPoint);
    __m128 t0 = _mm_add_ps(_mm_unpacklo_ps(r0, r1), _mm_unpackhi_ps(r0, r1));
    __m128 r2 = _mm_mul_ps(m2, centerPoint);
    const __m128 zero = _mm_setzero_ps();
    __m128 t2 = _mm_add_ps(_mm_unpacklo_ps(r2, zero), _mm_unpackhi_ps(r2, zero));
    __m128 newCenter = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));

    const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 x = _mm_and_ps(absMask, _mm_mul_ps(m0, halfSize));
    __m128 y = _mm_and_ps(absMask, _mm_mul_ps(m1, halfSize));
    __m128 z = _mm_and_ps(absMask, _mm_mul_ps(m2, halfSize));
    t0 = _mm_add_ps(_mm_unpacklo_ps(x, y), _mm_unpackhi_ps(x, y));
    t2 = _mm_add_ps(_mm_unpacklo_ps(z, zero), _mm_unpackhi_ps(z, zero));
    __m128 newDir = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));

    return BoundingBox(_mm_sub_ps(newCenter, newDir), _mm_add_ps(newCenter, newDir));
#else
    Vector3 newCenter = transform * Center();
    Vector3 oldEdge = Size() * 0.5f;
    Vector3 newEdge = Vector3(
        Abs(transform.m00_) * oldEdge.x_ + Abs(transform.m01_) * oldEdge.y_ + Abs(transform.m02_) * oldEdge.z_,
        Abs(transform.m10_) * oldEdge.x_ + Abs(transform.m11_) * oldEdge.y_ + Abs(transform.m12_) * oldEdge.z_,
        Abs(transform.m20_) * oldEdge.x_ + Abs(transform.m21_) * oldEdge.y_ + Abs(transform.m22_) * oldEdge.z_
    );

    return BoundingBox(newCenter - newEdge, newCenter + newEdge);
#endif
}
void decomp_gamma3_plus( spinor_array src, halfspinor_array dst)
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  __m128 xmm6;
  __m128 xmm7;

  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);
  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);

  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);
  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);
  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);
  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);
#endif

  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm3);
  xmm1 = _mm_add_ps(xmm1, xmm4);
  xmm2 = _mm_add_ps(xmm2, xmm5);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
}
// Calculates bounding rectagnle of a point set or retrieves already calculated static Rect pointSetBoundingRect( const Mat& points ) { int npoints = points.checkVector(2); int depth = points.depth(); CV_Assert(npoints >= 0 && (depth == CV_32F || depth == CV_32S)); int xmin = 0, ymin = 0, xmax = -1, ymax = -1, i; bool is_float = depth == CV_32F; if( npoints == 0 ) return Rect(); const Point* pts = (const Point*)points.data; Point pt = pts[0]; #if CV_SSE4_2 if(cv::checkHardwareSupport(CV_CPU_SSE4_2)) { if( !is_float ) { __m128i minval, maxval; minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y for( i = 1; i < npoints; i++ ) { __m128i ptXY = _mm_loadl_epi64((const __m128i*)&pts[i]); minval = _mm_min_epi32(ptXY, minval); maxval = _mm_max_epi32(ptXY, maxval); } xmin = _mm_cvtsi128_si32(minval); ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4)); xmax = _mm_cvtsi128_si32(maxval); ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4)); } else { __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps(); minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt)); for( i = 1; i < npoints; i++ ) { ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]); minvalf = _mm_min_ps(minvalf, ptXY); maxvalf = _mm_max_ps(maxvalf, ptXY); } float xyminf[2], xymaxf[2]; _mm_storel_pi((__m64*)xyminf, minvalf); _mm_storel_pi((__m64*)xymaxf, maxvalf); xmin = cvFloor(xyminf[0]); ymin = cvFloor(xyminf[1]); xmax = cvFloor(xymaxf[0]); ymax = cvFloor(xymaxf[1]); } } else #endif { if( !is_float ) { xmin = xmax = pt.x; ymin = ymax = pt.y; for( i = 1; i < npoints; i++ ) { pt = pts[i]; if( xmin > pt.x ) xmin = pt.x; if( xmax < pt.x ) xmax = pt.x; if( ymin > pt.y ) ymin = pt.y; if( ymax < pt.y ) ymax = pt.y; } } else { Cv32suf v; // init values xmin = xmax = CV_TOGGLE_FLT(pt.x); ymin = ymax = CV_TOGGLE_FLT(pt.y); for( i = 1; i < npoints; i++ ) { pt = pts[i]; pt.x = CV_TOGGLE_FLT(pt.x); pt.y = CV_TOGGLE_FLT(pt.y); if( xmin > pt.x ) xmin = pt.x; if( xmax < pt.x ) xmax = pt.x; if( ymin > pt.y ) ymin = pt.y; if( ymax < pt.y ) ymax = pt.y; } v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); // because right and bottom sides of the bounding rectangle are not inclusive // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); } } return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1); }
void decomp_gamma2_plus( spinor_array src, halfspinor_array dst)
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;

  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);
  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);

  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);
  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);
  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);
  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);
#endif

  /* Swap the lower components */
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0xb1);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0xb1);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0xb1);

  xmm9 = _mm_xor_ps(xmm6, signs14.vector);
  xmm10 = _mm_xor_ps(xmm7, signs14.vector);
  xmm11 = _mm_xor_ps(xmm8, signs14.vector);

  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
}
void intrin_sse_mult_su3_mat_vec(su3_matrixf *aa, su3_vectorf* bb, su3_vectorf* cc) { /* XMM Variables */ __m128 xmm2, xmm3, xmm0, xmm1, xmm6, xmm7, xmm4, xmm5; xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&((bb)->c[0]) ); xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((bb)->c[1]) ); xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&((bb)->c[2]) ); xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x44 ); xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 ); xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0x44 ); xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].real) ); xmm3 = _mm_shuffle_ps( xmm3, xmm7, 0x00 ); xmm4 = _mm_load_ss((float *)&((aa)->e[0][1].real) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].real) ); xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 ); xmm3 = _mm_mul_ps( xmm3, xmm0 ); xmm4 = _mm_mul_ps( xmm4, xmm1 ); xmm3 = _mm_add_ps( xmm3, xmm4 ); xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].real) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) ); xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 ); xmm5 = _mm_mul_ps( xmm5, xmm2 ); xmm3 = _mm_add_ps( xmm3, xmm5 ); xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 ); xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].real) ); xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) ); xmm6 = _mm_shuffle_ps( xmm6, xmm7, 0x00 ); xmm6 = _mm_mul_ps( xmm6, xmm1 ); xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xB1 ); xmm0 = _mm_xor_ps( xmm0, _sse_sgn13.xmm ); xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x11 ); xmm1 = _mm_xor_ps( xmm1, _sse_sgn13.xmm ); xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB1 ); xmm2 = _mm_xor_ps( xmm2, _sse_sgn13.xmm ); xmm4 = _mm_load_ss((float *)&((aa)->e[0][0].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) ); xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 ); xmm4 = _mm_mul_ps( xmm4, xmm0 ); xmm3 = _mm_add_ps( xmm3, xmm4 ); xmm5 = _mm_load_ss((float *)&((aa)->e[0][1].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) ); xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 ); xmm5 = _mm_mul_ps( xmm5, xmm1 ); xmm3 = _mm_add_ps( xmm3, xmm5 ); xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].imag) ); xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) ); xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 ); xmm5 = _mm_mul_ps( xmm5, xmm2 ); xmm3 = _mm_add_ps( xmm3, xmm5 ); _mm_storeu_ps((float *)&((cc)->c[0]), xmm3 ); xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 ); xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) ); xmm5 = _mm_load_ss((float *)&((aa)->e[2][1].imag) ); xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 ); xmm5 = _mm_mul_ps( xmm5, xmm1 ); xmm6 = _mm_add_ps( xmm6, xmm5 ); xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB4 ); xmm2 = _mm_xor_ps( xmm2, _sse_sgn4.xmm ); xmm7 = _mm_loadl_pi(xmm7, (__m64 *)&((aa)->e[2][2]) ); xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x05 ); xmm7 = _mm_mul_ps( xmm7, xmm2 ); xmm6 = _mm_add_ps( xmm6, xmm7 ); xmm7 = xmm6 ; xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0xEE ); xmm6 = _mm_add_ps( xmm6, xmm7 ); _mm_storel_pi((__m64 *)&((cc)->c[2]), xmm6 ); }
static inline int sacIsSampleDegenerate(PROSAC_HEST* p){ unsigned i0 = p->smpl[0], i1 = p->smpl[1], i2 = p->smpl[2], i3 = p->smpl[3]; /** * Pack the matches selected by the SAC algorithm. * Must be packed points[0:7] = {srcx0, srcy0, srcx1, srcy1, srcx2, srcy2, srcx3, srcy3} * points[8:15] = {dstx0, dsty0, dstx1, dsty1, dstx2, dsty2, dstx3, dsty3} * Gather 4 points into the vector */ __m128 src10 = _mm_loadl_pi(src10, (__m64*)&p->src[i0]); src10 = _mm_loadh_pi(src10, (__m64*)&p->src[i1]); __m128 src32 = _mm_loadl_pi(src32, (__m64*)&p->src[i2]); src32 = _mm_loadh_pi(src32, (__m64*)&p->src[i3]); __m128 dst10 = _mm_loadl_pi(dst10, (__m64*)&p->dst[i0]); dst10 = _mm_loadh_pi(dst10, (__m64*)&p->dst[i1]); __m128 dst32 = _mm_loadl_pi(dst32, (__m64*)&p->dst[i2]); dst32 = _mm_loadh_pi(dst32, (__m64*)&p->dst[i3]); /** * If the matches' source points have common x and y coordinates, abort. */ /** * Check: * packedPoints[0].x == packedPoints[2].x * packedPoints[0].y == packedPoints[2].y * packedPoints[1].x == packedPoints[3].x * packedPoints[1].y == packedPoints[3].y */ __m128 chkEq0 = _mm_cmpeq_ps(src10, src32); /** * Check: * packedPoints[1].x == packedPoints[2].x * packedPoints[1].y == packedPoints[2].y * packedPoints[0].x == packedPoints[3].x * packedPoints[0].y == packedPoints[3].y */ __m128 chkEq1 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src10, _MM_SHUFFLE(1, 0, 3, 2)), src32); /** * Check: * packedPoints[0].x == packedPoints[1].x * packedPoints[0].y == packedPoints[1].y * packedPoints[2].x == packedPoints[3].x * packedPoints[2].y == packedPoints[3].y */ __m128 chkEq2 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src32, _MM_SHUFFLE(1, 0, 1, 0)), _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 2, 3, 2))); /* Verify */ if(_mm_movemask_ps(_mm_or_ps(chkEq0, _mm_or_ps(chkEq1, chkEq2)))){ return 1; } /* If the matches do not satisfy the strong geometric constraint, abort. 
*/ /** * p6420x = (p6.x, p4.x, p2.x, p0.x) * p6420y = (p6.y, p4.y, p2.y, p0.y) * p7531x = (p7.x, p5.x, p3.x, p1.x) * p7531y = (p7.y, p5.y, p3.y, p1.y) * crosssd0 = p6420y - p7531y = (cross2d0, cross0d0, cross2s0, cross0s0) * crosssd1 = p7531x - p6420x = (cross2d1, cross0d1, cross2s1, cross0s1) * crosssd2 = p6420x * p7531y - p6420y * p7531x = (cross2d2, cross0d2, cross2s2, cross0s2) * * shufcrosssd0 = (cross0d0, cross2d0, cross0s0, cross2s0) * shufcrosssd1 = (cross0d1, cross2d1, cross0s1, cross2s1) * shufcrosssd2 = (cross0d2, cross2d2, cross0s2, cross2s2) * * dotsd0 = shufcrosssd0 * p6420x + * shufcrosssd1 * p6420y + * shufcrosssd2 * = (dotd0, dotd2, dots0, dots2) * dotsd1 = shufcrosssd0 * p7531x + * shufcrosssd1 * p7531y + * shufcrosssd2 * = (dotd1, dotd3, dots1, dots3) * * dots = shufps(dotsd0, dotsd1, _MM_SHUFFLE(1, 0, 1, 0)) * dotd = shufps(dotsd0, dotsd1, _MM_SHUFFLE(3, 2, 3, 2)) * movmaskps(dots ^ dotd) */ __m128 p3210x = _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p3210y = _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 1, 3, 1)); __m128 p7654x = _mm_shuffle_ps(dst10, dst32, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p7654y = _mm_shuffle_ps(dst10, dst32, _MM_SHUFFLE(3, 1, 3, 1)); __m128 p6420x = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p6420y = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p7531x = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(3, 1, 3, 1)); __m128 p7531y = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(3, 1, 3, 1)); __m128 crosssd0 = _mm_sub_ps(p6420y, p7531y); __m128 crosssd1 = _mm_sub_ps(p7531x, p6420x); __m128 crosssd2 = _mm_sub_ps(_mm_mul_ps(p6420x, p7531y), _mm_mul_ps(p6420y, p7531x)); __m128 shufcrosssd0 = _mm_shuffle_ps(crosssd0, crosssd0, _MM_SHUFFLE(2, 3, 0, 1)); __m128 shufcrosssd1 = _mm_shuffle_ps(crosssd1, crosssd1, _MM_SHUFFLE(2, 3, 0, 1)); __m128 shufcrosssd2 = _mm_shuffle_ps(crosssd2, crosssd2, _MM_SHUFFLE(2, 3, 0, 1)); __m128 dotsd0 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p6420x), _mm_mul_ps(shufcrosssd1, p6420y)), shufcrosssd2); __m128 dotsd1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p7531x), _mm_mul_ps(shufcrosssd1, p7531y)), shufcrosssd2); __m128 dots = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(0, 1, 0, 1)); __m128 dotd = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(2, 3, 2, 3)); //if(_mm_movemask_ps(_mm_cmpge_ps(_mm_setzero_ps(), _mm_mul_ps(dots, dotd)))){ if(_mm_movemask_epi8(_mm_cmplt_epi32(_mm_xor_si128(_mm_cvtps_epi32(dots), _mm_cvtps_epi32(dotd)), _mm_setzero_si128()))){ return 1; } /* Otherwise, proceed with evaluation */ _mm_store_ps((float*)&p->pkdPts[0], src10); _mm_store_ps((float*)&p->pkdPts[2], src32); _mm_store_ps((float*)&p->pkdPts[4], dst10); _mm_store_ps((float*)&p->pkdPts[6], dst32); return 0; }
LXC_ERROR_CODE LXC_SSE3FreqSplit2Ch(uint Size, void *Z, void *X, void *Y) { if(!Size || !Z || !X || !Y) { return LXC_ERR_INVALID_INPUT; } float *m_X = (float*)X; float *m_Y = (float*)Y; float *m_Z = (float*)Z; Size = Size*2; #if defined(TARGET_WINDOWS) const __declspec(align(LXC_SSE3_ALIGN)) float scaleFactor = 0.5f; #else const float scaleFactor = 0.5f; #endif __m128 scale_05 = _mm_load1_ps(&scaleFactor); __m128 XY0 = _mm_setr_ps(m_Z[0], 0.0f, m_Z[1], 0.0f); // [0]=Z[0][0], [1]=0.0f, [2]=Z[0][1], [3]=0.0f __m128 _m128Z = _mm_load_ps(&m_Z[0]); __m128 _m128Z_Size = _mm_loadl_pi(_m128Z, (__m64*)&m_Z[Size-2]); // [0]=Z[Size-1][0], [1]=Z[Size-1][1], [2]=Z[1][0], [3]=Z[1][1] __m128 leftNumbers = _mm_shuffle_ps(_m128Z_Size, _m128Z_Size, LXC_MM_SHUFFLE(3,2,0,3)); // [0]=Z[1][1], [1]=Z[1][0], [2]=Z[Size-1][0], [3]=Z[1][1] __m128 rightNumbers = _mm_shuffle_ps(_m128Z_Size, _m128Z_Size, LXC_MM_SHUFFLE(1,0,2,1)); // [0]=Z[Size-1][1], [1]=Z[Size-1][0], [2]=Z[1][0], [3]=Z[Size-1][1] __m128 mulAddSubRes = _mm_mul_ps(_mm_addsub_ps(leftNumbers, rightNumbers), scale_05); // [0]=(Z[1][1] - Z[Size-1][1])*0.5f=X[1][1] // [1]=(Z[1][0] + Z[Size-1][0])*0.5f=X[1][0] // [2]=(Z[Size-1][0] - Z[1][0])*0.5f=Y[1][1] // [3]=(Z[1][1] + Z[Size-1][1])*0.5f=Y[1][0] _mm_store_ps(&m_X[0], _mm_shuffle_ps(XY0, mulAddSubRes, LXC_MM_SHUFFLE(0,1,1,0))); // [0]=X[0][0]=Z[0][0] // [1]=X[0][1]=0.0f // [2]=X[1][0]=(m_Z[kk][0] + m_Z[L_minus_K][0])*0.5f // [3]=X[1][1]=(m_Z[kk][1] - m_Z[L_minus_K][1])*0.5f _mm_store_ps(&m_Y[0], _mm_shuffle_ps(XY0, mulAddSubRes, LXC_MM_SHUFFLE(2,3,3,2))); // [0]=Y[0][0]=Z[0][1] // [1]=Y[0][1]=0.0f // [2]=Y[1][0]=(m_Z[kk][0] + m_Z[L_minus_K][0])*0.5f // [3]=Y[1][1]=(m_Z[kk][1] - m_Z[L_minus_K][1])*0.5f for(uint kk = 4; kk < Size; kk+=4) { //__m128 _Z = {0.0f,1.0f,2.0f,3.0f}; //__m128 L = {4.0f,5.0f,6.0f,7.0f}; __m128 _Z = _mm_load_ps(&m_Z[kk]); // [0]=Z[kk][0], [1]=Z[kk][1], [2]=Z[kk+1][0], [3]=Z[kk+1][1] __m128 _ZShuffle = _mm_shuffle_ps(_Z, _Z, LXC_MM_SHUFFLE(1,0,3,2)); // [0]=Z[kk][1], [1]=Z[kk][0], [2]=Z[kk+1][1], [3]=Z[kk+1][0] __m128 _ZSize = _mm_loadu_ps(&(m_Z[Size - kk - 2])); // [0]=Z[Size-kk-1][0], [1]=Z[Size-kk-1][1], [2]=Z[Size-kk][0], [3]=Z[Size-kk][1] // calculate X signal __m128 _ZSizeShuffle = _mm_shuffle_ps(_ZSize, _ZSize, LXC_MM_SHUFFLE(3,2,1,0)); // [0]=Z[Size-kk][1], [1]=Z[Size-kk][0], [2]=Z[Size-kk-1][1], [3]=Z[Size-kk-1][0] __m128 result = _mm_mul_ps(_mm_addsub_ps(_ZShuffle, _ZSizeShuffle), scale_05); // [0]=(Z[kk][1] - Z[Size-kk][1])*0.5f=X[kk][1] // [1]=(Z[kk][0] + Z[Size-kk][0])*0.5f=X[kk][0] // [0]=(Z[kk+1][1] - Z[Size-kk-1][1])*0.5f=X[kk+1][1] // [1]=(Z[kk+1][0] + Z[Size-kk-1][0])*0.5f=X[kk+1][0] _mm_store_ps(&m_X[kk], _mm_shuffle_ps(result, result, LXC_MM_SHUFFLE(1,0,3,2))); // [0]=X[kk][0] =(Z[kk][0] + Z[Size-kk][0])*0.5f // [1]=X[kk][1] =Z[kk][1] - Z[Size-kk][1])*0.5f // [2]=X[kk+1][0]=(Z[kk+1][1] + Z[Size-kk-1][1])*0.5f // [3]=X[kk+1][1]=(Z[Size-kk-1][0] - Z[kk+1][0])*0.5f // calculate Y signal __m128 left = _mm_shuffle_ps(_Z, _ZSize, LXC_MM_SHUFFLE(1,3,2,0)); // [0]=Z[kk][1], [1]=Z[kk+1][1], [2]=Z[Size-kk][0], [3]=Z[Size-kk-1][0] left = _mm_shuffle_ps(left, left, LXC_MM_SHUFFLE(2,0,3,1)); // [0]=Z[Size-kk][0], [1]=Z[kk][1], [2]=Z[Size-kk-1][0], [3]=Z[kk+1][1] __m128 right = _mm_shuffle_ps(_Z, _ZSize, LXC_MM_SHUFFLE(0,2,3,1)); // [0]=Z[kk][0], [1]=Z[kk+1][0], [2]=Z[Size-kk][1], [3]=Z[Size-kk-1][1] right = _mm_shuffle_ps(right, right, LXC_MM_SHUFFLE(0,2,1,3)); // [0]=Z[kk][0], [1]=Z[Size-kk][1], [2]=Z[kk+1][0], [3]=Z[Size-kk-1][1] result = _mm_mul_ps(_mm_addsub_ps(left, right), 
                                    scale_05);
    // [0]=Y[kk][0]  = 0.5f*(m_Z[Size-kk][0] - m_Z[kk][0]);
    // [1]=Y[kk][1]  = 0.5f*(m_Z[kk][1] + m_Z[Size-kk][1]);
    // [2]=Y[kk+1][1]= 0.5f*(m_Z[Size-kk-1][0] - m_Z[kk+1][0]);
    // [3]=Y[kk+1][0]= 0.5f*(m_Z[kk+1][1] + m_Z[Size-kk-1][1]);

    _mm_store_ps(&m_Y[kk], _mm_shuffle_ps(result, result, LXC_MM_SHUFFLE(1,0,3,2)));
    // [0]=Y[kk][0]  = 0.5f*(m_Z[kk][1] + m_Z[Size-kk][1]);
    // [1]=Y[kk][1]  = 0.5f*(m_Z[Size-kk][0] - m_Z[kk][0]);
    // [2]=Y[kk+1][0]= 0.5f*(m_Z[kk+1][1] + m_Z[Size-kk-1][1]);
    // [3]=Y[kk+1][1]= 0.5f*(m_Z[Size-kk-1][0] - m_Z[kk+1][0]);
  }

  return LXC_NO_ERR;
}
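A plain-C sketch of the per-bin arithmetic implied by the comments above, for one complex spectrum Z split into the two real-channel spectra X and Y. The interleaved re/im representation and the bin count N are assumptions made for illustration only.

/* Reference split of one interleaved complex spectrum Z into X and Y,
   following the formulas in the comments above (Z[k][0]=re, Z[k][1]=im, N complex bins). */
static void freq_split_ref(unsigned N, const float (*Z)[2], float (*X)[2], float (*Y)[2])
{
    X[0][0] = Z[0][0]; X[0][1] = 0.0f;   /* DC bins, as in the XY0 setup above */
    Y[0][0] = Z[0][1]; Y[0][1] = 0.0f;
    for (unsigned k = 1; k < N; ++k) {
        X[k][0] = 0.5f * (Z[k][0] + Z[N - k][0]);
        X[k][1] = 0.5f * (Z[k][1] - Z[N - k][1]);
        Y[k][0] = 0.5f * (Z[k][1] + Z[N - k][1]);
        Y[k][1] = 0.5f * (Z[N - k][0] - Z[k][0]);
    }
}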
// Does inverse according to Cramers Rule
// See ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
void Mat44::Cramers_Inverse_SSE(const Mat44 *out, f32 &detv) const
{
    f32 *src = (f32*)&mat;

    __m128 minor0=_mm_setzero_ps(), minor1=_mm_setzero_ps(), minor2=_mm_setzero_ps(), minor3=_mm_setzero_ps();
    __m128 row0=_mm_setzero_ps(), row1=_mm_setzero_ps(), row2=_mm_setzero_ps(), row3=_mm_setzero_ps();
    __m128 det=_mm_setzero_ps(), tmp1=_mm_setzero_ps();

    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
    row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
    row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
    row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
    row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
    row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
    row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);

    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_mul_ps(row1, tmp1);
    minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);

    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    minor3 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);

    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);

    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));

    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));

    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);

    det = _mm_mul_ps(row0, minor0);
    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
    tmp1 = _mm_rcp_ss(det);
    det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
    det = _mm_shuffle_ps(det, det, 0x00);

    _mm_store_ss(&detv, det);

    Mat44 t;
    if(out)
    {
        src = (f32*)out->mat;
    }
    else
    {
        src = t.mat;
    }

    minor0 = _mm_mul_ps(det, minor0);
    _mm_storel_pi((__m64*)(src), minor0);
    _mm_storeh_pi((__m64*)(src+2), minor0);
    minor1 = _mm_mul_ps(det, minor1);
    _mm_storel_pi((__m64*)(src+4), minor1);
    _mm_storeh_pi((__m64*)(src+6), minor1);
    minor2 = _mm_mul_ps(det, minor2);
    _mm_storel_pi((__m64*)(src+ 8), minor2);
    _mm_storeh_pi((__m64*)(src+10), minor2);
    minor3 = _mm_mul_ps(det, minor3);
    _mm_storel_pi((__m64*)(src+12), minor3);
    _mm_storeh_pi((__m64*)(src+14), minor3);
}
M_Matrix44 M_MatrixInvert44_SSE(M_Matrix44 A) { M_Matrix44 Ainv; float *src = &A.m[0][0]; float *dst = &Ainv.m[0][0]; __m128 minor0, minor1, minor2, minor3; __m128 row0, row1, row2, row3; __m128 det, tmp1; tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src)), (__m64 *)(src+4)); row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64 *)(src+8)), (__m64 *)(src+12)); row0 = _mm_shuffle_ps(tmp1, row1, 0x88); row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src+2)), (__m64 *)(src+6)); row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64 *)(src+10)), (__m64 *)(src+14)); row2 = _mm_shuffle_ps(tmp1, row3, 0x88); row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); tmp1 = _mm_mul_ps(row2, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_mul_ps(row1, tmp1); minor1 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); tmp1 = _mm_mul_ps(row1, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); minor3 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); row2 = _mm_shuffle_ps(row2, row2, 0x4E); minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); minor2 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); tmp1 = _mm_mul_ps(row0, row1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); tmp1 = _mm_mul_ps(row0, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_mul_ps(row0, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); det = _mm_mul_ps(row0, minor0); det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); tmp1 = _mm_rcp_ss(det); det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1,tmp1))); det = _mm_shuffle_ps(det, det, 0x00); minor0 = _mm_mul_ps(det, minor0); _mm_storel_pi((__m64 *)(dst), minor0); _mm_storeh_pi((__m64 *)(dst+2), minor0); minor1 = _mm_mul_ps(det, minor1); _mm_storel_pi((__m64 *)(dst+4), minor1); _mm_storeh_pi((__m64 *)(dst+6), minor1); minor2 = _mm_mul_ps(det, minor2); _mm_storel_pi((__m64 *)(dst+8), minor2); _mm_storeh_pi((__m64 *)(dst+10), minor2); minor3 = _mm_mul_ps(det, minor3); _mm_storel_pi((__m64 *)(dst+12), minor3); _mm_storeh_pi((__m64 *)(dst+14), minor3); return (Ainv); }
void mpeg2_idct_add_sse2(int,int16_t* block, uint8_t* dest, const int stride) { __m128i &src0=*(__m128i*)(block+0*16/2); __m128i &src1=*(__m128i*)(block+1*16/2); __m128i &src2=*(__m128i*)(block+2*16/2); __m128i &src3=*(__m128i*)(block+3*16/2); __m128i &src4=*(__m128i*)(block+4*16/2); __m128i &src5=*(__m128i*)(block+5*16/2); __m128i &src6=*(__m128i*)(block+6*16/2); __m128i &src7=*(__m128i*)(block+7*16/2); idct_M128ASM (src0,src1,src2,src3,src4,src5,src6,src7); __m128i zero = _mm_setzero_si128(); __m128i r0 = _mm_load_si128(&src0); __m128i r1 = _mm_load_si128(&src1); __m128i r2 = _mm_load_si128(&src2); __m128i r3 = _mm_load_si128(&src3); __m128i r4 = _mm_load_si128(&src4); __m128i r5 = _mm_load_si128(&src5); __m128i r6 = _mm_load_si128(&src6); __m128i r7 = _mm_load_si128(&src7); __m128 q0 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[0*stride]); __m128 q1 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[1*stride]); __m128 q2 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[2*stride]); __m128 q3 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[3*stride]); __m128 q4 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[4*stride]); __m128 q5 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[5*stride]); __m128 q6 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[6*stride]); __m128 q7 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[7*stride]); r0 = _mm_adds_epi16(r0, _mm_unpacklo_epi8(*(__m128i*)&q0, zero)); r1 = _mm_adds_epi16(r1, _mm_unpacklo_epi8(*(__m128i*)&q1, zero)); r2 = _mm_adds_epi16(r2, _mm_unpacklo_epi8(*(__m128i*)&q2, zero)); r3 = _mm_adds_epi16(r3, _mm_unpacklo_epi8(*(__m128i*)&q3, zero)); r4 = _mm_adds_epi16(r4, _mm_unpacklo_epi8(*(__m128i*)&q4, zero)); r5 = _mm_adds_epi16(r5, _mm_unpacklo_epi8(*(__m128i*)&q5, zero)); r6 = _mm_adds_epi16(r6, _mm_unpacklo_epi8(*(__m128i*)&q6, zero)); r7 = _mm_adds_epi16(r7, _mm_unpacklo_epi8(*(__m128i*)&q7, zero)); r0 = _mm_packus_epi16(r0, r1); r1 = _mm_packus_epi16(r2, r3); r2 = _mm_packus_epi16(r4, r5); r3 = _mm_packus_epi16(r6, r7); _mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0); _mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0); _mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1); _mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1); _mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2); _mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2); _mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3); _mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3); _mm_store_si128(&src0, zero); _mm_store_si128(&src1, zero); _mm_store_si128(&src2, zero); _mm_store_si128(&src3, zero); _mm_store_si128(&src4, zero); _mm_store_si128(&src5, zero); _mm_store_si128(&src6, zero); _mm_store_si128(&src7, zero); }
/* Calculates bounding rectagnle of a point set or retrieves already calculated */ CV_IMPL CvRect cvBoundingRect( CvArr* array, int update ) { CvSeqReader reader; CvRect rect = { 0, 0, 0, 0 }; CvContour contour_header; CvSeq* ptseq = 0; CvSeqBlock block; CvMat stub, *mat = 0; int xmin = 0, ymin = 0, xmax = -1, ymax = -1, i, j, k; int calculate = update; if( CV_IS_SEQ( array )) { ptseq = (CvSeq*)array; if( !CV_IS_SEQ_POINT_SET( ptseq )) CV_Error( CV_StsBadArg, "Unsupported sequence type" ); if( ptseq->header_size < (int)sizeof(CvContour)) { update = 0; calculate = 1; } } else { mat = cvGetMat( array, &stub ); if( CV_MAT_TYPE(mat->type) == CV_32SC2 || CV_MAT_TYPE(mat->type) == CV_32FC2 ) { ptseq = cvPointSeqFromMat(CV_SEQ_KIND_GENERIC, mat, &contour_header, &block); mat = 0; } else if( CV_MAT_TYPE(mat->type) != CV_8UC1 && CV_MAT_TYPE(mat->type) != CV_8SC1 ) CV_Error( CV_StsUnsupportedFormat, "The image/matrix format is not supported by the function" ); update = 0; calculate = 1; } if( !calculate ) return ((CvContour*)ptseq)->rect; if( mat ) { CvSize size = cvGetMatSize(mat); xmin = size.width; ymin = -1; for( i = 0; i < size.height; i++ ) { uchar* _ptr = mat->data.ptr + i*mat->step; uchar* ptr = (uchar*)cvAlignPtr(_ptr, 4); int have_nz = 0, k_min, offset = (int)(ptr - _ptr); j = 0; offset = MIN(offset, size.width); for( ; j < offset; j++ ) if( _ptr[j] ) { have_nz = 1; break; } if( j < offset ) { if( j < xmin ) xmin = j; if( j > xmax ) xmax = j; } if( offset < size.width ) { xmin -= offset; xmax -= offset; size.width -= offset; j = 0; for( ; j <= xmin - 4; j += 4 ) if( *((int*)(ptr+j)) ) break; for( ; j < xmin; j++ ) if( ptr[j] ) { xmin = j; if( j > xmax ) xmax = j; have_nz = 1; break; } k_min = MAX(j-1, xmax); k = size.width - 1; for( ; k > k_min && (k&3) != 3; k-- ) if( ptr[k] ) break; if( k > k_min && (k&3) == 3 ) { for( ; k > k_min+3; k -= 4 ) if( *((int*)(ptr+k-3)) ) break; } for( ; k > k_min; k-- ) if( ptr[k] ) { xmax = k; have_nz = 1; break; } if( !have_nz ) { j &= ~3; for( ; j <= k - 3; j += 4 ) if( *((int*)(ptr+j)) ) break; for( ; j <= k; j++ ) if( ptr[j] ) { have_nz = 1; break; } } xmin += offset; xmax += offset; size.width += offset; } if( have_nz ) { if( ymin < 0 ) ymin = i; ymax = i; } } if( xmin >= size.width ) xmin = ymin = 0; } else if( ptseq->total ) { int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2; cvStartReadSeq( ptseq, &reader, 0 ); CvPoint pt; CV_READ_SEQ_ELEM( pt, reader ); #if CV_SSE4_2 if(cv::checkHardwareSupport(CV_CPU_SSE4_2)) { if( !is_float ) { __m128i minval, maxval; minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y for( i = 1; i < ptseq->total; i++) { __m128i ptXY = _mm_loadl_epi64((const __m128i*)(reader.ptr)); CV_NEXT_SEQ_ELEM(sizeof(pt), reader); minval = _mm_min_epi32(ptXY, minval); maxval = _mm_max_epi32(ptXY, maxval); } xmin = _mm_cvtsi128_si32(minval); ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4)); xmax = _mm_cvtsi128_si32(maxval); ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4)); } else { __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps(); minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt)); for( i = 1; i < ptseq->total; i++ ) { ptXY = _mm_loadl_pi(ptXY, (const __m64*)reader.ptr); CV_NEXT_SEQ_ELEM(sizeof(pt), reader); minvalf = _mm_min_ps(minvalf, ptXY); maxvalf = _mm_max_ps(maxvalf, ptXY); } float xyminf[2], xymaxf[2]; _mm_storel_pi((__m64*)xyminf, minvalf); _mm_storel_pi((__m64*)xymaxf, maxvalf); xmin = cvFloor(xyminf[0]); ymin = cvFloor(xyminf[1]); xmax = cvFloor(xymaxf[0]); ymax = 
cvFloor(xymaxf[1]); } } else #endif { if( !is_float ) { xmin = xmax = pt.x; ymin = ymax = pt.y; for( i = 1; i < ptseq->total; i++ ) { CV_READ_SEQ_ELEM( pt, reader ); if( xmin > pt.x ) xmin = pt.x; if( xmax < pt.x ) xmax = pt.x; if( ymin > pt.y ) ymin = pt.y; if( ymax < pt.y ) ymax = pt.y; } } else { Cv32suf v; // init values xmin = xmax = CV_TOGGLE_FLT(pt.x); ymin = ymax = CV_TOGGLE_FLT(pt.y); for( i = 1; i < ptseq->total; i++ ) { CV_READ_SEQ_ELEM( pt, reader ); pt.x = CV_TOGGLE_FLT(pt.x); pt.y = CV_TOGGLE_FLT(pt.y); if( xmin > pt.x ) xmin = pt.x; if( xmax < pt.x ) xmax = pt.x; if( ymin > pt.y ) ymin = pt.y; if( ymax < pt.y ) ymax = pt.y; } v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); // because right and bottom sides of the bounding rectangle are not inclusive // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); } } rect.x = xmin; rect.y = ymin; rect.width = xmax - xmin + 1; rect.height = ymax - ymin + 1; } if( update ) ((CvContour*)ptseq)->rect = rect; return rect; }
void mpeg2_idct_add_sse2(const int last, int16_t* block, uint8_t* dest, const int stride) { idct_M128ASM(block); /* for(int i = 0; i < 8; i++) { dest[0] = CLIP(block[0] + dest[0]); dest[1] = CLIP(block[1] + dest[1]); dest[2] = CLIP(block[2] + dest[2]); dest[3] = CLIP(block[3] + dest[3]); dest[4] = CLIP(block[4] + dest[4]); dest[5] = CLIP(block[5] + dest[5]); dest[6] = CLIP(block[6] + dest[6]); dest[7] = CLIP(block[7] + dest[7]); memset(block, 0, sizeof(short)*8); dest += stride; block += 8; } */ __m128i* src = (__m128i*)block; __m128i zero = _mm_setzero_si128(); __m128i r0 = _mm_load_si128(&src[0]); __m128i r1 = _mm_load_si128(&src[1]); __m128i r2 = _mm_load_si128(&src[2]); __m128i r3 = _mm_load_si128(&src[3]); __m128i r4 = _mm_load_si128(&src[4]); __m128i r5 = _mm_load_si128(&src[5]); __m128i r6 = _mm_load_si128(&src[6]); __m128i r7 = _mm_load_si128(&src[7]); __m128 q0 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[0*stride]); __m128 q1 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[1*stride]); __m128 q2 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[2*stride]); __m128 q3 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[3*stride]); __m128 q4 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[4*stride]); __m128 q5 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[5*stride]); __m128 q6 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[6*stride]); __m128 q7 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[7*stride]); r0 = _mm_adds_epi16(r0, _mm_unpacklo_epi8(*(__m128i*)&q0, zero)); r1 = _mm_adds_epi16(r1, _mm_unpacklo_epi8(*(__m128i*)&q1, zero)); r2 = _mm_adds_epi16(r2, _mm_unpacklo_epi8(*(__m128i*)&q2, zero)); r3 = _mm_adds_epi16(r3, _mm_unpacklo_epi8(*(__m128i*)&q3, zero)); r4 = _mm_adds_epi16(r4, _mm_unpacklo_epi8(*(__m128i*)&q4, zero)); r5 = _mm_adds_epi16(r5, _mm_unpacklo_epi8(*(__m128i*)&q5, zero)); r6 = _mm_adds_epi16(r6, _mm_unpacklo_epi8(*(__m128i*)&q6, zero)); r7 = _mm_adds_epi16(r7, _mm_unpacklo_epi8(*(__m128i*)&q7, zero)); r0 = _mm_packus_epi16(r0, r1); r1 = _mm_packus_epi16(r2, r3); r2 = _mm_packus_epi16(r4, r5); r3 = _mm_packus_epi16(r6, r7); _mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0); _mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0); _mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1); _mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1); _mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2); _mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2); _mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3); _mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3); _mm_store_si128(&src[0], zero); _mm_store_si128(&src[1], zero); _mm_store_si128(&src[2], zero); _mm_store_si128(&src[3], zero); _mm_store_si128(&src[4], zero); _mm_store_si128(&src[5], zero); _mm_store_si128(&src[6], zero); _mm_store_si128(&src[7], zero); }
static inline double calc_output_single (SINC_FILTER *filter, const increment_t increment, const increment_t start_filter_index) { #ifdef RESAMPLER_SSE_OPT __m128i increment4; __m128 left128,right128; float left,right; #else double left,right; #endif const coeff_t * const __restrict coeffs = filter->coeffs; const float * const __restrict buffer = filter->buffer; increment_t filter_index, max_filter_index ; int data_index, coeff_count; /* Convert input parameters into fixed point. */ max_filter_index = int_to_fp (filter->coeff_half_len) ; /* First apply the left half of the filter. */ filter_index = start_filter_index ; coeff_count = (max_filter_index - filter_index) / increment ; filter_index = filter_index + coeff_count * increment ; data_index = filter->b_current - coeff_count ; #ifdef RESAMPLER_SSE_OPT increment4 = _mm_set_epi32(increment * 3, increment * 2, increment, 0); left128 = _mm_setzero_ps(); while(filter_index >= increment * 3) { #ifdef USE_WINDOWS_CODE __m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #else Windows__m128i indx; indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx.m128i,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #endif __m128 icoeff0, icoeff2; // warning that these are uninitialized is okay and its intended, as both high and low 64bit-parts are set below __m128 icoeff,icoeffp1,icoeffd,fraction; #ifdef _DEBUG icoeff0 = icoeff2 = _mm_setzero_ps(); #endif #ifdef USE_WINDOWS_CODE indx = _mm_srai_epi32(indx, SHIFT_BITS); #else indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS); #endif icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1])); icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3])); icoeff = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0)); icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1)); icoeffd = _mm_sub_ps(icoeffp1, icoeff); fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE)); icoeff = _mm_add_ps(icoeff,_mm_mul_ps(icoeffd, fraction)); left128 = _mm_add_ps(left128,_mm_mul_ps(icoeff, _mm_loadu_ps(buffer + data_index))); data_index += 4; filter_index -= increment * 4; } #endif left = 0.; while (filter_index >= MAKE_INCREMENT_T(0)) { coeff_t fraction = fp_to_float(filter_index); int indx = fp_to_int(filter_index); coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]); left += icoeff * buffer[data_index]; filter_index -= increment; data_index++; } /* Now apply the right half of the filter. 
*/ filter_index = increment - start_filter_index ; coeff_count = (max_filter_index - filter_index) / increment ; filter_index = filter_index + coeff_count * increment ; data_index = filter->b_current + 1 + coeff_count ; #ifdef RESAMPLER_SSE_OPT right128 = _mm_setzero_ps(); while (filter_index > increment * 3) { #ifdef USE_WINDOWS_CODE __m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #else Windows__m128i indx; indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx.m128i, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #endif __m128 icoeff0, icoeff2; // warning that these are uninitialized is okay and its intended, as both high and low 64bit-parts are set below __m128 icoeff,icoeffp1,icoeffd,fraction,data; #ifdef _DEBUG icoeff0 = icoeff2 = _mm_setzero_ps(); #endif #ifdef USE_WINDOWS_CODE indx = _mm_srai_epi32(indx, SHIFT_BITS); #else indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS); #endif icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1])); icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3])); icoeff = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0)); icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1)); icoeffd = _mm_sub_ps(icoeffp1, icoeff); fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE)); icoeff = _mm_add_ps(icoeff, _mm_mul_ps(icoeffd, fraction)); data = _mm_loadu_ps(buffer + (data_index - 3)); right128 = _mm_add_ps(right128,_mm_mul_ps(icoeff, _mm_shuffle_ps(data,data,_MM_SHUFFLE(0,1,2,3)))); data_index -= 4; filter_index -= increment * 4; } #endif right = 0.; while (filter_index > MAKE_INCREMENT_T(0)) { coeff_t fraction = fp_to_float(filter_index); int indx = fp_to_int(filter_index); coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]); right += icoeff * buffer[data_index]; filter_index -= increment; data_index--; } return ( #ifdef RESAMPLER_SSE_OPT _mm_cvtss_f32(horizontal_add(left128)) + _mm_cvtss_f32(horizontal_add(right128)) + #endif left + right) ; } /* calc_output_single */
// from intel Matrix4x4SSE &Matrix4x4SSE::Invert(void) { float *src = &m_Vec0[0]; __m128 minor0, minor1, minor2, minor3; __m128 det; // fool compiler only.. __m128 tmp1 = m_Vec0.m_Vec; __m128 row0 = m_Vec0.m_Vec; __m128 row1 = m_Vec1.m_Vec; __m128 row2 = m_Vec2.m_Vec; __m128 row3 = m_Vec3.m_Vec; tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4)); row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12)); row0 = _mm_shuffle_ps(tmp1, row1, 0x88); row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)); row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14)); row2 = _mm_shuffle_ps(tmp1, row3, 0x88); row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); // ----------------------------------------------- tmp1 = _mm_mul_ps(row2, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_mul_ps(row1, tmp1); minor1 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(row1, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); minor3 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); row2 = _mm_shuffle_ps(row2, row2, 0x4E); minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); minor2 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); // ----------------------------------------------- det = _mm_mul_ps(row0, minor0); det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); tmp1 = _mm_rcp_ss(det); det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); det = _mm_shuffle_ps(det, det, 0x00); minor0 = _mm_mul_ps(det, minor0); _mm_storel_pi((__m64*)(src), 
minor0); _mm_storeh_pi((__m64*)(src+2), minor0); minor1 = _mm_mul_ps(det, minor1); _mm_storel_pi((__m64*)(src+4), minor1); _mm_storeh_pi((__m64*)(src+6), minor1); minor2 = _mm_mul_ps(det, minor2); _mm_storel_pi((__m64*)(src+ 8), minor2); _mm_storeh_pi((__m64*)(src+10), minor2); minor3 = _mm_mul_ps(det, minor3); _mm_storel_pi((__m64*)(src+12), minor3); _mm_storeh_pi((__m64*)(src+14), minor3); return *this; }
// Inverts a 4x4 matrix and returns the determinate inline float invert_44_matrix(float* src) { // Code pulled from "Streaming SIMD Extensions - Inverse of 4x4 Matrix" // by Intel. // ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf __m128 minor0; __m128 minor1; __m128 minor2; __m128 minor3; __m128 row0; __m128 row1; __m128 row2; __m128 row3; __m128 det; __m128 tmp1; tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4)); row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12)); row0 = _mm_shuffle_ps(tmp1, row1, 0x88); row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)); row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14)); row2 = _mm_shuffle_ps(tmp1, row3, 0x88); row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); // ----------------------------------------------- tmp1 = _mm_mul_ps(row2, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_mul_ps(row1, tmp1); minor1 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(row1, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); minor3 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); row2 = _mm_shuffle_ps(row2, row2, 0x4E); minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); minor2 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); // ----------------------------------------------- det = _mm_mul_ps(row0, minor0); det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); tmp1 = _mm_rcp_ss(det); det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); det = 
_mm_shuffle_ps(det, det, 0x00); minor0 = _mm_mul_ps(det, minor0); _mm_storel_pi((__m64*)(src), minor0); _mm_storeh_pi((__m64*)(src+2), minor0); minor1 = _mm_mul_ps(det, minor1); _mm_storel_pi((__m64*)(src+4), minor1); _mm_storeh_pi((__m64*)(src+6), minor1); minor2 = _mm_mul_ps(det, minor2); _mm_storel_pi((__m64*)(src+ 8), minor2); _mm_storeh_pi((__m64*)(src+10), minor2); minor3 = _mm_mul_ps(det, minor3); _mm_storel_pi((__m64*)(src+12), minor3); _mm_storeh_pi((__m64*)(src+14), minor3); return det[0]; }
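A minimal usage sketch for the routine above; the matrix values are illustrative. Note that, as written, the returned value is the Newton-refined reciprocal computed just before the stores rather than the determinant itself, so with the identity matrix both are 1.

float m[16] = { 1.f, 0.f, 0.f, 0.f,
                0.f, 1.f, 0.f, 0.f,
                0.f, 0.f, 1.f, 0.f,
                0.f, 0.f, 0.f, 1.f };   /* illustrative input, inverted in place */
float d = invert_44_matrix(m);          /* m now holds its inverse; d is ~1.f here */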
int main() { float *arr = get_arr(); // [4, 3, 2, 1] float *uarr = get_uarr(); // [5, 4, 3, 2] float *arr2 = get_arr2(); // [4, 3, 2, 1] float *uarr2 = get_uarr2(); // [5, 4, 3, 2] __m128 a = get_a(); // [8, 6, 4, 2] __m128 b = get_b(); // [1, 2, 3, 4] // Check that test data is like expected. Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned. Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned. // Test that aeq itself works and does not trivially return true on everything. Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false); #ifdef TEST_M64 Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false); #endif // SSE1 Load instructions: aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address. aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide. aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest. aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1 aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest. aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest. aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order. aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address. // SSE1 Set instructions: aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands. aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded. aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher. aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1 aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order. aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register. // SSE1 Move instructions: aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b. aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output. aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output. // SSE1 Store instructions: #ifdef TEST_M64 /*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value. /*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL; _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64. #endif _mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address. 
_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory. _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1 _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory. _mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output. _mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address. #ifdef TEST_M64 /*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint. #endif _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint. // SSE1 Arithmetic instructions: aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add. aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a. aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div. aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a. aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul. aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a. #ifdef TEST_M64 __m64 m1 = get_m1(); /*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts. /*M64*/aeq64( _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16. __m64 m2 = get_m2(); /*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar. /*M64*/aeq64( _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8. #endif aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub. aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a. // SSE1 Elementary Math functions: #ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass. aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x. aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged. aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x). aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged. #endif aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x). 
aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged. __m128 i1 = get_i1(); __m128 i2 = get_i2(); // SSE1 Logical instructions: #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2 aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR #endif // SSE1 Compare instructions: // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp == aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged. aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >= aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged. aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp > aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged. aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <= aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged. aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp < aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged. aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp != aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged. aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >= aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged. aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not > aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged. aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <= aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged. aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not < aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged. __m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN] __m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0] aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan. 
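// Illustrative sketch added for exposition (not part of the original test suite): the
// all-ones/all-zeros masks produced by the compare intrinsics are typically consumed
// with and/andnot/or to perform a branchless per-lane select, e.g. max(a, b).
#ifndef __EMSCRIPTEN__ // Guarded like the logical-op tests above: the polyfill's NaN canonicalization can disturb all-ones masks.
{
	__m128 mask = _mm_cmpgt_ps(a, b);                                     // lanes where a > b
	__m128 sel  = _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b)); // mask ? a : b
	aeq(sel, 8.f, 6.f, 4.f, 4.f);                                         // same result as _mm_max_ps(a, b)
}
#endif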
aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged. // Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan. #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged. #endif Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int. Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int. Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int. Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int. Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int. Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int. // The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP // exception when one of the input operands is either a QNaN or a SNaN. #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1); #endif Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0); Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0); Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1); Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1); #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0); #endif // SSE1 Convert instructions: __m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 e = get_e(); // [INF, -INF, 2.5, 3.5] __m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808] #ifdef TEST_M64 /*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128. /*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64. #endif aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128. aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss. #ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions. Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int. Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32. 
#endif #ifdef TEST_M64 /*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged. /*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float. /*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128. /*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64. /*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64. /*M64*/aeq64(_mm_cvtps_pi8(c), 0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64. /*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128. #endif aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged. Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float. Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64. #endif Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32. Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64. #endif Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64. #ifndef __EMSCRIPTEN__ // TODO: Not implemented. // SSE1 General support: unsigned int mask = _MM_GET_EXCEPTION_MASK(); _MM_SET_EXCEPTION_MASK(mask); unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE(); _MM_SET_FLUSH_ZERO_MODE(flushZeroMode); unsigned int roundingMode = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(roundingMode); unsigned int csr = _mm_getcsr(); _mm_setcsr(csr); unsigned char dummyData[4096]; _mm_prefetch(dummyData, _MM_HINT_T0); _mm_prefetch(dummyData, _MM_HINT_T1); _mm_prefetch(dummyData, _MM_HINT_T2); _mm_prefetch(dummyData, _MM_HINT_NTA); _mm_sfence(); #endif // SSE1 Misc instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64. /*M64*/Assert( _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8. #endif Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels. // SSE1 Probability/Statistics instructions: #ifdef TEST_M64 /*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s. 
/*M64*/aeq64( _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16. /*M64*/aeq64(_mm_avg_pu8(m1, m2), 0x7FEE9D4D43A23548ULL); // 8-way average uint8s. /*M64*/aeq64( _m_pavgb(m1, m2), 0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8. // SSE1 Special Math instructions: /*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s. /*M64*/aeq64( _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16. /*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s. /*M64*/aeq64( _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8. /*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s. /*M64*/aeq64( _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16. /*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s. /*M64*/aeq64( _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8. #endif // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max. aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged. aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min. aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged. // SSE1 Swizzle instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from an m64. /*M64*/Assert( _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16. /*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert an int16 to a specific channel of an m64. /*M64*/aeq64( _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16. /*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64. /*M64*/aeq64( _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16. #endif aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); // Pick two lanes from a into the low half and two lanes from b into the high half. aeq(_mm_unpackhi_ps(a, b), 1.f, 8.f, 2.f, 6.f); // Interleave the two highest lanes of a and b. aeq(_mm_unpacklo_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Interleave the two lowest lanes of a and b. // Transposing a matrix via the xmmintrin.h-provided macro. __m128 c0 = a; // [8, 6, 4, 2] __m128 c1 = b; // [1, 2, 3, 4] __m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5] _MM_TRANSPOSE4_PS(c0, c1, c2, c3); aeq(c0, 2.5f, 4.5f, 4.f, 2.f); aeq(c1, 4.5f, 3.5f, 3.f, 4.f); aeq(c2, 6.5f, 2.5f, 2.f, 6.f); aeq(c3, 8.5f, 1.5f, 1.f, 8.f); // All done! if (numFailures == 0) printf("Success!\n"); else printf("%d tests failed!\n", numFailures); }
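The _MM_TRANSPOSE4_PS macro exercised at the end of the test above is conventionally built from exactly the unpack/move intrinsics the test covers. A minimal free-standing sketch of that composition (the helper name transpose4x4 is ours, not taken from any of the snippets here):

#include <xmmintrin.h>

// Transpose four row vectors in place; equivalent to _MM_TRANSPOSE4_PS(r0, r1, r2, r3).
static inline void transpose4x4(__m128* r0, __m128* r1, __m128* r2, __m128* r3)
{
    __m128 t0 = _mm_unpacklo_ps(*r0, *r1); // r0[0] r1[0] r0[1] r1[1]
    __m128 t1 = _mm_unpacklo_ps(*r2, *r3); // r2[0] r3[0] r2[1] r3[1]
    __m128 t2 = _mm_unpackhi_ps(*r0, *r1); // r0[2] r1[2] r0[3] r1[3]
    __m128 t3 = _mm_unpackhi_ps(*r2, *r3); // r2[2] r3[2] r2[3] r3[3]
    *r0 = _mm_movelh_ps(t0, t1);           // column 0: r0[0] r1[0] r2[0] r3[0]
    *r1 = _mm_movehl_ps(t1, t0);           // column 1: r0[1] r1[1] r2[1] r3[1]
    *r2 = _mm_movelh_ps(t2, t3);           // column 2
    *r3 = _mm_movehl_ps(t3, t2);           // column 3
}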
void swizzle (const void *a, vector4_t * b, vector4_t * c) { b->v = _mm_loadl_pi (b->v, (__m64 *) a); c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1); }
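A small stand-alone driver for swizzle() above, showing that each output keeps its two high lanes and receives one 64-bit half of the input; vector4_t is assumed here to be a thin wrapper around an __m128 (its real definition is not part of this excerpt):

#include <stdio.h>
#include <xmmintrin.h>

/* Assumed layout for this sketch; the real vector4_t definition is not shown above. */
typedef struct { __m128 v; } vector4_t;

void swizzle(const void *a, vector4_t *b, vector4_t *c); /* defined in the snippet above */

int main(void)
{
    float in[4] = {1.f, 2.f, 3.f, 4.f};
    vector4_t b, c;
    b.v = _mm_setzero_ps();
    c.v = _mm_setzero_ps();
    swizzle(in, &b, &c); /* b.v <- {1, 2, 0, 0}, c.v <- {3, 4, 0, 0} */
    float out[8];
    _mm_storeu_ps(out, b.v);
    _mm_storeu_ps(out + 4, c.v);
    printf("b = %g %g %g %g, c = %g %g %g %g\n",
           out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
    return 0;
}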
ibMtx4& ibMtx4::Invert() { f32* src = &data.a[0][0]; __m128 minor0, minor1, minor2, minor3; __m128 row0, row1, row2, row3; __m128 det, tmp1; #if !defined NDEBUG || defined STATIC // Suppress RTC error for uninit vars f32 init = 0.f; row3 = row1 = tmp1 = _mm_load_ps1( &init ); #endif // NDEBUG tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4)); row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12)); row0 = _mm_shuffle_ps(tmp1, row1, 0x88); row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)); row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14)); row2 = _mm_shuffle_ps(tmp1, row3, 0x88); row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); // ----------------------------------------------- tmp1 = _mm_mul_ps(row2, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_mul_ps(row1, tmp1); minor1 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(row1, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); minor3 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); row2 = _mm_shuffle_ps(row2, row2, 0x4E); minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); minor2 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); // ----------------------------------------------- det = _mm_mul_ps(row0, minor0); det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); tmp1 = _mm_rcp_ss(det); det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); det = _mm_shuffle_ps(det, det, 0x00); minor0 = _mm_mul_ps(det, minor0); _mm_storel_pi((__m64*)(src), 
minor0); _mm_storeh_pi((__m64*)(src+2), minor0); minor1 = _mm_mul_ps(det, minor1); _mm_storel_pi((__m64*)(src+4), minor1); _mm_storeh_pi((__m64*)(src+6), minor1); minor2 = _mm_mul_ps(det, minor2); _mm_storel_pi((__m64*)(src+ 8), minor2); _mm_storeh_pi((__m64*)(src+10), minor2); minor3 = _mm_mul_ps(det, minor3); _mm_storel_pi((__m64*)(src+12), minor3); _mm_storeh_pi((__m64*)(src+14), minor3); return *this; }
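The prologue of ibMtx4::Invert above uses the same _mm_loadl_pi/_mm_loadh_pi half-loads plus _mm_shuffle_ps to pull the row-major matrix into column registers before the cofactor arithmetic. A free-standing sketch of that gather step with straight columns (names are ours; note the routine above swaps the shuffle operands for rows 1 and 3, so those two registers come out rotated by two lanes, while the sketch below produces plain columns):

#include <xmmintrin.h>

/* Gather the four columns of a row-major float[16] matrix using 64-bit half-loads. */
static void load_columns(const float* src, __m128* c0, __m128* c1, __m128* c2, __m128* c3)
{
    __m128 lo01 = _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(src)),      (const __m64*)(src + 4));  /* m00 m01 m10 m11 */
    __m128 lo23 = _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(src + 8)),  (const __m64*)(src + 12)); /* m20 m21 m30 m31 */
    __m128 hi01 = _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(src + 2)),  (const __m64*)(src + 6));  /* m02 m03 m12 m13 */
    __m128 hi23 = _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(src + 10)), (const __m64*)(src + 14)); /* m22 m23 m32 m33 */
    *c0 = _mm_shuffle_ps(lo01, lo23, 0x88); /* m00 m10 m20 m30 */
    *c1 = _mm_shuffle_ps(lo01, lo23, 0xDD); /* m01 m11 m21 m31 */
    *c2 = _mm_shuffle_ps(hi01, hi23, 0x88); /* m02 m12 m22 m32 */
    *c3 = _mm_shuffle_ps(hi01, hi23, 0xDD); /* m03 m13 m23 m33 */
}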
// function that implements the kernel of the seismic modeling algorithm void seismic_exec(float **VEL, float **PPF, float **APF, float **NPF, float* seismicPulseVector, int spPosX, int spPosY, int xDim, int yDim, int timeSteps) { int i,j; // spatial loops counters int t; // time loop counter #ifdef _VERBOSE int progressTimer = -1; #endif // make sure packing _all_ the data into sets of 4 element is ok assert( xDim % 4 == 0 ); #ifdef _VERBOSE printf("processing...\n"); printf("point of explosion = %d, %d\n", spPosX, spPosY); #endif // there are 16 XMM registers in 64 bit mode, so there is no need to spill to stack __m128 s_ppf, s_vel, s_actual, s_above1, s_left1, s_under1, s_right1, s_two, s_sixteen, s_sixty; __m128 s_above2, s_under2, s_left2, s_right2; float two[4] = {2.0f, 2.0f, 2.0f, 2.0f }; float sixteen[4] = {16.0f,16.0f,16.0f,16.0f}; float sixty[4] = {60.f,60.f,60.f,60.f}; // preload XMM registers with constant values. s_two = _mm_load_ps( two ); s_sixteen = _mm_load_ps( sixteen ); s_sixty = _mm_load_ps( sixty ); // time loop for (t = 0; t < timeSteps; t++) { #ifdef _VVERBOSE printf("----------------------------------------------\ntimestep: %d\n\n", t ); #endif // add pulse APF[spPosY][spPosX] += seismicPulseVector[t]; for(i=2; i<(yDim-2); i++) { for(j=2 + ALIGNMENT_OFFSET; j<(xDim-2); j+=4) { s_ppf = _mm_load_ps( &(PPF[i][j]) ); s_vel = _mm_load_ps( &(VEL[i][j]) ); s_actual = _mm_load_ps( &(APF[i][j]) ); s_left1 = _mm_load_ps( &(APF[i-1][j]) ); s_left2 = _mm_load_ps( &(APF[i-2][j]) ); s_right2 = _mm_load_ps( &(APF[i+2][j]) ); s_right1 = _mm_load_ps( &(APF[i+1][j]) ); s_above1 = _mm_loadu_ps( &(APF[i][j-1]) ); s_under1 = _mm_loadu_ps( &(APF[i][j+1]) ); s_above2 = _mm_loadl_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(1, 0, 0, 0)), &(APF[i][j-2])); s_under2 = _mm_loadh_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(0, 0, 3, 2)), &(APF[i][j+4])); // sum elements with an offset of one s_under1 = _mm_add_ps( s_under1, _mm_add_ps( s_above1, _mm_add_ps( s_left1, s_right1))); // sum elements with an offset of two s_above2 = _mm_add_ps( s_left2, _mm_add_ps( s_right2, _mm_add_ps( s_under2, s_above2))); // multiply with 16 s_under1 = _mm_mul_ps( s_sixteen, s_under1 ); // s_under1 = _mm_sub_ps( _mm_sub_ps( s_under1, s_above2), _mm_mul_ps( s_sixty, s_actual ) ); s_under1 = _mm_add_ps( _mm_mul_ps( s_vel, s_under1), _mm_sub_ps(_mm_mul_ps( s_two, s_actual ), s_ppf) ); // save the result _mm_store_ps( &(NPF[i][j]), s_under1); #ifdef _VVERBOSE printf("[%d][%d]\n", i, j); #endif } #ifdef _VVERBOSE printf("\n"); #endif } #ifdef _VERBOSE // shows one # at each 10% of the total processing time if (t/(timeSteps/10) > progressTimer ) { printf("#"); progressTimer++; fflush(stdout); } #endif // switch pointers instead of copying data PPF = APF; APF = NPF; NPF = PPF; } #ifdef _VERBOSE printf("\nend process!\n"); #endif }
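The inner loop above builds the ±2 stencil neighbours without extra unaligned 4-wide loads: it shuffles the already-loaded centre vector and splices in a single 64-bit half-load with _mm_loadl_pi/_mm_loadh_pi. A distilled, free-standing sketch of that trick (function and variable names are ours; it assumes p[-2] and p[5] are valid reads, as they are inside the kernel's interior loop):

#include <xmmintrin.h>

// c holds p[0..3]; return p[-2..1] with one shuffle and one 64-bit half-load.
static inline __m128 shift_left2(__m128 c, const float* p)
{
    __m128 t = _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 0, 0, 0)); // lanes: p[0] p[0] p[0] p[1]
    return _mm_loadl_pi(t, (const __m64*)(p - 2));            // lanes: p[-2] p[-1] p[0] p[1]
}

// c holds p[0..3]; return p[2..5] with one shuffle and one 64-bit half-load.
static inline __m128 shift_right2(__m128 c, const float* p)
{
    __m128 t = _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 3, 2)); // lanes: p[2] p[3] p[0] p[0]
    return _mm_loadh_pi(t, (const __m64*)(p + 4));            // lanes: p[2] p[3] p[4] p[5]
}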