// Rank-1 update of the stored self-adjoint 6x6 matrix:
//     M += alpha * u * u^T
// Only the upper triangle is kept, packed into `data` as six groups of
// four floats (24 floats total), updated below in 4-wide SSE strips.
// `data` must be 16-byte aligned (aligned load/store); `u` may be
// unaligned (loadu).
void OptimizedSelfAdjointMatrix6x6f::rankUpdate(const Eigen::Matrix<float, 6, 1>& u, const float& alpha) {
  __m128 s = _mm_set1_ps(alpha);               // [a a a a]
  __m128 v1234 = _mm_loadu_ps(u.data());       // [u1 u2 u3 u4]
  // NOTE(review): this loads floats 5..8 of a 6-float vector, i.e. reads
  // 8 bytes past the end of `u` -- relies on that memory being mapped
  // (the two junk lanes are never used). Confirm Eigen's storage makes
  // this safe.
  __m128 v56xx = _mm_loadu_ps(u.data() + 4);   // [u5 u6 ? ?]
  __m128 v1212 = _mm_movelh_ps(v1234, v1234);  // [u1 u2 u1 u2]
  __m128 v3434 = _mm_movehl_ps(v1234, v1234);  // [u3 u4 u3 u4]
  __m128 v5656 = _mm_movelh_ps(v56xx, v56xx);  // [u5 u6 u5 u6]
  // a*[u1 u1 u2 u2]: row factors for rows 1 and 2 of the triangle.
  __m128 v1122 = _mm_mul_ps(s, _mm_unpacklo_ps(v1212, v1212));
  // Rows 1-2: += a*u{1,2} * u{1..6}
  _mm_store_ps(data + 0, _mm_add_ps(_mm_load_ps(data + 0), _mm_mul_ps(v1122, v1212)));
  _mm_store_ps(data + 4, _mm_add_ps(_mm_load_ps(data + 4), _mm_mul_ps(v1122, v3434)));
  _mm_store_ps(data + 8, _mm_add_ps(_mm_load_ps(data + 8), _mm_mul_ps(v1122, v5656)));
  // a*[u3 u3 u4 u4]: row factors for rows 3 and 4.
  __m128 v3344 = _mm_mul_ps(s, _mm_unpacklo_ps(v3434, v3434));
  // Rows 3-4: += a*u{3,4} * u{3..6}
  _mm_store_ps(data + 12, _mm_add_ps(_mm_load_ps(data + 12), _mm_mul_ps(v3344, v3434)));
  _mm_store_ps(data + 16, _mm_add_ps(_mm_load_ps(data + 16), _mm_mul_ps(v3344, v5656)));
  // a*[u5 u5 u6 u6]: row factors for rows 5 and 6.
  __m128 v5566 = _mm_mul_ps(s, _mm_unpacklo_ps(v5656, v5656));
  // Rows 5-6: += a*u{5,6} * u{5,6}
  _mm_store_ps(data + 20, _mm_add_ps(_mm_load_ps(data + 20), _mm_mul_ps(v5566, v5656)));
}
// Precompute, for two bump specifiers at once, the coefficients of the
// four cubic smoothstep polynomials (two ramp regions per bump).
// Lanes of every v4f below are ordered (b0-ramp0, b0-ramp1, b1-ramp0,
// b1-ramp1). Coefficients are stored in c[0..3] as powers of t
// (constant first); the raw S/T/U/V lane vectors are cached in
// S0/T0/U0/V0 for evaluation.
void bumps_t::initialize ( const bump_specifier_t & b0, const bump_specifier_t & b1)
{
  v4f b0t = load4f (& b0.t0);  // b0's four time knots starting at t0
  v4f b1t = load4f (& b1.t0);  // b1's four time knots starting at t0
  // Values padded with zeros in the high lanes: [v0 v1 0 0].
  v4f b0v = _mm_movelh_ps (load4f (& b0.v0), _mm_setzero_ps ());
  v4f b1v = _mm_movelh_ps (load4f (& b1.v0), _mm_setzero_ps ());
  v4f S = SHUFPS (b0t, b1t, (0, 2, 0, 2));  // ramp start times
  v4f T = SHUFPS (b0t, b1t, (1, 3, 1, 3));  // ramp end times
  v4f U = SHUFPS (b0v, b1v, (0, 2, 0, 2));  // start values: [b0.v0 0 b1.v0 0]
  v4f V1 = SHUFPS (b0v, b1v, (1, 0, 1, 0)); // [b0.v1 b0.v0 b1.v1 b1.v0]
  v4f V2 = SHUFPS (b0v, b1v, (2, 1, 2, 1)); // [0 b0.v1 0 b1.v1]
  v4f V = V1 - V2;                          // end values (signed ramp height offset)
  v4f d = T - S;                            // ramp durations
  v4f a = T + S;
  // Cubic smoothstep scale: (V-U) / (T-S)^3 per lane.
  v4f m = (V - U) / (d * d * d);
  // Coefficients of f(t) = c0 + c1*t + c2*t^2 + c3*t^3 matching
  // f(S)=U, f(T)=V with zero slope at both knots.
  store4f (c [0], U + m * S * S * (a + d + d));
  store4f (c [1], _mm_set1_ps (-6.0f) * m * S * T);
  store4f (c [2], _mm_set1_ps (+3.0f) * m * a);
  store4f (c [3], _mm_set1_ps (-2.0f) * m);
  // Cache the knot/value lane vectors for later evaluation.
  store4f (S0, S);
  store4f (T0, T);
  store4f (U0, U);
  store4f (V0, V);
}
/*
 * Multiply two 4x4 matrices: ret = mat1 * mat2.
 *
 * Strategy:
 *   1. transpose mat2 into a local buffer so each column of mat2
 *      becomes a contiguous row,
 *   2. form the 16 entries of the product as row-by-row dot products
 *      via mul_asm().
 *
 * Fixes over the previous version:
 *   - mat2 is no longer transposed IN PLACE, so the caller's matrix is
 *     left untouched (the old version clobbered its input, and its own
 *     comments noted alignment-dependent crashes from the in-place
 *     aligned stores).
 *   - caller data is read with unaligned loads; only local storage is
 *     written with unaligned stores, so no alignment is required.
 */
void mulMatrix1(Matrix4x4 ret, Matrix4x4 mat1, Matrix4x4 mat2)
{
    /* 1. transpose mat2 into t2 (classic unpack/move 4x4 transpose) */
    float t2[4][4];
    __m128 row0 = _mm_loadu_ps( mat2[0] );
    __m128 row1 = _mm_loadu_ps( mat2[1] );
    __m128 row2 = _mm_loadu_ps( mat2[2] );
    __m128 row3 = _mm_loadu_ps( mat2[3] );

    /* Interleave low/high pairs of adjacent rows. */
    __m128 tmp0 = _mm_unpacklo_ps( row0, row1 );
    __m128 tmp2 = _mm_unpacklo_ps( row2, row3 );
    __m128 tmp1 = _mm_unpackhi_ps( row0, row1 );
    __m128 tmp3 = _mm_unpackhi_ps( row2, row3 );

    /* Recombine 64-bit halves to finish the transpose. */
    _mm_storeu_ps( t2[0], _mm_movelh_ps( tmp0, tmp2 ) );
    _mm_storeu_ps( t2[1], _mm_movehl_ps( tmp2, tmp0 ) );
    _mm_storeu_ps( t2[2], _mm_movelh_ps( tmp1, tmp3 ) );
    _mm_storeu_ps( t2[3], _mm_movehl_ps( tmp3, tmp1 ) );

    /* 2. ret[i][j] = dot(row i of mat1, column j of mat2) */
    for (int i = 0; i < 4; ++i) {
        ret[i][0] = mul_asm(mat1[i], t2[0]);
        ret[i][1] = mul_asm(mat1[i], t2[1]);
        ret[i][2] = mul_asm(mat1[i], t2[2]);
        ret[i][3] = mul_asm(mat1[i], t2[3]);
    }
}
/// Transform this box using the specified transform matrix.
///
/// The eight corners are built in struct-of-arrays form (two groups of
/// four), transformed in parallel, then horizontally min/max-reduced
/// back into the new axis-aligned extents.
///
/// @param[in] rTransform  Matrix by which to transform.
void Helium::Simd::AaBox::TransformBy( const Matrix44& rTransform )
{
    // Expand each corner position.
    Register minVec = m_minimum.GetSimdVector();
    Register maxVec = m_maximum.GetSimdVector();

    // First four corners: x splatted from the minimum, y alternating
    // min/max, z alternating min/max (unpackhi interleaves the z lanes,
    // movelh duplicates the pair across the register).
    Vector3Soa corners0;
    corners0.m_x = _mm_shuffle_ps( minVec, minVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners0.m_y = _mm_shuffle_ps( minVec, maxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    corners0.m_z = _mm_unpackhi_ps( minVec, maxVec );
    corners0.m_z = _mm_movelh_ps( corners0.m_z, corners0.m_z );

    // Remaining four corners differ only in x (splatted maximum).
    Vector3Soa corners1;
    corners1.m_x = _mm_shuffle_ps( maxVec, maxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners1.m_y = corners0.m_y;
    corners1.m_z = corners0.m_z;

    // Transform all corners by the provided transformation matrix.
    Matrix44Soa transformSplat( rTransform );
    transformSplat.TransformPoint( corners0, corners0 );
    transformSplat.TransformPoint( corners1, corners1 );

    // Compute the minimum: reduce the 8 candidate values per axis down
    // to one via pairwise interleave + min, then pack x/y/z back into a
    // single register.
    Register minX = Simd::MinF32( corners0.m_x, corners1.m_x );
    Register minY = Simd::MinF32( corners0.m_y, corners1.m_y );
    Register minXYLo = _mm_unpacklo_ps( minX, minY );
    Register minXYHi = _mm_unpackhi_ps( minX, minY );
    Register minXY = Simd::MinF32( minXYLo, minXYHi );

    Register minZ = Simd::MinF32( corners0.m_z, corners1.m_z );
    Register minZLo = _mm_unpacklo_ps( minZ, minZ );
    Register minZHi = _mm_unpackhi_ps( minZ, minZ );
    minZ = Simd::MinF32( minZLo, minZHi );

    Register minLo = _mm_movelh_ps( minXY, minZ );
    Register minHi = _mm_movehl_ps( minZ, minXY );

    m_minimum.SetSimdVector( Simd::MinF32( minLo, minHi ) );

    // Compute the maximum (mirror of the reduction above).
    Register maxX = Simd::MaxF32( corners0.m_x, corners1.m_x );
    Register maxY = Simd::MaxF32( corners0.m_y, corners1.m_y );
    Register maxXYLo = _mm_unpacklo_ps( maxX, maxY );
    Register maxXYHi = _mm_unpackhi_ps( maxX, maxY );
    Register maxXY = Simd::MaxF32( maxXYLo, maxXYHi );

    Register maxZ = Simd::MaxF32( corners0.m_z, corners1.m_z );
    Register maxZLo = _mm_unpacklo_ps( maxZ, maxZ );
    Register maxZHi = _mm_unpackhi_ps( maxZ, maxZ );
    maxZ = Simd::MaxF32( maxZLo, maxZHi );

    Register maxLo = _mm_movelh_ps( maxXY, maxZ );
    Register maxHi = _mm_movehl_ps( maxZ, maxXY );

    m_maximum.SetSimdVector( Simd::MaxF32( maxLo, maxHi ) );
}
// Cubic lattice noise at point p, vectorized with SSE.
//
// Gathers a 4x4x4 block of lattice values around p, evaluates four-knot
// cubic splines along x (4 at a time), transposes the intermediate
// results, splines along y, and finally splines along z in scalar code.
// Result is clamped to [-1, 1].
float calcCubicNoiseValSSE(const vec3 p)
{
  int ix, iy, iz;
  __m128 fx, fy;
  float fz;
  // Integer cell coordinates and fractional offsets within the cell.
  ix = (int)floor(p[0]);
  fx = _mm_set_ps1(p[0] - ix);
  iy = (int)floor(p[1]);
  fy = _mm_set_ps1(p[1] - iy);
  iz = (int)floor(p[2]);
  fz = p[2] - iz;
  uSIMD k0, k1, k2, k3;
  __m128 out0, out1, out2, out3;
  // For each z slice, build four knot vectors (one per x offset) over
  // the four y offsets, then spline them along x in one SSE call.
  for(int k = -1; k <= 2; k++) {
    for(int j = -1; j <= 2; j++) {
      k0.a[j+1] = getLatticeVal(ix-1, iy + j, iz + k);
      k1.a[j+1] = getLatticeVal(ix+0, iy + j, iz + k);
      k2.a[j+1] = getLatticeVal(ix+1, iy + j, iz + k);
      k3.a[j+1] = getLatticeVal(ix+2, iy + j, iz + k);
    }
    switch(k) {
      case -1: out0 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m)); break;
      case 0: out1 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m)); break;
      case 1: out2 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m)); break;
      case 2: out3 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m)); break;
    }
  }
  // Transpose the matrix formed by the out vectors so each knot vector
  // holds one y offset across the four z slices.
  // NOTE(review): this uses movelh/movehl with swapped operand order
  // plus _MM_SHUFFLE index patterns that differ from the canonical
  // 4x4 transpose -- presumably matched to fourKnotSplineSSE's lane
  // order; verify before reusing elsewhere.
  __m128 t1 = _mm_movelh_ps(out1, out0);
  __m128 t2 = _mm_movehl_ps(out0, out1);
  __m128 t3 = _mm_movelh_ps(out3, out2);
  __m128 t4 = _mm_movehl_ps(out2, out3);
  k0.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(0, 2, 0, 2));
  k1.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 3, 1, 3));
  k2.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(0, 2, 0, 2));
  k3.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 3, 1, 3));
  // Spline along y (SSE), then along z (scalar), and clamp.
  uSIMD final_knots;
  final_knots.m = fourKnotSplineSSE(&fy, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
  return clamp(fourKnotSpline(fz, final_knots.a), -1.0f, 1.0f);
}
/// Returns the quaternion that rotates sourceDirection to targetDirection.
/// Both inputs are expected to be normalized direction vectors with w == 0
/// (asserted in the scalar path).
Quat MUST_USE_RESULT Quat::RotateFromTo(const float4 &sourceDirection, const float4 &targetDirection)
{
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 12.289 nsecs / 33.144 ticks, Avg: 12.489 nsecs, Worst: 14.210 nsecs
	simd4f cosAngle = dot4_ps(sourceDirection.v, targetDirection.v);
	// Flip the sign of the xyz lanes only, so one sqrt below produces
	// both half-angle trig values: [+cosx -cosx -cosx -cosx].
	cosAngle = negate3_ps(cosAngle); // [+ - - -]
	// XYZ channels use the trigonometric formula sin(x/2) = +/-sqrt(0.5-0.5*cosx))
	// The W channel uses the trigonometric formula cos(x/2) = +/-sqrt(0.5+0.5*cosx))
	simd4f half = set1_ps(0.5f);
	simd4f cosSinHalfAngle = sqrt_ps(add_ps(half, mul_ps(half, cosAngle))); // [cos(x/2), sin(x/2), sin(x/2), sin(x/2)]
	// Rotation axis = normalized cross product of the two directions.
	simd4f axis = cross_ps(sourceDirection.v, targetDirection.v);
	simd4f recipLen = rsqrt_ps(dot4_ps(axis, axis));
	axis = mul_ps(axis, recipLen); // [0 z y x]
	// Set the w component to one (1 = 0.5 + 0.5, avoids another constant load).
	simd4f one = add_ps(half, half); // [1 1 1 1]
	simd4f highPart = _mm_unpackhi_ps(axis, one); // [_ _ 1 z]
	axis = _mm_movelh_ps(axis, highPart); // [1 z y x]
	// q = [sin(x/2)*axis, cos(x/2)].
	Quat q;
	q.q = mul_ps(axis, cosSinHalfAngle);
	return q;
#else
	// Best: 19.970 nsecs / 53.632 ticks, Avg: 20.197 nsecs, Worst: 21.122 nsecs
	assume(EqualAbs(sourceDirection.w, 0.f));
	assume(EqualAbs(targetDirection.w, 0.f));
	return Quat::RotateFromTo(sourceDirection.xyz(), targetDirection.xyz());
#endif
}
/// Return a copy of this matrix with rows and columns exchanged.
matrix4 matrix4::transposed() const
{
#ifdef __SSE__
	// Copy the four row registers and let the standard SSE transpose
	// macro do the unpacklo/unpackhi + movelh/movehl shuffle dance.
	__m128 r0 = x.v;
	__m128 r1 = y.v;
	__m128 r2 = z.v;
	__m128 r3 = w.v;
	_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
	return matrix4(r0, r1, r2, r3);
#else
	// Scalar fallback: each result row is a source column.
	return matrix4(float4(x.x, y.x, z.x, w.x),
	               float4(x.y, y.y, z.y, w.y),
	               float4(x.z, y.z, z.z, w.z),
	               float4(x.w, y.w, z.w, w.w));
#endif
}
/// Compute the corners of this view frustum.
///
/// A view frustum can have either four or eight corners depending on whether a far clip plane exists (eight
/// corners) or whether an infinite far clip plane is used (four corners).
///
/// Note that this assumes that the frustum is always properly defined, with each possible combination of
/// neighboring clip planes intersecting at a valid point.
///
/// @param[out] pCorners  Array in which the frustum corners will be stored.  This must point to a region of memory
///                       large enough for four points if this frustum has an infinite far clip plane, or eight
///                       points if this frustum has a normal far clip plane.
///
/// @return  Number of corners computed (either four or eight).
size_t Helium::Simd::Frustum::ComputeCorners( Vector3* pCorners ) const
{
    HELIUM_ASSERT( pCorners );

    // Compute the corners in struct-of-arrays format.
    HELIUM_SIMD_ALIGN_PRE float32_t cornersX[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersY[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersZ[ 8 ] HELIUM_SIMD_ALIGN_POST;
    size_t cornerCount = ComputeCornersSoa( cornersX, cornersY, cornersZ );
    HELIUM_ASSERT( cornerCount == 4 || cornerCount == 8 );

    // Swizzle the results from SoA (x[4], y[4], z[4]) back into
    // per-corner xyz vectors: interleave x/y pairs, duplicate z lanes,
    // then combine 64-bit halves.
    Helium::Simd::Register cornerXVec = Helium::Simd::LoadAligned( cornersX );
    Helium::Simd::Register cornerYVec = Helium::Simd::LoadAligned( cornersY );
    Helium::Simd::Register cornerZVec = Helium::Simd::LoadAligned( cornersZ );

    Helium::Simd::Register xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec );
    Helium::Simd::Register xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec );
    Helium::Simd::Register zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec );
    Helium::Simd::Register zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec );

    pCorners[ 0 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) );
    pCorners[ 1 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) );
    pCorners[ 2 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) );
    pCorners[ 3 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) );

    // Second set of four corners only exists with a finite far plane.
    if( cornerCount == 8 )
    {
        cornerXVec = Helium::Simd::LoadAligned( cornersX + 4 );
        cornerYVec = Helium::Simd::LoadAligned( cornersY + 4 );
        cornerZVec = Helium::Simd::LoadAligned( cornersZ + 4 );

        xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec );
        xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec );
        zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec );
        zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec );

        pCorners[ 4 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) );
        pCorners[ 5 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) );
        pCorners[ 6 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) );
        pCorners[ 7 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) );
    }

    return cornerCount;
}
/*
 * Vectorized per-element evaluation over groups of three elements.
 *
 * Intended math (see the scalar reference below) is the p-norm
 *     re = pow(pow(x,a) + pow(y,a) + pow(z,a), 1.0/a).
 *
 * NOTE(review): the SSE code evaluates exp(v * log(a)), which equals
 * a^v, NOT v^a -- the correct identity is v^a = exp(a * log(v)) and
 * would need log_ps(v).  That math is preserved unchanged here; the
 * only behavioral fix is the build-breaking `sum1` typo (the
 * accumulator is named s1).  Confirm the intended formula against the
 * callers before "fixing" the math.
 *
 * Requirements visible from the code: num_elts is a multiple of 3, and
 * each element_t starts with x,y,z followed by one more float so that
 * _mm_load_ps(&elts[i].x) reads 4 in-bounds, 16-byte-aligned floats.
 */
void fast(element_t * const elements, const int num_elts, const float a)
{
  element_t * elts = elements;
  float logf_a = logf(a);
  float logf_1_a = logf(1.0/a);
  v4sf log_a = _mm_load1_ps(&logf_a);
  v4sf log_1_a = _mm_load1_ps(&logf_1_a);

  assert(num_elts % 3 == 0); // operates on 3 elements at a time

  // elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
  for (int i = 0; i < num_elts; i += 3) {
    // Transpose three xyz rows into x/y/z lane vectors.  We save one
    // operation over _MM_TRANSPOSE4_PS by skipping the last output row.
    v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,0
    v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,0
    v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,0
    v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
    v4sf t0 = _mm_unpacklo_ps(r0, r1); // x1,x2,y1,y2
    v4sf t1 = _mm_unpacklo_ps(r2, r3); // x3,0, y3,0
    v4sf t2 = _mm_unpackhi_ps(r0, r1); // z1,z2,0, 0
    v4sf t3 = _mm_unpackhi_ps(r2, r3); // z3,0, 0, 0
    r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
    r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
    r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0

    // exp(v * log(a)) for each of the three lane vectors.
    v4sf r0a = _mm_mul_ps(r0, log_a);
    v4sf r1a = _mm_mul_ps(r1, log_a);
    v4sf r2a = _mm_mul_ps(r2, log_a);
    v4sf ex0 = exp_ps(r0a);
    v4sf ex1 = exp_ps(r1a);
    v4sf ex2 = exp_ps(r2a);

    // Sum the three transformed components per element.
    // (Fixed: this previously referenced an undeclared `sum1`.)
    v4sf s1 = _mm_add_ps(ex0, ex1);
    v4sf s2 = _mm_add_ps(s1, ex2);

    // exp(sum * log(1/a)) -- same caveat as above.
    v4sf ps = _mm_mul_ps(s2, log_1_a);
    v4sf es = exp_ps(ps);

    ALIGN16_BEG float re[4] ALIGN16_END;
    _mm_store_ps(re, es);
    elts[0].re = re[0];
    elts[1].re = re[1];
    elts[2].re = re[2];
    elts += 3;
  }
}
/* Write into dst the transpose of m's 3x3 basis, with the translation
 * row rebuilt as -(m->t transformed by m).  For an orthonormal basis
 * this yields the inverse transform -- presumably the intended use;
 * confirm against callers.  dst must not alias m for the basis rows
 * (dst->t is written before m's rows are read, so m->t must also not
 * alias -- NOTE(review): verify aliasing expectations). */
void matrix3_transpose(struct matrix3 *dst, const struct matrix3 *m)
{
	__m128 tmp1, tmp2;

	/* Translation: dst->t = -(m->t * m). */
	vec3_transform(&dst->t, &m->t, m);
	vec3_neg(&dst->t, &dst->t);

	/* 3x3 transpose via two 64-bit merges plus shuffles with the z row;
	 * the 4th lane (index 3) is taken from m->z.m's last lane. */
	tmp1 = _mm_movelh_ps(m->x.m, m->y.m);
	tmp2 = _mm_movehl_ps(m->y.m, m->x.m);
	dst->x.m = _mm_shuffle_ps(tmp1, m->z.m, _MM_SHUFFLE(3, 0, 2, 0));
	dst->y.m = _mm_shuffle_ps(tmp1, m->z.m, _MM_SHUFFLE(3, 1, 3, 1));
	dst->z.m = _mm_shuffle_ps(tmp2, m->z.m, _MM_SHUFFLE(3, 2, 2, 0));
}
/// Return this bounding box transformed by a 3x4 affine matrix.
/// Uses the center/half-extent formulation: the new center is the
/// transformed old center, and the new half-extent is the old
/// half-extent multiplied by the element-wise absolute value of the
/// rotation/scale part.
BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    // Build homogeneous points [x y z 1] from the packed Vector3 members
    // (loadl_pi reads x,y; unpacklo merges z with the literal 1).
    const __m128 one = _mm_set_ss(1.f);
    __m128 minPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), one));
    __m128 maxPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), one));
    __m128 centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
    __m128 halfSize = _mm_sub_ps(centerPoint, minPt);

    // Transform the center: per-row multiplies followed by a horizontal
    // add performed with unpack/movelh-movehl (no SSE3 hadd needed).
    __m128 m0 = _mm_loadu_ps(&transform.m00_);
    __m128 m1 = _mm_loadu_ps(&transform.m10_);
    __m128 m2 = _mm_loadu_ps(&transform.m20_);
    __m128 r0 = _mm_mul_ps(m0, centerPoint);
    __m128 r1 = _mm_mul_ps(m1, centerPoint);
    __m128 t0 = _mm_add_ps(_mm_unpacklo_ps(r0, r1), _mm_unpackhi_ps(r0, r1));
    __m128 r2 = _mm_mul_ps(m2, centerPoint);
    const __m128 zero = _mm_setzero_ps();
    __m128 t2 = _mm_add_ps(_mm_unpacklo_ps(r2, zero), _mm_unpackhi_ps(r2, zero));
    __m128 newCenter = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));

    // New half-extent: |M| * halfSize, same horizontal-add pattern.
    const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 x = _mm_and_ps(absMask, _mm_mul_ps(m0, halfSize));
    __m128 y = _mm_and_ps(absMask, _mm_mul_ps(m1, halfSize));
    __m128 z = _mm_and_ps(absMask, _mm_mul_ps(m2, halfSize));
    t0 = _mm_add_ps(_mm_unpacklo_ps(x, y), _mm_unpackhi_ps(x, y));
    t2 = _mm_add_ps(_mm_unpacklo_ps(z, zero), _mm_unpackhi_ps(z, zero));
    __m128 newDir = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
    return BoundingBox(_mm_sub_ps(newCenter, newDir), _mm_add_ps(newCenter, newDir));
#else
    // Scalar fallback: identical center/half-extent math.
    Vector3 newCenter = transform * Center();
    Vector3 oldEdge = Size() * 0.5f;
    Vector3 newEdge = Vector3(
        Abs(transform.m00_) * oldEdge.x_ + Abs(transform.m01_) * oldEdge.y_ + Abs(transform.m02_) * oldEdge.z_,
        Abs(transform.m10_) * oldEdge.x_ + Abs(transform.m11_) * oldEdge.y_ + Abs(transform.m12_) * oldEdge.z_,
        Abs(transform.m20_) * oldEdge.x_ + Abs(transform.m21_) * oldEdge.y_ + Abs(transform.m22_) * oldEdge.z_
    );

    return BoundingBox(newCenter - newEdge, newCenter + newEdge);
#endif
}
/** transform vector by rigid transform */ inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec) { #ifdef SIMPLE_GL_USE_SSE4 __m128 res; __m128 dotProd; res = _mm_dp_ps(mat[0].m128, vec.m128, 0xEE);\ dotProd = _mm_dp_ps(mat[1].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );\ dotProd = _mm_dp_ps(mat[2].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );\ dotProd = _mm_dp_ps(mat[3].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 0, 1) ); return Matrix<float, 4, 1>(res); #elif defined(SIMPLE_GL_USE_SSE3) __m128 res; __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128); dotProd0 = _mm_hadd_ps(dotProd0, dotProd0); dotProd0 = _mm_hadd_ps(dotProd0, dotProd0); __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128); dotProd1 = _mm_hadd_ps(dotProd1, dotProd1); dotProd1 = _mm_hadd_ps(dotProd1, dotProd1); __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128); dotProd2 = _mm_hadd_ps(dotProd2, dotProd2); dotProd2 = _mm_hadd_ps(dotProd2, dotProd2); __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128); dotProd3 = _mm_hadd_ps(dotProd3, dotProd3); dotProd3 = _mm_hadd_ps(dotProd3, dotProd3); __m128 vec01 = _mm_unpacklo_ps(dotProd0, dotProd1); __m128 vec23 = _mm_unpackhi_ps(dotProd2, dotProd3); res = _mm_movelh_ps(vec01, vec23); return Matrix<float, 4, 1>(res); #else // SSE2 // TODO: Think about good sse optimization Matrix<float, 4, 1> res; res[0] = mat[0][0] * res[0] + mat[0][1] * res[1] + mat[0][2] * res[2] + mat[0][3] * res[3]; res[1] = mat[1][0] * res[0] + mat[1][1] * res[1] + mat[1][2] * res[2] + mat[1][3] * res[3]; res[2] = mat[2][0] * res[0] + mat[2][1] * res[1] + mat[2][2] * res[2] + mat[2][3] * res[3]; res[3] = mat[3][0] * res[0] + mat[3][1] * res[1] + mat[3][2] * res[2] + mat[3][3] * res[3]; return res; #endif }
// Horizontal add of four vectors: returns
// { sum(row[0]), sum(row[1]), sum(row[2]), sum(row[3]) }.
inline vector4f haddp(const vector4f* row)
{
#if SSE_INSTR_SET >= 3  // SSE3
    // Two hadd levels fold each input down to its lane sum.
    return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                       _mm_hadd_ps(row[2], row[3]));
#else
    // SSE2 fallback: interleave each pair of rows, add the interleaved
    // halves, then merge 64-bit halves and add once more.
    __m128 lo01  = _mm_unpacklo_ps(row[0], row[1]);
    __m128 hi01  = _mm_unpackhi_ps(row[0], row[1]);
    __m128 sum01 = _mm_add_ps(lo01, hi01);   // partial sums of rows 0,1
    __m128 lo23  = _mm_unpacklo_ps(row[2], row[3]);
    __m128 hi23  = _mm_unpackhi_ps(row[2], row[3]);
    __m128 sum23 = _mm_add_ps(lo23, hi23);   // partial sums of rows 2,3
    return _mm_add_ps(_mm_movelh_ps(sum01, sum23),
                      _mm_movehl_ps(sum23, sum01));
#endif
}
inline __m128 CalcWeights(float x, float y) { __m128 ssx = _mm_set_ss(x); __m128 ssy = _mm_set_ss(y); __m128 psXY = _mm_unpacklo_ps(ssx, ssy); // 0 0 y x //__m128 psXYfloor = _mm_floor_ps(psXY); // use this line for if you have SSE4 __m128 psXYfloor = _mm_cvtepi32_ps(_mm_cvtps_epi32(psXY)); __m128 psXYfrac = _mm_sub_ps(psXY, psXYfloor); // = frac(psXY) __m128 psXYfrac1 = _mm_sub_ps(CONST_1111, psXYfrac); // ? ? (1-y) (1-x) __m128 w_x = _mm_unpacklo_ps(psXYfrac1, psXYfrac); // ? ? x (1-x) w_x = _mm_movelh_ps(w_x, w_x); // x (1-x) x (1-x) __m128 w_y = _mm_shuffle_ps(psXYfrac1, psXYfrac, _MM_SHUFFLE(1, 1, 1, 1)); // y y (1-y) (1-y) // complete weight vector return _mm_mul_ps(w_x, w_y); }
/// Extract the rotation axis (w set to 0) and angle (radians) from this
/// quaternion.  Requires a normalized quaternion.
void Quat::ToAxisAngle(float4 &axis, float &angle) const
{
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 35.332 nsecs / 94.328 ticks, Avg: 35.870 nsecs, Worst: 57.607 nsecs
	assume2(this->IsNormalized(), *this, this->Length());
	// w == cos(angle/2); recover 1/sin(angle/2) via sin^2 + cos^2 = 1.
	simd4f cosAngle = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
	simd4f rcpSinAngle = rsqrt_ps(sub_ps(set1_ps(1.f), mul_ps(cosAngle, cosAngle)));
	angle = Acos(s4f_x(cosAngle)) * 2.f;
	// Normalize the xyz part into the axis.
	simd4f a = mul_ps(q, rcpSinAngle);

	// Set the w component to zero.
	simd4f highPart = _mm_unpackhi_ps(a, zero_ps()); // [_ _ 0 z]
	axis.v = _mm_movelh_ps(a, highPart); // [0 z y x]
#else
	// Best: 85.258 nsecs / 227.656 ticks, Avg: 85.492 nsecs, Worst: 86.410 nsecs
	ToAxisAngle(reinterpret_cast<float3&>(axis), angle);
	axis.w = 0.f;
#endif
}
/// Set this quaternion from a normalized rotation axis (w must be 0)
/// and an angle in radians.
void Quat::SetFromAxisAngle(const float4 &axis, float angle)
{
	assume1(EqualAbs(axis.w, 0.f), axis);
	assume2(axis.IsNormalized(1e-4f), axis, axis.Length4());
	assume1(MATH_NS::IsFinite(angle), angle);

#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE2)
	// Best: 26.499 nsecs / 71.024 ticks, Avg: 26.856 nsecs, Worst: 27.651 nsecs
	// q = [sin(angle/2)*axis, cos(angle/2)].
	simd4f halfAngle = set1_ps(0.5f*angle);
	simd4f sinAngle, cosAngle;
	sincos_ps(halfAngle, &sinAngle, &cosAngle);
	simd4f quat = mul_ps(axis, sinAngle);

	// Set the w component to cosAngle (axis.w == 0, so the sin product
	// left garbage-free zero there; splice cos into the high lane).
	simd4f highPart = _mm_unpackhi_ps(quat, cosAngle); // [_ _ 1 z]
	q = _mm_movelh_ps(quat, highPart); // [1 z y x]
#else
	// Best: 36.868 nsecs / 98.312 ticks, Avg: 36.980 nsecs, Worst: 41.477 nsecs
	SetFromAxisAngle(axis.xyz(), angle);
#endif
}
v4f step_t::operator () (float t) const
{
  // Evaluate the polynomial f by Estrin's method.  Return
  //   (0 0 0 0) if t < t0,
  //   (f f f f) if t0 <= t < t1,
  //   (1 1 1 1) if t >= t1 (T holds [t0 t1 t1 inf], so the upper
  //                         interval is [t1, inf)).
  v4f c4 = load4f (c);                       // [c0 c1 c2 c3]
  v4f one = { 1.0f, 1.0f, 1.0f, 1.0f };
  v4f tttt = _mm_set1_ps (t);                // t t t t
  v4f tt = _mm_unpacklo_ps (one, tttt);      // 1 t 1 t
  v4f f0 = c4 * tt;                          // c0 c1*t c2 c3*t
  // hadd pairs -> [c0+c1*t, c2+c3*t, ...]; the *tt*tt factor scales the
  // second pair by t^2, so the next hadd yields
  // f = (c0 + c1*t) + (c2 + c3*t)*t^2 broadcast to all lanes.
  v4f ha = _mm_hadd_ps (f0, f0) * tt * tt;
  v4f f = _mm_hadd_ps (ha, ha);              // f f f f
  v4f f1 = _mm_unpacklo_ps (f, one);         // f 1 f 1
  // Interval select: lane pairs test t in [t0,t1) and [t1,inf).
  v4f tx = load4f (T);                       // t0 t1 t1 inf
  v4f lo = _mm_movelh_ps (tx, tx);           // t0 t1 t0 t1
  v4f hi = _mm_movehl_ps (tx, tx);           // t1 inf t1 inf
  v4f sel = _mm_and_ps (_mm_cmpge_ps (tttt, lo), _mm_cmplt_ps (tttt, hi));
  v4f val = _mm_and_ps (sel, f1);            // f? 1? f? 1?
  // At most one of each lane pair is nonzero, so the pairwise sum
  // broadcasts the selected value (or 0 below t0).
  return _mm_hadd_ps (val, val);
}
/*
 * FIR-filter one output frame of interleaved audio (`channels` channels)
 * with coefficient interpolation: the effective filter is
 *     coef1 + (coef2 - coef1) * frac
 * evaluated 4 taps at a time with SSE.
 *
 * Requirements visible from the code: filter_length is a multiple of 4,
 * coef1/coef2 are 16-byte aligned, input holds filter_length frames of
 * interleaved samples, output holds one frame.
 *
 * Fixed: removed the leftover Q_UNUSED(channels) -- `channels` IS used.
 *
 * NOTE(review): coef1/coef2 keep advancing across the channel loop, so
 * channel c reads coefficients [c*filter_length, (c+1)*filter_length).
 * That implies a per-channel coefficient layout; confirm with the
 * callers (if coefficients are shared, the pointers must be reset each
 * channel).
 */
void FastResampler_FirFilter2_Cn_SSE2(unsigned int channels, unsigned int filter_length,
                                      float* coef1, float* coef2, float frac,
                                      float* input, float* output) {
	for(unsigned int c = 0; c < channels; ++c) {
		__m128 sum = _mm_setzero_ps();
		__m128 v_frac = _mm_set1_ps(frac);
		float *input2 = input + c; // samples of channel c, stride = channels
		for(unsigned int i = 0; i < filter_length / 4; ++i) {
			// Interpolate four coefficients between the two tables.
			__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
			coef1 += 4; coef2 += 4;
			__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
			// Gather four strided input samples into one register.
			__m128 v_input1 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input2 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input3 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input4 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input = _mm_movelh_ps(_mm_unpacklo_ps(v_input1, v_input2), _mm_unpacklo_ps(v_input3, v_input4));
			sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
		}
		// Horizontal add of the four partial sums, store scalar result.
		__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
		__m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
		_mm_store_ss(output + c, sum3);
	}
}
/// Test whether this frustum intersects a given axis-aligned bounding box in world space. /// /// @param[in] rBox Box to test. /// /// @return True if the box intersects this frustum, false if not. bool Helium::Simd::Frustum::Intersects( const AaBox& rBox ) const { Helium::Simd::Register boxMinVec = rBox.GetMinimum().GetSimdVector(); Helium::Simd::Register boxMaxVec = rBox.GetMaximum().GetSimdVector(); Helium::Simd::Register boxX0 = _mm_shuffle_ps( boxMinVec, boxMinVec, _MM_SHUFFLE( 0, 0, 0, 0 ) ); Helium::Simd::Register boxX1 = _mm_shuffle_ps( boxMaxVec, boxMaxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) ); Helium::Simd::Register boxY = _mm_shuffle_ps( boxMinVec, boxMaxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) ); Helium::Simd::Register boxZ = _mm_unpackhi_ps( boxMinVec, boxMaxVec ); boxZ = _mm_movelh_ps( boxZ, boxZ ); PlaneSoa plane; Vector3Soa points( boxX0, boxY, boxZ ); Helium::Simd::Register zeroVec = Helium::Simd::LoadZeros(); size_t planeCount = ( m_bInfiniteFarClip ? PLANE_FAR : PLANE_MAX ); for( size_t planeIndex = 0; planeIndex < planeCount; ++planeIndex ) { plane.Load1Splat( m_planeA + planeIndex, m_planeB + planeIndex, m_planeC + planeIndex, m_planeD + planeIndex ); points.m_x = boxX0; Helium::Simd::Mask containsPoints0 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec ); points.m_x = boxX1; Helium::Simd::Mask containsPoints1 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec ); int resultMask = _mm_movemask_ps( Helium::Simd::Or( containsPoints0, containsPoints1 ) ); if( resultMask == 0 ) { return false; } } return true; }
/// Return the normalized rotation axis of this quaternion.
/// Requires a normalized quaternion.
vec Quat::Axis() const
{
	assume2(this->IsNormalized(), *this, this->Length());

#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 6.145 nsecs / 16.88 ticks, Avg: 6.367 nsecs, Worst: 6.529 nsecs
	assume2(this->IsNormalized(), *this, this->Length());

	// w == cos(angle/2); 1/sin(angle/2) recovered via sin^2 + cos^2 = 1.
	simd4f cosAngle = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
	simd4f rcpSinAngle = rsqrt_ps(sub_ps(set1_ps(1.f), mul_ps(cosAngle, cosAngle)));

	simd4f a = mul_ps(q, rcpSinAngle);

	// Set the w component to zero.
	simd4f highPart = _mm_unpackhi_ps(a, zero_ps()); // [_ _ 0 z]
	a = _mm_movelh_ps(a, highPart); // [0 z y x]
	return FLOAT4_TO_DIR(a);
#else
	// Best: 6.529 nsecs / 18.152 ticks, Avg: 6.851 nsecs, Worst: 8.065 nsecs

	// Convert cos to sin via the identity sin^2 + cos^2 = 1, and fuse reciprocal and square root to the same instruction,
	// since we are about to divide by it.
	float rcpSinAngle = RSqrt(1.f - w*w);
	return DIR_VEC(x, y, z) * rcpSinAngle;
#endif
}
/* Project a 4-spinor onto its gamma_2-plus half-spinor:
 * gathers the upper (spins 0,1) and lower (spins 2,3) components for
 * all three colors, applies the component swap + sign pattern for this
 * projector (signs14), and stores dst = upper + signed(swapped lower).
 * Layout assumption from the indexing: src[spin][color][reim], two
 * floats (re,im) per entry, 16-byte aligned. */
void decomp_gamma2_plus( spinor_array src, halfspinor_array dst)
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;

  /* Gather: each load pulls two adjacent (color,spin) complex pairs. */
  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  /* NOTE(review): xor-with-self of an uninitialized __m128 is the
   * intended "zero the register" idiom, but reading an uninitialized
   * variable is technically undefined behavior in C -- _mm_setzero_ps()
   * would be the clean equivalent. */
  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  /* Rearrange the six loads into three registers of upper components
   * (xmm0-2) and three of lower components (xmm3-5). */
  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);

  /* 0xe4 = identity shuffle of the destination's low half combined
   * with the source's high half. */
  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);

  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);

  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);
#endif

  /* Swap the lower components (0xb1 swaps re/im within each pair). */
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0xb1);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0xb1);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0xb1);

  /* Apply the projector's sign pattern. */
  xmm9 = _mm_xor_ps(xmm6, signs14.vector);
  xmm10 = _mm_xor_ps(xmm7, signs14.vector);
  xmm11 = _mm_xor_ps(xmm8, signs14.vector);

  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
}
/* Project a 4-spinor onto its gamma_0-minus half-spinor.  Same gather
 * and upper/lower split as decomp_gamma2_plus, but the lower components
 * are fully reversed within each register (shuffle 0x1b) and signed
 * with signs24 -- per the original comment, a multiply by -i.
 * Layout assumption from the indexing: src[spin][color][reim], two
 * floats (re,im) per entry, 16-byte aligned. */
void decomp_gamma0_minus( spinor_array src, halfspinor_array dst)
{
  /* c <-> color, s <-> spin */

  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;

  /* Gather: each load pulls two adjacent (color,spin) complex pairs. */
  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  /* NOTE(review): xor-with-self of an uninitialized __m128 is the
   * intended zeroing idiom but technically reads an uninitialized
   * value; _mm_setzero_ps() would be the clean equivalent. */
  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  /* Rearrange the six loads into upper (xmm0-2) and lower (xmm3-5)
   * component registers; 0xe4 keeps the low half and takes the source's
   * high half. */
  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);

  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

  /* Swap the lower components and multiply by -i
   * (0x1b reverses all four lanes; signs24 flips the needed signs). */
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0x1b);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0x1b);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0x1b);

  xmm9 = _mm_xor_ps(xmm6, signs24.vector);
  xmm10 = _mm_xor_ps(xmm7, signs24.vector);
  xmm11 = _mm_xor_ps(xmm8, signs24.vector);

  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
}
template<class Dummy> struct call<tag::sort_(tag::simd_<tag::type32_, tag::sse_> ), tag::cpu_, Dummy> : callable { template<class Sig> struct result; template<class This,class A0> struct result<This(A0)> : meta::strip<A0>{};// NT2_FUNCTOR_CALL(1) { typedef typename meta::as_real<A0>::type flt; A0 a = {a0}; A0 b = {NT2_CAST(A0, _mm_movehl_ps(NT2_CAST(flt, a0), NT2_CAST(flt, a0)))}; comp(a, b); a = NT2_CAST(A0, _mm_movelh_ps(NT2_CAST(flt, a), NT2_CAST(flt, b))); b = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, a), NT2_CAST(flt, b), NT2_SH(1, 3, 1, 3))); comp(a, b); A0 c = {NT2_CAST(A0, _mm_movelh_ps(NT2_CAST(flt, b), NT2_CAST(flt, b)))}; A0 d = {a}; comp(c, d); a = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, c), NT2_CAST(flt, a), NT2_SH(3, 2, 0, 0))); b = NT2_CAST(A0, _mm_movehl_ps(NT2_CAST(flt, b), NT2_CAST(flt, d))); b = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, a), NT2_CAST(flt, b), NT2_SH(3, 1, 0, 2))); return b; } private : template < class T > static inline void comp(T & a,T & b) { T c = nt2::min(a, b); b = nt2::max(a, b);
int main() { #ifndef __EMSCRIPTEN__ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); #endif printf ("{ \"workload\": %u, \"results\": [\n", N); assert(N%4 == 0); // Don't care about the tail for now. float *src = get_src();//(float*)aligned_alloc(16, N*sizeof(float)); for(int i = 0; i < N; ++i) src[i] = (float)rand() / RAND_MAX; float *src2 = get_src2();//(float*)aligned_alloc(16, N*sizeof(float)); for(int i = 0; i < N; ++i) src2[i] = (float)rand() / RAND_MAX; float *dst = get_dst();//(float*)aligned_alloc(16, N*sizeof(float)); float scalarTime; SETCHART("load"); START(); for(int i = 0; i < N; ++i) dst[i] = src[i]; ENDSCALAR(checksum_dst(dst), "scalar"); LS_TEST("_mm_load_ps", _mm_load_ps, 0, _mm_store_ps, 0); LS_TEST("_mm_load_ps1", _mm_load_ps1, 1, _mm_store_ps, 0); LS_TEST("_mm_load_ss", _mm_load_ss, 1, _mm_store_ps, 0); LS_TEST("_mm_load1_ps", _mm_load1_ps, 1, _mm_store_ps, 0); // _mm_loadh_pi // _mm_loadl_pi LS_TEST("_mm_loadr_ps", _mm_loadr_ps, 0, _mm_store_ps, 0); LS_TEST("_mm_loadu_ps", _mm_loadu_ps, 1, _mm_store_ps, 0); SETCHART("set"); SS_TEST("_mm_set_ps", _mm_set_ps(src[i+2], src[i+1], src[i+5], src[i+0])); SS_TEST("_mm_set_ps1", _mm_set_ps1(src[i])); SS_TEST("_mm_set_ss", _mm_set_ss(src[i])); SS_TEST("_mm_set1_ps", _mm_set1_ps(src[i])); SS_TEST("_mm_setr_ps", _mm_set_ps(src[i+2], src[i+1], src[i+5], src[i+0])); SS_TEST("_mm_setzero_ps", _mm_setzero_ps()); SETCHART("move"); SS_TEST("_mm_move_ss", _mm_move_ss(_mm_load_ps(src+i), _mm_load_ps(src2+i))); SS_TEST("_mm_movehl_ps", _mm_movehl_ps(_mm_load_ps(src+i), _mm_load_ps(src2+i))); SS_TEST("_mm_movelh_ps", _mm_movelh_ps(_mm_load_ps(src+i), _mm_load_ps(src2+i))); SETCHART("store"); LS_TEST("_mm_store_ps", _mm_load_ps, 0, _mm_store_ps, 0); LS_TEST("_mm_store_ps1", _mm_load_ps, 0, _mm_store_ps1, 0); LS_TEST("_mm_store_ss", _mm_load_ps, 0, _mm_store_ss, 1); LS64_TEST("_mm_storeh_pi", _mm_load_ps, 0, _mm_storeh_pi, 1); LS64_TEST("_mm_storel_pi", _mm_load_ps, 0, _mm_storel_pi, 1); LS_TEST("_mm_storer_ps", 
_mm_load_ps, 0, _mm_storer_ps, 0); LS_TEST("_mm_storeu_ps", _mm_load_ps, 0, _mm_storeu_ps, 1); LS_TEST("_mm_stream_ps", _mm_load_ps, 0, _mm_stream_ps, 0); SETCHART("arithmetic"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] += src2[0]; dst[1] += src2[1]; dst[2] += src2[2]; dst[3] += src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar add"); BINARYOP_TEST("_mm_add_ps", _mm_add_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_add_ss", _mm_add_ss, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] /= src2[0]; dst[1] /= src2[1]; dst[2] /= src2[2]; dst[3] /= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar div"); BINARYOP_TEST("_mm_div_ps", _mm_div_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_div_ss", _mm_div_ss, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] *= src2[0]; dst[1] *= src2[1]; dst[2] *= src2[2]; dst[3] *= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar mul"); BINARYOP_TEST("_mm_mul_ps", _mm_mul_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_mul_ss", _mm_mul_ss, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] -= src2[0]; dst[1] -= src2[1]; dst[2] -= src2[2]; dst[3] -= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar sub"); BINARYOP_TEST("_mm_sub_ps", _mm_sub_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_sub_ss", _mm_sub_ss, _mm_load_ps(src), _mm_load_ps(src2)); SETCHART("roots"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = 1.f / dst[0]; dst[1] = 1.f / dst[1]; dst[2] = 1.f / dst[2]; dst[3] = 1.f / dst[3]; } ENDSCALAR(checksum_dst(dst), "scalar rcp"); 
UNARYOP_TEST("_mm_rcp_ps", _mm_rcp_ps, _mm_load_ps(src)); UNARYOP_TEST("_mm_rcp_ss", _mm_rcp_ss, _mm_load_ps(src)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = 1.f / sqrtf(dst[0]); dst[1] = 1.f / sqrtf(dst[1]); dst[2] = 1.f / sqrtf(dst[2]); dst[3] = 1.f / sqrtf(dst[3]); } ENDSCALAR(checksum_dst(dst), "scalar rsqrt"); UNARYOP_TEST("_mm_rsqrt_ps", _mm_rsqrt_ps, _mm_load_ps(src)); UNARYOP_TEST("_mm_rsqrt_ss", _mm_rsqrt_ss, _mm_load_ps(src)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = sqrtf(dst[0]); dst[1] = sqrtf(dst[1]); dst[2] = sqrtf(dst[2]); dst[3] = sqrtf(dst[3]); } ENDSCALAR(checksum_dst(dst), "scalar sqrt"); UNARYOP_TEST("_mm_sqrt_ps", _mm_sqrt_ps, _mm_load_ps(src)); UNARYOP_TEST("_mm_sqrt_ss", _mm_sqrt_ss, _mm_load_ps(src)); SETCHART("logical"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastf(fcastu(dst[0]) & fcastu(src2[0])); dst[1] = ucastf(fcastu(dst[1]) & fcastu(src2[1])); dst[2] = ucastf(fcastu(dst[2]) & fcastu(src2[2])); dst[3] = ucastf(fcastu(dst[3]) & fcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar and"); BINARYOP_TEST("_mm_and_ps", _mm_and_ps, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastf((~fcastu(dst[0])) & fcastu(src2[0])); dst[1] = ucastf((~fcastu(dst[1])) & fcastu(src2[1])); dst[2] = ucastf((~fcastu(dst[2])) & fcastu(src2[2])); dst[3] = ucastf((~fcastu(dst[3])) & fcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar andnot"); BINARYOP_TEST("_mm_andnot_ps", _mm_andnot_ps, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastf(fcastu(dst[0]) | fcastu(src2[0])); dst[1] = ucastf(fcastu(dst[1]) | 
fcastu(src2[1])); dst[2] = ucastf(fcastu(dst[2]) | fcastu(src2[2])); dst[3] = ucastf(fcastu(dst[3]) | fcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar or"); BINARYOP_TEST("_mm_or_ps", _mm_or_ps, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastf(fcastu(dst[0]) ^ fcastu(src2[0])); dst[1] = ucastf(fcastu(dst[1]) ^ fcastu(src2[1])); dst[2] = ucastf(fcastu(dst[2]) ^ fcastu(src2[2])); dst[3] = ucastf(fcastu(dst[3]) ^ fcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar xor"); BINARYOP_TEST("_mm_xor_ps", _mm_xor_ps, _mm_load_ps(src), _mm_load_ps(src2)); SETCHART("cmp"); #ifndef __EMSCRIPTEN__ // TODO: Disabled due to https://github.com/kripken/emscripten/issues/2841 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] == src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] == src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] == src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] == src2[3]) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp=="); BINARYOP_TEST("_mm_cmpeq_ps", _mm_cmpeq_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_cmpeq_ss", _mm_cmpeq_ss, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] >= src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] >= src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] >= src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] >= src2[3]) ? 
ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>="); BINARYOP_TEST("_mm_cmpge_ps", _mm_cmpge_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_cmpge_ss", _mm_cmpge_ss, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] > src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] > src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] > src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] > src2[3]) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>"); BINARYOP_TEST("_mm_cmpgt_ps", _mm_cmpgt_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_cmpgt_ss", _mm_cmpgt_ss, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] <= src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] <= src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] <= src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] <= src2[3]) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<="); BINARYOP_TEST("_mm_cmple_ps", _mm_cmple_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_cmple_ss", _mm_cmple_ss, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] < src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] < src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] < src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] < src2[3]) ? 
ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<"); BINARYOP_TEST("_mm_cmplt_ps", _mm_cmplt_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_cmplt_ss", _mm_cmplt_ss, _mm_load_ps(src), _mm_load_ps(src2)); #endif START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (!Isnan(dst[0]) && !Isnan(src2[0])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (!Isnan(dst[1]) && !Isnan(src2[1])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (!Isnan(dst[2]) && !Isnan(src2[2])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (!Isnan(dst[3]) && !Isnan(src2[3])) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpord"); BINARYOP_TEST("_mm_cmpord_ps", _mm_cmpord_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_cmpord_ss", _mm_cmpord_ss, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (Isnan(dst[0]) || Isnan(src2[0])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (Isnan(dst[1]) || Isnan(src2[1])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (Isnan(dst[2]) || Isnan(src2[2])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (Isnan(dst[3]) || Isnan(src2[3])) ? 
ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpunord"); BINARYOP_TEST("_mm_cmpunord_ps", _mm_cmpunord_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_cmpunord_ss", _mm_cmpunord_ss, _mm_load_ps(src), _mm_load_ps(src2)); SETCHART("max"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Max(dst[0], src2[0]); dst[1] = Max(dst[1], src2[1]); dst[2] = Max(dst[2], src2[2]); dst[3] = Max(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar max"); BINARYOP_TEST("_mm_max_ps", _mm_max_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_max_ss", _mm_max_ss, _mm_load_ps(src), _mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Min(dst[0], src2[0]); dst[1] = Min(dst[1], src2[1]); dst[2] = Min(dst[2], src2[2]); dst[3] = Min(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar min"); BINARYOP_TEST("_mm_min_ps", _mm_min_ps, _mm_load_ps(src), _mm_load_ps(src2)); BINARYOP_TEST("_mm_min_ss", _mm_min_ss, _mm_load_ps(src), _mm_load_ps(src2)); SETCHART("shuffle"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[3] = dst[1]; dst[2] = dst[0]; dst[1] = src2[3]; dst[0] = src2[2]; } ENDSCALAR(checksum_dst(dst), "scalar shuffle"); // BINARYOP_TEST("_mm_shuffle_ps", _mm_shuffle_ps, _mm_load_ps(src), _mm_load_ps(src2)); START(); __m128 o0 = _mm_load_ps(src); __m128 o1 = _mm_load_ps(src2); for(int i = 0; i < N; i += 4) o0 = _mm_shuffle_ps(o0, o1, _MM_SHUFFLE(1, 0, 3, 2)); _mm_store_ps(dst, o0); END(checksum_dst(dst), "_mm_shuffle_ps"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = dst[2]; dst[1] = src2[2]; dst[2] = dst[3]; dst[3] = src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar unpackhi_ps"); BINARYOP_TEST("_mm_unpackhi_ps", _mm_unpackhi_ps, _mm_load_ps(src), 
_mm_load_ps(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[2] = dst[1]; dst[1] = dst[0]; dst[0] = src2[0]; dst[3] = src2[1]; } ENDSCALAR(checksum_dst(dst), "scalar unpacklo_ps"); BINARYOP_TEST("_mm_unpacklo_ps", _mm_unpacklo_ps, _mm_load_ps(src), _mm_load_ps(src2)); printf("]}\n"); /* printf("Finished!\n"); printf("Total time spent in scalar intrinsics: %f msecs.\n", (double)scalarTotalTicks * 1000.0 / ticks_per_sec()); printf("Total time spent in SSE1 intrinsics: %f msecs.\n", (double)simdTotalTicks * 1000.0 / ticks_per_sec()); if (scalarTotalTicks > simdTotalTicks) printf("SSE1 was %.3fx faster than scalar!\n", (double)scalarTotalTicks / simdTotalTicks); else printf("SSE1 was %.3fx slower than scalar!\n", (double)simdTotalTicks / scalarTotalTicks); */ #ifdef __EMSCRIPTEN__ fprintf(stderr,"User Agent: %s\n", emscripten_run_script_string("navigator.userAgent")); printf("/*Test finished! Now please close Firefox to continue with benchmark_sse1.py.*/\n"); #endif exit(0); }
/** process, all real work is done here.
 *  Non-local-means denoise for one pixelpipe piece: for every shift vector
 *  (ki,kj) a sliding-window sum of squared, channel-weighted differences is
 *  maintained per row, converted to a weight via gh(), and accumulated into
 *  ovoid (weight sum kept in channel 3); a final pass normalizes and blends
 *  with the input according to the luma/chroma strengths. */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  // this is called for preview and full pipe separately, each with its own pixelpipe piece.
  // get our data struct:
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;
  // adjust to zoom size:
  const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size
  const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood
  if(P <= 1)
  {
    // nothing to do from this distance:
    memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);
    return;
  }
  // adjust to Lab, make L more important
  // float max_L = 100.0f, max_C = 256.0f;
  // float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
  float max_L = 120.0f, max_C = 512.0f;
  float nL = 1.0f/max_L, nC = 1.0f/max_C;
  // per-channel weights for the squared-difference metric (L, a, b, unused)
  const float norm2[4] = { nL*nL, nC*nC, nC*nC, 1.0f };

  // one scratch row per OpenMP thread
  // NOTE(review): Sa comes from dt_alloc_align but is released below with
  // plain free() — confirm the project's allocator pairing.
  float *Sa = dt_alloc_align(64, sizeof(float)*roi_out->width*dt_get_num_threads());
  // we want to sum up weights in col[3], so need to init to 0:
  memset(ovoid, 0x0, sizeof(float)*roi_out->width*roi_out->height*4);

  // for each shift vector
  for(int kj=-K;kj<=K;kj++)
  {
    for(int ki=-K;ki<=K;ki++)
    {
      int inited_slide = 0;
      // don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else we will add up errors)
      // do this in parallel with a little threading overhead.
      // could parallelize the outer loops with a bit more memory
#ifdef _OPENMP
# pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, ivoid, ovoid, Sa)
#endif
      for(int j=0; j<roi_out->height; j++)
      {
        if(j+kj < 0 || j+kj >= roi_out->height) continue;
        float *S = Sa + dt_get_thread_num() * roi_out->width;
        const float *ins = ((float *)ivoid) + 4*(roi_in->width *(j+kj) + ki);
        float *out = ((float *)ovoid) + 4*roi_out->width*j;

        // clamp the patch extent at the image borders
        const int Pm = MIN(MIN(P, j+kj), j);
        const int PM = MIN(MIN(P, roi_out->height-1-j-kj), roi_out->height-1-j);
        // first line of every thread
        // TODO: also every once in a while to assert numerical precision!
        if(!inited_slide)
        {
          // sum up a line
          memset(S, 0x0, sizeof(float)*roi_out->width);
          for(int jj=-Pm;jj<=PM;jj++)
          {
            int i = MAX(0, -ki);
            float *s = S + i;
            const float *inp = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+jj);
            const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+jj+kj) + ki);
            const int last = roi_out->width + MIN(0, -ki);
            for(; i<last; i++, inp+=4, inps+=4, s++)
            {
              // accumulate weighted squared difference of L,a,b channels
              for(int k=0;k<3;k++) s[0] += (inp[k] - inps[k])*(inp[k] - inps[k]) * norm2[k];
            }
          }
          // only reuse this if we had a full stripe
          if(Pm == P && PM == P) inited_slide = 1;
        }

        // sliding window for this line:
        float *s = S;
        float slide = 0.0f;
        // sum up the first -P..P
        for(int i=0;i<2*P+1;i++) slide += s[i];
        for(int i=0; i<roi_out->width; i++)
        {
          if(i-P > 0 && i+P<roi_out->width) slide += s[P] - s[-P-1];
          if(i+ki >= 0 && i+ki < roi_out->width)
          {
            // accumulate weighted shifted pixel; channel 3 sums the weights
            const __m128 iv = { ins[0], ins[1], ins[2], 1.0f };
            _mm_store_ps(out, _mm_load_ps(out) + iv * _mm_set1_ps(gh(slide)));
          }
          s ++;
          ins += 4;
          out += 4;
        }
        if(inited_slide && j+P+1+MAX(0,kj) < roi_out->height)
        {
          // sliding window in j direction:
          int i = MAX(0, -ki);
          float *s = S + i;
          const float *inp = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+P+1);
          const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+P+1+kj) + ki);
          const float *inm = ((float *)ivoid) + 4*i + 4* roi_in->width *(j-P);
          const float *inms = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j-P+kj) + ki);
          const int last = roi_out->width + MIN(0, -ki);
          // scalar prologue until s is 16-byte aligned for the SSE loop
          for(; ((unsigned long)s & 0xf) != 0 && i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0;k<3;k++) stmp += ((inp[k] - inps[k])*(inp[k] - inps[k]) - (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
          /* Process most of the line 4 pixels at a time */
          for(; i<last-4; i+=4, inp+=16, inps+=16, inm+=16, inms+=16, s+=4)
          {
            __m128 sv = _mm_load_ps(s);
            // transpose four RGBA differences to planar form, add the
            // incoming row's contribution per channel ...
            const __m128 inp1 = _mm_load_ps(inp) - _mm_load_ps(inps);
            const __m128 inp2 = _mm_load_ps(inp+4) - _mm_load_ps(inps+4);
            const __m128 inp3 = _mm_load_ps(inp+8) - _mm_load_ps(inps+8);
            const __m128 inp4 = _mm_load_ps(inp+12) - _mm_load_ps(inps+12);
            const __m128 inp12lo = _mm_unpacklo_ps(inp1,inp2);
            const __m128 inp34lo = _mm_unpacklo_ps(inp3,inp4);
            const __m128 inp12hi = _mm_unpackhi_ps(inp1,inp2);
            const __m128 inp34hi = _mm_unpackhi_ps(inp3,inp4);
            const __m128 inpv0 = _mm_movelh_ps(inp12lo,inp34lo);
            sv += inpv0*inpv0 * _mm_set1_ps(norm2[0]);
            const __m128 inpv1 = _mm_movehl_ps(inp34lo,inp12lo);
            sv += inpv1*inpv1 * _mm_set1_ps(norm2[1]);
            const __m128 inpv2 = _mm_movelh_ps(inp12hi,inp34hi);
            sv += inpv2*inpv2 * _mm_set1_ps(norm2[2]);
            // ... and subtract the outgoing row's contribution
            const __m128 inm1 = _mm_load_ps(inm) - _mm_load_ps(inms);
            const __m128 inm2 = _mm_load_ps(inm+4) - _mm_load_ps(inms+4);
            const __m128 inm3 = _mm_load_ps(inm+8) - _mm_load_ps(inms+8);
            const __m128 inm4 = _mm_load_ps(inm+12) - _mm_load_ps(inms+12);
            const __m128 inm12lo = _mm_unpacklo_ps(inm1,inm2);
            const __m128 inm34lo = _mm_unpacklo_ps(inm3,inm4);
            const __m128 inm12hi = _mm_unpackhi_ps(inm1,inm2);
            const __m128 inm34hi = _mm_unpackhi_ps(inm3,inm4);
            const __m128 inmv0 = _mm_movelh_ps(inm12lo,inm34lo);
            sv -= inmv0*inmv0 * _mm_set1_ps(norm2[0]);
            const __m128 inmv1 = _mm_movehl_ps(inm34lo,inm12lo);
            sv -= inmv1*inmv1 * _mm_set1_ps(norm2[1]);
            const __m128 inmv2 = _mm_movelh_ps(inm12hi,inm34hi);
            sv -= inmv2*inmv2 * _mm_set1_ps(norm2[2]);
            _mm_store_ps(s, sv);
          }
          // scalar epilogue for the remaining pixels
          for(; i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0;k<3;k++) stmp += ((inp[k] - inps[k])*(inp[k] - inps[k]) - (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
        }
        else inited_slide = 0;
      }
    }
  }
  // normalize and apply chroma/luma blending
  // bias a bit towards higher values for low input values:
  // NOTE(review): 0.6 is a double literal, so powf's float argument is
  // promoted — consider 0.6f; confirm intended precision.
  const __m128 weight = _mm_set_ps(1.0f, powf(d->chroma, 0.6), powf(d->chroma, 0.6), powf(d->luma, 0.6));
  const __m128 invert = _mm_sub_ps(_mm_set1_ps(1.0f), weight);
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) shared(ovoid,ivoid,roi_out,d)
#endif
  for(int j=0; j<roi_out->height; j++)
  {
    float *out = ((float *)ovoid) + 4*roi_out->width*j;
    float *in = ((float *)ivoid) + 4*roi_out->width*j;
    for(int i=0; i<roi_out->width; i++)
    {
      // out = in*(1-w) + (out / weight_sum) * w   (weight sum is in out[3])
      _mm_store_ps(out, _mm_add_ps(
        _mm_mul_ps(_mm_load_ps(in), invert),
        _mm_mul_ps(_mm_load_ps(out), _mm_div_ps(weight, _mm_set1_ps(out[3])))));
      out += 4;
      in += 4;
    }
  }
  // free shared tmp memory:
  free(Sa);
}
/* Convert one thread's slice of a 16-bit RGBA image to an 8-bit RGBA
 * GdkPixbuf: apply the 3x3 color matrix, normalize/clamp to [0,1],
 * apply output gamma, and store with alpha forced via _alpha_mask.
 * The main loop processes 4 pixels per iteration in planar form; a
 * scalar-ish remainder loop handles the final complete_w & 3 pixels. */
void transform8_otherrgb_avx(ThreadInfo* t)
{
	RS_IMAGE16 *input = t->input;
	GdkPixbuf *output = t->output;
	RS_MATRIX3 *matrix = t->matrix;
	gint x,y;
	gint width;

	/* Replicate each matrix coefficient into 4 float lanes for SIMD use:
	 * rows at offsets 0, 12, 24 feed sse_matrix3_mul below. */
	float mat_ps[4*4*3] __attribute__ ((aligned (16)));
	for (x = 0; x < 4; x++ ) {
		mat_ps[x] = matrix->coeff[0][0];
		mat_ps[x+4] = matrix->coeff[0][1];
		mat_ps[x+8] = matrix->coeff[0][2];
		mat_ps[12+x] = matrix->coeff[1][0];
		mat_ps[12+x+4] = matrix->coeff[1][1];
		mat_ps[12+x+8] = matrix->coeff[1][2];
		mat_ps[24+x] = matrix->coeff[2][0];
		mat_ps[24+x+4] = matrix->coeff[2][1];
		mat_ps[24+x+8] = matrix->coeff[2][2];
	}

	int start_x = t->start_x;
	/* Always have aligned input and output adress */
	if (start_x & 3)
		start_x = ((start_x) / 4) * 4;

	int complete_w = t->end_x - start_x;
	/* If width is not multiple of 4, check if we can extend it a bit */
	if (complete_w & 3)
	{
		if ((t->end_x+4) < input->w)
			complete_w = ((complete_w+3) / 4 * 4);
	}
	__m128 gamma = _mm_set1_ps(t->output_gamma);

	for(y=t->start_y ; y<t->end_y ; y++)
	{
		gushort *i = GET_PIXEL(input, start_x, y);
		guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
		gboolean aligned_write = !((guintptr)(o)&0xf);

		width = complete_w >> 2;

		while(width--)
		{
			/* Load and convert to float */
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_load_si128((__m128i*)i); // Load two pixels
			__m128i in2 = _mm_load_si128((__m128i*)i+1); // Load two pixels
			_mm_prefetch(i + 64, _MM_HINT_NTA);
			/* Widen 16-bit channels to 32-bit, then to float */
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128i p2 =_mm_unpackhi_epi16(in, zero);
			__m128i p3 =_mm_unpacklo_epi16(in2, zero);
			__m128i p4 =_mm_unpackhi_epi16(in2, zero);
			__m128 p1f = _mm_cvtepi32_ps(p1);
			__m128 p2f = _mm_cvtepi32_ps(p2);
			__m128 p3f = _mm_cvtepi32_ps(p3);
			__m128 p4f = _mm_cvtepi32_ps(p4);

			/* Convert to planar */
			__m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
			__m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
			__m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
			__m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
			__m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
			__m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
			__m128 b = _mm_movelh_ps(b1b0, b3b2);

			/* Apply matrix to convert to sRGB */
			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			/* Normalize to 0->1 and clamp */
			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
			g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2)));
			b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2)));

			/* Apply Gamma */
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
			b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));

			/* Convert to 8 bit unsigned and interleave*/
			__m128i r_i = _mm_cvtps_epi32(r);
			__m128i g_i = _mm_cvtps_epi32(g);
			__m128i b_i = _mm_cvtps_epi32(b);

			r_i = _mm_packs_epi32(r_i, r_i);
			g_i = _mm_packs_epi32(g_i, g_i);
			b_i = _mm_packs_epi32(b_i, b_i);

			/* Set alpha value to 255 and store */
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			__m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
			__m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
			p1 = _mm_unpacklo_epi32(rg_i, bb_i);
			p2 = _mm_unpackhi_epi32(rg_i, bb_i);

			p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));

			if (aligned_write)
				_mm_store_si128((__m128i*)o, p1);
			else
				_mm_storeu_si128((__m128i*)o, p1);

			i += 16;
			o += 16;
		}
		/* Process remaining pixels */
		width = complete_w & 3;
		while(width--)
		{
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_loadl_epi64((__m128i*)i); // Load one pixel (4 x 16-bit channels)
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128 p1f = _mm_cvtepi32_ps(p1);

			/* Splat r,g,b */
			__m128 r = _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(0,0,0,0));
			__m128 g = _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(1,1,1,1));
			__m128 b = _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(2,2,2,2));

			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			/* Gather the three channel results into one register */
			r = _mm_unpacklo_ps(r2, g2); // GG RR GG RR
			r = _mm_movelh_ps(r, b2); // BB BB GG RR

			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r)));
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));

			/* Convert to 8 bit unsigned */
			zero = _mm_setzero_si128();
			__m128i r_i = _mm_cvtps_epi32(r);
			/* To 16 bit signed */
			r_i = _mm_packs_epi32(r_i, zero);
			/* To 8 bit unsigned - set alpha channel*/
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero));
			*(int*)o = _mm_cvtsi128_si32(r_i);
			i+=4;
			o+=4;
		}
	}
}
// =============================================================================
//
// sse2_vChirpData
// version by: Alex Kan - SSE2 mods (haddsum removal) BH
//   http://tbp.berkeley.edu/~alexkan/seti/
//
/* Chirp ul_NumDataPoints complex samples by chirp_rate: multiplies each
 * sample by e^(i*2*pi*angle) where angle = i^2 * chirp_rate/(2*sr^2),
 * using a double-precision angle reduction and a single-precision
 * sin/cos polynomial (SS*/CC* coefficients) with two angle doublings.
 * Returns 0; results go to cx_ChirpDataArray. Tail elements are passed
 * to the scalar v_ChirpData. */
int sse2_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int ul_NumDataPoints,
  double sample_rate
) {
  int i;

  // zero chirp rate means a plain copy
  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) );
    return 0;
  }

  int vEnd;
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
  // adding/subtracting 2^52 rounds a double to the nearest integer
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di); // indices i, i+1
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di); // indices i+2, i+3
    __m128d x1, y1;

    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);
    d2 = _mm_load_ps(data+4);

    // calculate the input angle: i^2 * rate
    a1 = _mm_mul_pd(a1, a1);
    a2 = _mm_mul_pd(a2, a2);
    a1 = _mm_mul_pd(a1, rate);
    a2 = _mm_mul_pd(a2, rate);

    // reduce the angle to the range (-0.5, 0.5)
    x1 = _mm_add_pd(a1, roundVal);
    y1 = _mm_add_pd(a2, roundVal);
    x1 = _mm_sub_pd(x1, roundVal);
    y1 = _mm_sub_pd(y1, roundVal);
    a1 = _mm_sub_pd(a1, x1);
    a2 = _mm_sub_pd(a2, y1);

    // convert pair of packed double into packed single
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations (Horner, interleaved)
    s = _mm_mul_ps(y, SS4);
    c = _mm_mul_ps(y, CC3);
    s = _mm_add_ps(s, SS3);
    c = _mm_add_ps(c, CC2);
    s = _mm_mul_ps(s, y);
    c = _mm_mul_ps(c, y);
    s = _mm_add_ps(s, SS2);
    c = _mm_add_ps(c, CC1);
    s = _mm_mul_ps(s, y);
    c = _mm_mul_ps(c, y);
    s = _mm_add_ps(s, SS1);
    s = _mm_mul_ps(s, x);
    c = _mm_add_ps(c, ONE);

    // perform first angle doubling
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    m = vec_recip2(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    c = _mm_mul_ps(c, m);
    s = _mm_mul_ps(s, m);

    /* Complex multiply of the two sample registers by (c, s):
       c1 c2 c3 c4
       s1 s2 s3 s4

       R1 i1 R2 I2 R3 i3 R4 i4

       R1 * c1 + (i1 * s1 * -1)
       i1 * c1 + R1 * s1
       R2 * c2 + (i2 * s2 * -1)
       i2 * c2 + R2 * s2
    */
    x = d1;
    y = d2;
    // 0xB1 swaps re/im within each complex pair; R_NEG negates the
    // appropriate lanes so one add implements (re*c - im*s, im*c + re*s)
    x = _mm_shuffle_ps(x, x, 0xB1);
    y = _mm_shuffle_ps(y, y, 0xB1);
    x = _mm_mul_ps(x, R_NEG);
    y = _mm_mul_ps(y, R_NEG);
    cd1 = _mm_shuffle_ps(c, c, 0x50); // 01 01 00 00 AaBb => BBbb => c3c3c4c4
    cd2 = _mm_shuffle_ps(c, c, 0xfa); // 11 11 10 10 AaBb => AAaa => c1c1c2c2
    td1 = _mm_shuffle_ps(s, s, 0x50);
    td2 = _mm_shuffle_ps(s, s, 0xfa);

    cd1 = _mm_mul_ps(cd1, d1);
    cd2 = _mm_mul_ps(cd2, d2);
    td1 = _mm_mul_ps(td1, x);
    td2 = _mm_mul_ps(td2, y);

    cd1 = _mm_add_ps(cd1, td1);
    cd2 = _mm_add_ps(cd2, td2);

    // store chirped values (non-temporal; fenced below)
    _mm_stream_ps(chirped+0, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  _mm_sfence();

  if( i < ul_NumDataPoints) {
    // use original routine to finish up any tailings (max stride-1 elements)
    v_ChirpData(cx_DataArray+i, cx_ChirpDataArray+i , chirp_rate_ind, chirp_rate, ul_NumDataPoints-i, sample_rate);
  }
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

  return 0;
}
/* Project a full 4-spinor (src) onto the 2-spinor half (dst) for the
 * gamma_3 "plus" projector: componentwise sum of the upper and lower
 * spinor halves (no sign flips needed for this projector). */
void decomp_gamma3_plus( spinor_array src, halfspinor_array dst)
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  __m128 xmm6;
  __m128 xmm7;

  /* Gather the six 16-byte chunks of the spinor; indices are
   * [spin][color][reim]. */
  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  /* xor a register with itself zeroes it without a memory load. */
  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  /* Rebuild contiguous color vectors out of the strided loads. */
  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);

  /* 0xe4 = identity shuffle on the low half, keeps high half from 2nd arg. */
  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);
  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);
  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);
  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);
#endif

  /* add upper and lower halves (original comment said "sub", but these
   * are _mm_add_ps — the gamma_3 plus-projector sums the halves) */
  xmm0 = _mm_add_ps(xmm0, xmm3);
  xmm1 = _mm_add_ps(xmm1, xmm4);
  xmm2 = _mm_add_ps(xmm2, xmm5);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
}
// ============================================================================= // // sse3_vChirpData // version by: Alex Kan // http://tbp.berkeley.edu/~alexkan/seti/ // int sse3_ChirpData_ak( sah_complex * cx_DataArray, sah_complex * cx_ChirpDataArray, int chirp_rate_ind, double chirp_rate, int ul_NumDataPoints, double sample_rate ) { int i; if (chirp_rate_ind == 0) { memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) ); return 0; } int vEnd; double srate = chirp_rate * 0.5 / (sample_rate * sample_rate); __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate)); __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52); // main vectorised loop vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3); for (i = 0; i < vEnd; i += 4) { const float *data = (const float *) (cx_DataArray + i); float *chirped = (float *) (cx_ChirpDataArray + i); __m128d di = _mm_set1_pd(i); __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di); __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di); __m128 d1, d2; __m128 cd1, cd2; __m128 td1, td2; __m128 x; __m128 y; __m128 s; __m128 c; __m128 m; // load the signal to be chirped prefetchnta((const void *)( data+32 )); d1 = _mm_load_ps(data); d2 = _mm_load_ps(data+4); // calculate the input angle a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate); a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate); // reduce the angle to the range (-0.5, 0.5) a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal)); a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal)); // convert pair of packed double into packed single x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2)); // square to the range [0, 0.25) y = _mm_mul_ps(x, x); // perform the initial polynomial approximations s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4), SS3), y), SS2), y), SS1), x); c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3), CC2), y), CC1), y), ONE); // 
perform first angle doubling x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s)); y = _mm_mul_ps(_mm_mul_ps(s, c), TWO); // calculate scaling factor to correct the magnitude // m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO)); // m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO)); m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y))); // perform second angle doubling c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)); s = _mm_mul_ps(_mm_mul_ps(y, x), TWO); // correct the magnitude (final sine / cosine approximations) s = _mm_mul_ps(s, m); c = _mm_mul_ps(c, m); // chirp the data cd1 = _mm_shuffle_ps(c, c, 0x50); cd2 = _mm_shuffle_ps(c, c, 0xfa); cd1 = _mm_mul_ps(cd1, d1); cd2 = _mm_mul_ps(cd2, d2); d1 = _mm_shuffle_ps(d1, d1, 0xb1); d2 = _mm_shuffle_ps(d2, d2, 0xb1); td1 = _mm_shuffle_ps(s, s, 0x50); td2 = _mm_shuffle_ps(s, s, 0xfa); td1 = _mm_mul_ps(td1, d1); td2 = _mm_mul_ps(td2, d2); cd1 = _mm_addsub_ps(cd1, td1); cd2 = _mm_addsub_ps(cd2, td2); // store chirped values _mm_stream_ps(chirped, cd1); _mm_stream_ps(chirped+4, cd2); } _mm_sfence(); // handle tail elements with scalar code for ( ; i < ul_NumDataPoints; ++i) { double angle = srate * i * i * 0.5; double s = sin(angle); double c = cos(angle); float re = cx_DataArray[i][0]; float im = cx_DataArray[i][1]; cx_ChirpDataArray[i][0] = re * c - im * s; cx_ChirpDataArray[i][1] = re * s + im * c; } analysis_state.FLOP_counter+=12.0*ul_NumDataPoints; return 0; }
//--------------------------------------------------------------------------- void tRisaPhaseVocoderDSP::ProcessCore_sse(int ch) { unsigned int framesize_d2 = FrameSize / 2; float * analwork = AnalWork[ch]; float * synthwork = SynthWork[ch]; // 丸めモードを設定 SetRoundingModeToNearest_SSE(); // FFT を実行する rdft(FrameSize, 1, analwork, FFTWorkIp, FFTWorkW); // Real DFT analwork[1] = 0.0; // analwork[1] = nyquist freq. power (どっちみち使えないので0に) __m128 exact_time_scale = _mm_load1_ps(&ExactTimeScale); __m128 over_sampling_radian_v = _mm_load1_ps(&OverSamplingRadian); if(FrequencyScale != 1.0) { // ここでは 4 複素数 (8実数) ごとに処理を行う。 __m128 over_sampling_radian_recp = _mm_load1_ps(&OverSamplingRadianRecp); __m128 frequency_per_filter_band = _mm_load1_ps(&FrequencyPerFilterBand); __m128 frequency_per_filter_band_recp = _mm_load1_ps(&FrequencyPerFilterBandRecp); for(unsigned int i = 0; i < framesize_d2; i += 4) { // インターリーブ解除 + 直交座標系→極座標系 __m128 aw3120 = *(__m128*)(analwork + i*2 ); __m128 aw7654 = *(__m128*)(analwork + i*2 + 4); __m128 re3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(2,0,2,0)); __m128 im3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(3,1,3,1)); __m128 mag = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(re3210,re3210), _mm_mul_ps(im3210,im3210))); __m128 ang = VFast_arctan2_F4_SSE(im3210, re3210); // 前回の位相との差をとる __m128 lastp = *(__m128*)(LastAnalPhase[ch] + i); *(__m128*)(LastAnalPhase[ch] + i) = ang; ang = _mm_sub_ps(lastp, ang); // over sampling の影響を考慮する __m128 i_3210; i_3210 = _mm_cvtsi32_ss(i_3210, i); i_3210 = _mm_shuffle_ps(i_3210, i_3210, _MM_SHUFFLE(0,0,0,0)); i_3210 = _mm_add_ps( i_3210, PM128(PFV_INIT) ); __m128 phase_shift = _mm_mul_ps(i_3210, over_sampling_radian_v); ang = _mm_sub_ps( ang, phase_shift ); // unwrapping をする ang = Wrap_Pi_F4_SSE(ang); // -M_PI~+M_PIを-1.0~+1.0の変位に変換 ang = _mm_mul_ps( ang, over_sampling_radian_recp ); // tmp をフィルタバンド中央からの周波数の変位に変換し、 // それにフィルタバンドの中央周波数を加算する __m128 freq = _mm_mul_ps( _mm_add_ps(ang, i_3210), frequency_per_filter_band ); // 
analwork に値を格納する re3210 = mag; im3210 = freq; __m128 im10re10 = _mm_movelh_ps(re3210, im3210); __m128 im32re32 = _mm_movehl_ps(im3210, re3210); __m128 im1re1im0re0 = _mm_shuffle_ps(im10re10, im10re10, _MM_SHUFFLE(3,1,2,0)); __m128 im3re3im2re2 = _mm_shuffle_ps(im32re32, im32re32, _MM_SHUFFLE(3,1,2,0)); *(__m128*)(analwork + i*2 ) = im1re1im0re0; *(__m128*)(analwork + i*2 + 4) = im3re3im2re2; } //------------------------------------------------ // 変換 //------------------------------------------------ // 周波数軸方向のリサンプリングを行う float FrequencyScale_rcp = 1.0f / FrequencyScale; for(unsigned int i = 0; i < framesize_d2; i ++) { // i に対応するインデックスを得る float fi = i * FrequencyScale_rcp; // floor(x) と floor(x) + 1 の間でバイリニア補間を行う unsigned int index = static_cast<unsigned int>(fi); // floor float frac = fi - index; if(index + 1 < framesize_d2) { synthwork[i*2 ] = analwork[index*2 ] + frac * (analwork[index*2+2]-analwork[index*2 ]); synthwork[i*2+1] = FrequencyScale * ( analwork[index*2+1] + frac * (analwork[index*2+3]-analwork[index*2+1]) ); } else if(index < framesize_d2) { synthwork[i*2 ] = analwork[index*2 ]; synthwork[i*2+1] = analwork[index*2+1] * FrequencyScale; } else { synthwork[i*2 ] = 0.0; synthwork[i*2+1] = 0.0; } } //------------------------------------------------ // 合成 //------------------------------------------------ // 各フィルタバンドごとに変換 // 基本的には解析の逆変換である for(unsigned int i = 0; i < framesize_d2; i += 4) { // インターリーブ解除 __m128 sw3120 = *(__m128*)(synthwork + i*2 ); __m128 sw7654 = *(__m128*)(synthwork + i*2 + 4); __m128 mag = _mm_shuffle_ps(sw3120, sw7654, _MM_SHUFFLE(2,0,2,0)); __m128 freq = _mm_shuffle_ps(sw3120, sw7654, _MM_SHUFFLE(3,1,3,1)); // i+3 i+2 i+1 i+0 を準備 __m128 i_3210; i_3210 = _mm_cvtsi32_ss(i_3210, i); i_3210 = _mm_shuffle_ps(i_3210, i_3210, _MM_SHUFFLE(0,0,0,0)); i_3210 = _mm_add_ps(i_3210, PM128(PFV_INIT)); // 周波数から各フィルタバンドの中央周波数を減算し、 // フィルタバンドの中央周波数からの-1.0~+1.0の変位 // に変換する __m128 ang = _mm_sub_ps(_mm_mul_ps(freq, frequency_per_filter_band_recp), 
i_3210); // -1.0~+1.0の変位を-M_PI~+M_PIの位相に変換 ang = _mm_mul_ps( ang, over_sampling_radian_v ); // OverSampling による位相の補正 ang = _mm_add_ps( ang, _mm_mul_ps( i_3210, over_sampling_radian_v ) ); // TimeScale による位相の補正 ang = _mm_mul_ps( ang, exact_time_scale ); // 前回の位相と加算する // ここでも虚数部の符号が逆になるので注意 ang = _mm_sub_ps( *(__m128*)(LastSynthPhase[ch] + i), ang ); *(__m128*)(LastSynthPhase[ch] + i) = ang; // 極座標系→直交座標系 __m128 sin, cos; VFast_sincos_F4_SSE(ang, sin, cos); __m128 re3210 = _mm_mul_ps( mag, cos ); __m128 im3210 = _mm_mul_ps( mag, sin ); // インターリーブ __m128 im10re10 = _mm_movelh_ps(re3210, im3210); __m128 im32re32 = _mm_movehl_ps(im3210, re3210); __m128 im1re1im0re0 = _mm_shuffle_ps(im10re10, im10re10, _MM_SHUFFLE(3,1,2,0)); __m128 im3re3im2re2 = _mm_shuffle_ps(im32re32, im32re32, _MM_SHUFFLE(3,1,2,0)); *(__m128*)(synthwork + i*2 ) = im1re1im0re0; *(__m128*)(synthwork + i*2 + 4) = im3re3im2re2; } } else { // 周波数軸方向にシフトがない場合 // ここでも 4 複素数 (8実数) ごとに処理を行う。 for(unsigned int i = 0; i < framesize_d2; i += 4) { // インターリーブ解除 + 直交座標系→極座標系 __m128 aw3120 = *(__m128*)(analwork + i*2 ); __m128 aw7654 = *(__m128*)(analwork + i*2 + 4); __m128 re3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(2,0,2,0)); __m128 im3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(3,1,3,1)); __m128 mag = _mm_sqrt_ps( _mm_add_ps(_mm_mul_ps(re3210,re3210), _mm_mul_ps(im3210,im3210)) ); __m128 ang = VFast_arctan2_F4_SSE(im3210, re3210); // 前回の位相との差をとる __m128 lastp = *(__m128*)(LastAnalPhase[ch] + i); *(__m128*)(LastAnalPhase[ch] + i) = ang; ang = _mm_sub_ps( lastp, ang ); // over sampling の影響を考慮する __m128 i_3210; i_3210 = _mm_cvtsi32_ss(i_3210, i); i_3210 = _mm_shuffle_ps(i_3210, i_3210, _MM_SHUFFLE(0,0,0,0)); i_3210 = _mm_add_ps( i_3210, PM128(PFV_INIT) ); __m128 phase_shift = _mm_mul_ps( i_3210, over_sampling_radian_v ); ang = _mm_sub_ps( ang, phase_shift ); // unwrapping をする ang = Wrap_Pi_F4_SSE(ang); // OverSampling による位相の補正 ang = _mm_add_ps( ang, phase_shift ); // TimeScale による位相の補正 ang = _mm_mul_ps( 
ang, exact_time_scale ); // 前回の位相と加算する // ここでも虚数部の符号が逆になるので注意 ang = _mm_sub_ps( *(__m128*)(LastSynthPhase[ch] + i), ang ); *(__m128*)(LastSynthPhase[ch] + i) = ang; // 極座標系→直交座標系 __m128 sin, cos; VFast_sincos_F4_SSE(ang, sin, cos); re3210 = _mm_mul_ps( mag, cos ); im3210 = _mm_mul_ps( mag, sin ); // インターリーブ __m128 im10re10 = _mm_movelh_ps(re3210, im3210); __m128 im32re32 = _mm_movehl_ps(im3210, re3210); __m128 im1re1im0re0 = _mm_shuffle_ps(im10re10, im10re10, _MM_SHUFFLE(3,1,2,0)); __m128 im3re3im2re2 = _mm_shuffle_ps(im32re32, im32re32, _MM_SHUFFLE(3,1,2,0)); *(__m128*)(synthwork + i*2 ) = im1re1im0re0; *(__m128*)(synthwork + i*2 + 4) = im3re3im2re2; } } // FFT を実行する synthwork[1] = 0.0; // synthwork[1] = nyquist freq. power (どっちみち使えないので0に) rdft_sse(FrameSize, -1, synthwork, FFTWorkIp, FFTWorkW); // Inverse Real DFT }