void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
   /* Finish any remaining tail samples in scalar code. */
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}
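/* The tail loop relies on Opus's MAC16_16 multiply-accumulate macro. In the
 * float build of Opus, opus_val16/opus_val32 are plain floats and the macro
 * reduces to c + a*b; reference definitions (shown so the snippet reads
 * standalone, exact casts may differ from the real arch.h):
 */
typedef float opus_val16;
typedef float opus_val32;
#define MAC16_16(c, a, b) ((c) + (opus_val32)(a) * (opus_val32)(b))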
void mulMatrix1(Matrix4x4 ret, Matrix4x4 mat1, Matrix4x4 mat2)
{
    /* Note: in testing, not aligning the matrix segfaulted, while aligning
     * it deadlocked the program. */
    /* Strategy: this can be heavily SSE'd:
     * 1. transpose mat2
     * 2. dot-product the rows */

    /* 1. transpose mat2 (in place) */
    __m128 row0, row1, row2, row3;
    __m128 tmp0, tmp1, tmp2, tmp3;

    /* Load the 4x4 mat2 from memory into four SSE registers. */
    row0 = _mm_load_ps( mat2[0] );
    row1 = _mm_load_ps( mat2[1] );
    row2 = _mm_load_ps( mat2[2] );
    row3 = _mm_load_ps( mat2[3] );

    /* Interleave the bottom/top two elements of two SSE registers with each
     * other into a single SSE register. */
    tmp0 = _mm_unpacklo_ps( row0, row1 );
    tmp2 = _mm_unpacklo_ps( row2, row3 );
    tmp1 = _mm_unpackhi_ps( row0, row1 );
    tmp3 = _mm_unpackhi_ps( row2, row3 );

    /* Move the bottom/top halves of two SSE registers into one register. */
    row0 = _mm_movelh_ps( tmp0, tmp2 );
    row1 = _mm_movehl_ps( tmp2, tmp0 );
    row2 = _mm_movelh_ps( tmp1, tmp3 );
    row3 = _mm_movehl_ps( tmp3, tmp1 );

    /* Store the transposed 4x4 matrix back to memory. */
    _mm_store_ps( mat2[0], row0 );
    _mm_store_ps( mat2[1], row1 );
    _mm_store_ps( mat2[2], row2 );
    _mm_store_ps( mat2[3], row3 );

    /* 2. dot-product the rows: 16 dot products in all.
     * (mul_asm is presumably an external 4-wide dot-product helper.) */
    ret[0][0] = mul_asm(mat1[0], mat2[0]);
    ret[0][1] = mul_asm(mat1[0], mat2[1]);
    ret[0][2] = mul_asm(mat1[0], mat2[2]);
    ret[0][3] = mul_asm(mat1[0], mat2[3]);

    ret[1][0] = mul_asm(mat1[1], mat2[0]);
    ret[1][1] = mul_asm(mat1[1], mat2[1]);
    ret[1][2] = mul_asm(mat1[1], mat2[2]);
    ret[1][3] = mul_asm(mat1[1], mat2[3]);

    ret[2][0] = mul_asm(mat1[2], mat2[0]);
    ret[2][1] = mul_asm(mat1[2], mat2[1]);
    ret[2][2] = mul_asm(mat1[2], mat2[2]);
    ret[2][3] = mul_asm(mat1[2], mat2[3]);

    ret[3][0] = mul_asm(mat1[3], mat2[0]);
    ret[3][1] = mul_asm(mat1[3], mat2[1]);
    ret[3][2] = mul_asm(mat1[3], mat2[2]);
    ret[3][3] = mul_asm(mat1[3], mat2[3]);
}
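/* For reference, <xmmintrin.h> ships _MM_TRANSPOSE4_PS, which performs the
 * same 4x4 transpose as the unpack/movelh/movehl sequence above. A minimal
 * sketch (transpose4x4 is an illustrative name, not part of the original;
 * unaligned loads are used so the helper makes no alignment assumption):
 */
#include <xmmintrin.h>

static void transpose4x4(float m[4][4])
{
    __m128 row0 = _mm_loadu_ps(m[0]);
    __m128 row1 = _mm_loadu_ps(m[1]);
    __m128 row2 = _mm_loadu_ps(m[2]);
    __m128 row3 = _mm_loadu_ps(m[3]);
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3); /* transposes the four registers in place */
    _mm_storeu_ps(m[0], row0);
    _mm_storeu_ps(m[1], row1);
    _mm_storeu_ps(m[2], row2);
    _mm_storeu_ps(m[3], row3);
}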
/// Transform this box using the specified transform matrix.
///
/// @param[in] rTransform  Matrix by which to transform.
void Helium::Simd::AaBox::TransformBy( const Matrix44& rTransform )
{
    // Expand each corner position.
    Register minVec = m_minimum.GetSimdVector();
    Register maxVec = m_maximum.GetSimdVector();

    Vector3Soa corners0;
    corners0.m_x = _mm_shuffle_ps( minVec, minVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners0.m_y = _mm_shuffle_ps( minVec, maxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    corners0.m_z = _mm_unpackhi_ps( minVec, maxVec );
    corners0.m_z = _mm_movelh_ps( corners0.m_z, corners0.m_z );

    Vector3Soa corners1;
    corners1.m_x = _mm_shuffle_ps( maxVec, maxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners1.m_y = corners0.m_y;
    corners1.m_z = corners0.m_z;

    // Transform all corners by the provided transformation matrix.
    Matrix44Soa transformSplat( rTransform );
    transformSplat.TransformPoint( corners0, corners0 );
    transformSplat.TransformPoint( corners1, corners1 );

    // Compute the minimum.
    Register minX = Simd::MinF32( corners0.m_x, corners1.m_x );
    Register minY = Simd::MinF32( corners0.m_y, corners1.m_y );
    Register minXYLo = _mm_unpacklo_ps( minX, minY );
    Register minXYHi = _mm_unpackhi_ps( minX, minY );
    Register minXY = Simd::MinF32( minXYLo, minXYHi );

    Register minZ = Simd::MinF32( corners0.m_z, corners1.m_z );
    Register minZLo = _mm_unpacklo_ps( minZ, minZ );
    Register minZHi = _mm_unpackhi_ps( minZ, minZ );
    minZ = Simd::MinF32( minZLo, minZHi );

    Register minLo = _mm_movelh_ps( minXY, minZ );
    Register minHi = _mm_movehl_ps( minZ, minXY );

    m_minimum.SetSimdVector( Simd::MinF32( minLo, minHi ) );

    // Compute the maximum.
    Register maxX = Simd::MaxF32( corners0.m_x, corners1.m_x );
    Register maxY = Simd::MaxF32( corners0.m_y, corners1.m_y );
    Register maxXYLo = _mm_unpacklo_ps( maxX, maxY );
    Register maxXYHi = _mm_unpackhi_ps( maxX, maxY );
    Register maxXY = Simd::MaxF32( maxXYLo, maxXYHi );

    Register maxZ = Simd::MaxF32( corners0.m_z, corners1.m_z );
    Register maxZLo = _mm_unpacklo_ps( maxZ, maxZ );
    Register maxZHi = _mm_unpackhi_ps( maxZ, maxZ );
    maxZ = Simd::MaxF32( maxZLo, maxZHi );

    Register maxLo = _mm_movelh_ps( maxXY, maxZ );
    Register maxHi = _mm_movehl_ps( maxZ, maxXY );

    m_maximum.SetSimdVector( Simd::MaxF32( maxLo, maxHi ) );
}
float calcCubicNoiseValSSE(const vec3 p)
{
    int ix, iy, iz;
    __m128 fx, fy;
    float fz;

    ix = (int)floor(p[0]);
    fx = _mm_set_ps1(p[0] - ix);
    iy = (int)floor(p[1]);
    fy = _mm_set_ps1(p[1] - iy);
    iz = (int)floor(p[2]);
    fz = p[2] - iz;

    uSIMD k0, k1, k2, k3;
    __m128 out0, out1, out2, out3;
    for (int k = -1; k <= 2; k++) {
        for (int j = -1; j <= 2; j++) {
            k0.a[j+1] = getLatticeVal(ix-1, iy + j, iz + k);
            k1.a[j+1] = getLatticeVal(ix+0, iy + j, iz + k);
            k2.a[j+1] = getLatticeVal(ix+1, iy + j, iz + k);
            k3.a[j+1] = getLatticeVal(ix+2, iy + j, iz + k);
        }
        switch (k) {
            case -1: out0 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m)); break;
            case  0: out1 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m)); break;
            case  1: out2 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m)); break;
            case  2: out3 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m)); break;
        }
    }

    // Transpose the matrix formed by the out vectors.
    __m128 t1 = _mm_movelh_ps(out1, out0);
    __m128 t2 = _mm_movehl_ps(out0, out1);
    __m128 t3 = _mm_movelh_ps(out3, out2);
    __m128 t4 = _mm_movehl_ps(out2, out3);
    k0.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(0, 2, 0, 2));
    k1.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 3, 1, 3));
    k2.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(0, 2, 0, 2));
    k3.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 3, 1, 3));

    uSIMD final_knots;
    final_knots.m = fourKnotSplineSSE(&fy, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
    return clamp(fourKnotSpline(fz, final_knots.a), -1.0f, 1.0f);
}
static inline void
foo (__m128 *x)
{
  __m128 y = _mm_setzero_ps ();
  __m128 v = _mm_movehl_ps (y, *x);
  __m128 w = _mm_movehl_ps (*x, y);

  check (*x, 9, 1, 2, -3);
  check (v, 2, -3, 0, 0);
  check (w, 0, 0, 2, -3);
}
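/* The test above pins down the semantics of _mm_movehl_ps(a, b): the result
 * is { b[2], b[3], a[2], a[3] }, i.e. the high half of the second operand
 * lands in the low half of the result. A standalone demonstration
 * (illustrative, not part of the original test):
 */
#include <stdio.h>
#include <xmmintrin.h>

int main(void)
{
    __m128 a = _mm_setr_ps(9, 1, 2, -3);           /* a = { 9, 1, 2, -3 } */
    __m128 r = _mm_movehl_ps(_mm_setzero_ps(), a); /* r = { 2, -3, 0, 0 } */
    float out[4];
    _mm_storeu_ps(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}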
int drid_moments(float* coords, int32_t index, int32_t* partners, int32_t n_partners, double* moments)
{
    int32_t i;
    float d;
    moments_t onlinemoments;
    __m128 x, y, r, r2, s;

    moments_clear(&onlinemoments);
    x = load_float3(&coords[3 * index]);
    for (i = 0; i < n_partners; i++) {
        y = load_float3(&coords[3 * partners[i]]);
        r = _mm_sub_ps(x, y);   /* x - y */
        r2 = _mm_mul_ps(r, r);  /* (x - y)**2 */

        /* Horizontal add of the components of r2 with two instructions.
           Note: it's critical here that the last entry of x and y was 0,
           so that r2.w = 0. */
        s = _mm_add_ps(r2, _mm_movehl_ps(r2, r2));
        s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 1));

        /* Store into a regular float. (_mm_rsqrt_ps was tried here, but it
           is not accurate enough to pass the tests.) */
        _mm_store_ss(&d, s);
        moments_push(&onlinemoments, 1.0 / sqrt((double) d));
    }

    moments[0] = moments_mean(&onlinemoments);
    moments[1] = sqrt(moments_second(&onlinemoments));
    moments[2] = cbrt(moments_third(&onlinemoments));
    return 1;
}
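/* load_float3 is external to this snippet; for the w == 0 invariant above to
 * hold, it must load three floats and zero the fourth lane. A plausible
 * definition (an assumption, not necessarily the project's exact code):
 */
#include <xmmintrin.h>

static inline __m128 load_float3(const float* p)
{
    __m128 xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)p); /* { p[0], p[1], 0, 0 } */
    __m128 z  = _mm_load_ss(p + 2);                              /* { p[2], 0, 0, 0 } */
    return _mm_movelh_ps(xy, z);                                 /* { p[0], p[1], p[2], 0 } */
}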
_inline float process_folded_fir_sse2(const float *fir_kernel,
    const float *queue_head, const float *queue_tail, int len)
{
    __m128 acc = _mm_setzero_ps();
    queue_tail -= 3;
    len >>= 2;
    while (len > 0)
    {
        __m128 head = _mm_loadu_ps(queue_head);
        __m128 tail = _mm_loadu_ps(queue_tail);
        __m128 kern = _mm_load_ps(fir_kernel);

        tail = _mm_shuffle_ps(tail, tail, 0x1b); // reverse the element order
        __m128 t1 = _mm_add_ps(tail, head);      // fold: add head and tail
        t1 = _mm_mul_ps(t1, kern);               // multiply by the kernel
        acc = _mm_add_ps(acc, t1);               // accumulate

        queue_head += 4;
        queue_tail -= 4;
        fir_kernel += 4;
        len--;
    }
    // horizontal sum
    const __m128 t = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
    const __m128 sum = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
    return sum.m128_f32[0]; // MSVC-specific member access
}
inline void Multiply(const PVector4df &v, PVector4df &out)
{
#ifdef __SSE_AVAIL__
    __m128 v1 = _mm_load_ps(v._Vec);
    __m128 m0 = _mm_load_ps(Row1);
    __m128 m1 = _mm_load_ps(Row2);
    __m128 m2 = _mm_load_ps(Row3);
    __m128 m3 = _mm_load_ps(Row4);

    m0 = _mm_mul_ps(m0, v1); // (e11 * v.X), (e21 * v.Y), (e31 * v.Z), (e41 * v.W)
    m1 = _mm_mul_ps(m1, v1); // (e12 * v.X), (e22 * v.Y), (e32 * v.Z), (e42 * v.W)
    m2 = _mm_mul_ps(m2, v1); // (e13 * v.X), (e23 * v.Y), (e33 * v.Z), (e43 * v.W)
    m3 = _mm_mul_ps(m3, v1); // (e14 * v.X), (e24 * v.Y), (e34 * v.Z), (e44 * v.W)

    // Pairwise sums (SSE3). After the hadds, m0 holds the first two dot
    // products duplicated and m2 the last two; movehl packs { X, Y } from
    // m0's high half with { Z, W } from m2's high half.
    m0 = _mm_hadd_ps(m0, m1);
    m0 = _mm_hadd_ps(m0, m0);
    m2 = _mm_hadd_ps(m2, m3);
    m2 = _mm_hadd_ps(m2, m2);

    _mm_store_ps(out._Vec, _mm_movehl_ps(m2, m0));
#else
    out.X = v.X * e11 + v.Y * e21 + v.Z * e31 + v.W * e41;
    out.Y = v.X * e12 + v.Y * e22 + v.Z * e32 + v.W * e42;
    out.Z = v.X * e13 + v.Y * e23 + v.Z * e33 + v.W * e43;
    out.W = v.X * e14 + v.Y * e24 + v.Z * e34 + v.W * e44;
#endif
}
void OptimizedSelfAdjointMatrix6x6f::rankUpdate(const Eigen::Matrix<float, 6, 1>& u, const float& alpha)
{
  // Accumulates alpha * u * u^T into the upper triangle, which is stored as
  // six 2x2 blocks: (12,12), (12,34), (12,56), (34,34), (34,56), (56,56).
  __m128 s = _mm_set1_ps(alpha);
  __m128 v1234 = _mm_loadu_ps(u.data());
  // Note: this loadu reads two floats past the end of the 6-vector; only
  // lanes 0 and 1 of v56xx are used.
  __m128 v56xx = _mm_loadu_ps(u.data() + 4);

  __m128 v1212 = _mm_movelh_ps(v1234, v1234);
  __m128 v3434 = _mm_movehl_ps(v1234, v1234);
  __m128 v5656 = _mm_movelh_ps(v56xx, v56xx);

  __m128 v1122 = _mm_mul_ps(s, _mm_unpacklo_ps(v1212, v1212));
  _mm_store_ps(data + 0, _mm_add_ps(_mm_load_ps(data + 0), _mm_mul_ps(v1122, v1212)));
  _mm_store_ps(data + 4, _mm_add_ps(_mm_load_ps(data + 4), _mm_mul_ps(v1122, v3434)));
  _mm_store_ps(data + 8, _mm_add_ps(_mm_load_ps(data + 8), _mm_mul_ps(v1122, v5656)));

  __m128 v3344 = _mm_mul_ps(s, _mm_unpacklo_ps(v3434, v3434));
  _mm_store_ps(data + 12, _mm_add_ps(_mm_load_ps(data + 12), _mm_mul_ps(v3344, v3434)));
  _mm_store_ps(data + 16, _mm_add_ps(_mm_load_ps(data + 16), _mm_mul_ps(v3344, v5656)));

  __m128 v5566 = _mm_mul_ps(s, _mm_unpacklo_ps(v5656, v5656));
  _mm_store_ps(data + 20, _mm_add_ps(_mm_load_ps(data + 20), _mm_mul_ps(v5566, v5656)));
}
inline PVector4df operator*(const PVector4df &v)
{
#ifdef __SSE_AVAIL__
    __m128 v1 = _mm_load_ps(v._Vec);
    __m128 m0 = _mm_load_ps(Row1);
    __m128 m1 = _mm_load_ps(Row2);
    __m128 m2 = _mm_load_ps(Row3);
    __m128 m3 = _mm_load_ps(Row4);

    m0 = _mm_mul_ps(m0, v1); // (e11 * v.X), (e21 * v.Y), (e31 * v.Z), (e41 * v.W)
    m1 = _mm_mul_ps(m1, v1); // (e12 * v.X), (e22 * v.Y), (e32 * v.Z), (e42 * v.W)
    m2 = _mm_mul_ps(m2, v1); // (e13 * v.X), (e23 * v.Y), (e33 * v.Z), (e43 * v.W)
    m3 = _mm_mul_ps(m3, v1); // (e14 * v.X), (e24 * v.Y), (e34 * v.Z), (e44 * v.W)

    m0 = _mm_hadd_ps(m0, m1);
    m0 = _mm_hadd_ps(m0, m0);
    m2 = _mm_hadd_ps(m2, m3);
    m2 = _mm_hadd_ps(m2, m2);
    m0 = _mm_movehl_ps(m2, m0);

    PVector4df val;
    _mm_store_ps(val._Vec, m0);
    return val;
#else
    return PVector4df(v.X * e11 + v.Y * e21 + v.Z * e31 + v.W * e41,
                      v.X * e12 + v.Y * e22 + v.Z * e32 + v.W * e42,
                      v.X * e13 + v.Y * e23 + v.Z * e33 + v.W * e43,
                      v.X * e14 + v.Y * e24 + v.Z * e34 + v.W * e44);
#endif
}
int any_ps(__m128 m)
{
    __m128 y = _mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 3, 0, 1)); // swap within pairs
    m = _mm_or_ps(m, y);
    __m128 z = _mm_movehl_ps(m, m);                           // fold high half onto low
    m = _mm_or_ps(m, z);
    return _mm_ucomineq_ss(m, _mm_setzero_ps());              // lane 0 now ORs all four lanes
}
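/* A companion all-lanes test can be built the same way with AND in place of
 * OR, assuming the input lanes are comparison masks (all-ones or all-zero).
 * This is a sketch, not part of the original source:
 */
static int all_ps(__m128 m) /* hypothetical counterpart to any_ps */
{
    __m128 y = _mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 3, 0, 1));
    m = _mm_and_ps(m, y);
    __m128 z = _mm_movehl_ps(m, m);
    m = _mm_and_ps(m, z);
    return _mm_ucomineq_ss(m, _mm_setzero_ps());
}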
/// Compute the corners of this view frustum.
///
/// A view frustum can have either four or eight corners depending on whether a far clip plane exists (eight
/// corners) or whether an infinite far clip plane is used (four corners).
///
/// Note that this assumes that the frustum is always properly defined, with each possible combination of
/// neighboring clip planes intersecting at a valid point.
///
/// @param[out] pCorners  Array in which the frustum corners will be stored.  This must point to a region of
///                       memory large enough for four points if this frustum has an infinite far clip plane,
///                       or eight points if this frustum has a normal far clip plane.
///
/// @return  Number of corners computed (either four or eight).
size_t Helium::Simd::Frustum::ComputeCorners( Vector3* pCorners ) const
{
    HELIUM_ASSERT( pCorners );

    // Compute the corners in struct-of-arrays format.
    HELIUM_SIMD_ALIGN_PRE float32_t cornersX[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersY[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersZ[ 8 ] HELIUM_SIMD_ALIGN_POST;
    size_t cornerCount = ComputeCornersSoa( cornersX, cornersY, cornersZ );
    HELIUM_ASSERT( cornerCount == 4 || cornerCount == 8 );

    // Swizzle the results and store in the output array.
    Helium::Simd::Register cornerXVec = Helium::Simd::LoadAligned( cornersX );
    Helium::Simd::Register cornerYVec = Helium::Simd::LoadAligned( cornersY );
    Helium::Simd::Register cornerZVec = Helium::Simd::LoadAligned( cornersZ );

    Helium::Simd::Register xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec );
    Helium::Simd::Register xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec );
    Helium::Simd::Register zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec );
    Helium::Simd::Register zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec );

    pCorners[ 0 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) );
    pCorners[ 1 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) );
    pCorners[ 2 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) );
    pCorners[ 3 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) );

    if( cornerCount == 8 )
    {
        cornerXVec = Helium::Simd::LoadAligned( cornersX + 4 );
        cornerYVec = Helium::Simd::LoadAligned( cornersY + 4 );
        cornerZVec = Helium::Simd::LoadAligned( cornersZ + 4 );

        xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec );
        xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec );
        zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec );
        zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec );

        pCorners[ 4 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) );
        pCorners[ 5 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) );
        pCorners[ 6 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) );
        pCorners[ 7 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) );
    }

    return cornerCount;
}
static void
sse2_test (void)
{
  u a, b;

  a.v = setupa ();
  b.v = setupb ();

  if (untrue)
    bar (a.v, b.v);

  b.v = (__v4sf) _mm_movehl_ps ((__m128) a.v, (__m128) b.v);
  foo (a, b);
}
static inline __m128 horizontal_add(const __m128 a)
{
#if 0 //!! needs SSE3
   const __m128 ftemp = _mm_hadd_ps(a, a);
   return _mm_hadd_ps(ftemp, ftemp);
#else
   const __m128 ftemp = _mm_add_ps(a, _mm_movehl_ps(a, a)); // a0+a2, a1+a3
   return _mm_add_ss(ftemp, _mm_shuffle_ps(ftemp, ftemp, _MM_SHUFFLE(1, 1, 1, 1))); // (a0+a2)+(a1+a3)
#endif
}
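/* Typical use: in both branches the total ends up in lane 0, so a four-wide
 * dot product can finish with one _mm_cvtss_f32. dot4 is an illustrative
 * helper name, not part of the original:
 */
static inline float dot4(__m128 a, __m128 b)
{
    return _mm_cvtss_f32(horizontal_add(_mm_mul_ps(a, b)));
}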
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned i;
   __m128 sum_l = _mm_setzero_ps();
   __m128 sum_r = _mm_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned taps = resamp->taps;
   unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   __m128 delta = _mm_set1_ps((float)(resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (i = 0; i < taps; i += 4)
   {
      __m128 buf_l = _mm_loadu_ps(buffer_l + i);
      __m128 buf_r = _mm_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
      __m128 deltas = _mm_load_ps(delta_table + i);
      __m128 sinc = _mm_add_ps(_mm_load_ps(phase_table + i),
            _mm_mul_ps(deltas, delta));
#else
      __m128 sinc = _mm_load_ps(phase_table + i);
#endif
      sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
      sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
   }

   // Them annoying shuffles :V
   // sum_l = { l3, l2, l1, l0 }
   // sum_r = { r3, r2, r1, r0 }
   __m128 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)),
         _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));

   // sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
   // sum = { R1, R0, L1, L0 }
   sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);

   // sum = { R1, R1, L1, L1 } + { R1, R0, L1, L0 }
   // sum = { X,  R,  X,  L }

   // Store L
   _mm_store_ss(out_buffer + 0, sum);

   // movehl { X, R, X, L } == { X, R, X, R }
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
//Thanks stack overflow.
static inline float _mm256_reduce_add_ps(__m256 x)
{
    /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    /* Conversion to float is a no-op on x86-64 */
    return _mm_cvtss_f32(x32);
}
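/* Usage sketch: reduce a float array eight lanes at a time (the length is
 * assumed to be a multiple of 8 for brevity; illustrative only):
 */
#include <immintrin.h>

static float sum_array(const float* p, int n)
{
    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < n; i += 8)
        acc = _mm256_add_ps(acc, _mm256_loadu_ps(p + i));
    return _mm256_reduce_add_ps(acc);
}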
inline float hadd(const vector4f& rhs)
{
#if SSE_INSTR_SET >= 3  // SSE3
    __m128 tmp0 = _mm_hadd_ps(rhs, rhs);
    __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
#else
    __m128 tmp0 = _mm_add_ps(rhs, _mm_movehl_ps(rhs, rhs));
    __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
#endif
    return _mm_cvtss_f32(tmp1);
}
void matrix3_transpose(struct matrix3 *dst, const struct matrix3 *m)
{
    __m128 tmp1, tmp2;

    vec3_transform(&dst->t, &m->t, m);
    vec3_neg(&dst->t, &dst->t);

    tmp1 = _mm_movelh_ps(m->x.m, m->y.m);
    tmp2 = _mm_movehl_ps(m->y.m, m->x.m);
    dst->x.m = _mm_shuffle_ps(tmp1, m->z.m, _MM_SHUFFLE(3, 0, 2, 0));
    dst->y.m = _mm_shuffle_ps(tmp1, m->z.m, _MM_SHUFFLE(3, 1, 3, 1));
    dst->z.m = _mm_shuffle_ps(tmp2, m->z.m, _MM_SHUFFLE(3, 2, 2, 0));
}
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
static inline float horizontal_sum_avx2(__m256 x)
{
    const __m128 hi_quad = _mm256_extractf128_ps(x, 1);
    const __m128 lo_quad = _mm256_castps256_ps128(x);
    const __m128 sum_quad = _mm_add_ps(lo_quad, hi_quad);
    const __m128 lo_dual = sum_quad;
    const __m128 hi_dual = _mm_movehl_ps(sum_quad, sum_quad);
    const __m128 sum_dual = _mm_add_ps(lo_dual, hi_dual);
    const __m128 lo = sum_dual;
    const __m128 hi = _mm_shuffle_ps(sum_dual, sum_dual, 0x1);
    const __m128 sum = _mm_add_ss(lo, hi);
    return _mm_cvtss_f32(sum);
}
BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    // Build (x, y, z, 1) points from the stored min/max.
    const __m128 one = _mm_set_ss(1.f);
    __m128 minPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_),
        _mm_unpacklo_ps(_mm_set_ss(min_.z_), one));
    __m128 maxPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_),
        _mm_unpacklo_ps(_mm_set_ss(max_.z_), one));

    __m128 centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
    __m128 halfSize = _mm_sub_ps(centerPoint, minPt);

    // Transform the center point.
    __m128 m0 = _mm_loadu_ps(&transform.m00_);
    __m128 m1 = _mm_loadu_ps(&transform.m10_);
    __m128 m2 = _mm_loadu_ps(&transform.m20_);
    __m128 r0 = _mm_mul_ps(m0, centerPoint);
    __m128 r1 = _mm_mul_ps(m1, centerPoint);
    __m128 t0 = _mm_add_ps(_mm_unpacklo_ps(r0, r1), _mm_unpackhi_ps(r0, r1));
    __m128 r2 = _mm_mul_ps(m2, centerPoint);
    const __m128 zero = _mm_setzero_ps();
    __m128 t2 = _mm_add_ps(_mm_unpacklo_ps(r2, zero), _mm_unpackhi_ps(r2, zero));
    __m128 newCenter = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));

    // Transform the half-extents with the absolute value of the rotation part.
    const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 x = _mm_and_ps(absMask, _mm_mul_ps(m0, halfSize));
    __m128 y = _mm_and_ps(absMask, _mm_mul_ps(m1, halfSize));
    __m128 z = _mm_and_ps(absMask, _mm_mul_ps(m2, halfSize));
    t0 = _mm_add_ps(_mm_unpacklo_ps(x, y), _mm_unpackhi_ps(x, y));
    t2 = _mm_add_ps(_mm_unpacklo_ps(z, zero), _mm_unpackhi_ps(z, zero));
    __m128 newDir = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));

    return BoundingBox(_mm_sub_ps(newCenter, newDir), _mm_add_ps(newCenter, newDir));
#else
    Vector3 newCenter = transform * Center();
    Vector3 oldEdge = Size() * 0.5f;
    Vector3 newEdge = Vector3(
        Abs(transform.m00_) * oldEdge.x_ + Abs(transform.m01_) * oldEdge.y_ + Abs(transform.m02_) * oldEdge.z_,
        Abs(transform.m10_) * oldEdge.x_ + Abs(transform.m11_) * oldEdge.y_ + Abs(transform.m12_) * oldEdge.z_,
        Abs(transform.m20_) * oldEdge.x_ + Abs(transform.m21_) * oldEdge.y_ + Abs(transform.m22_) * oldEdge.z_
    );

    return BoundingBox(newCenter - newEdge, newCenter + newEdge);
#endif
}
static void process_sinc(rarch_resampler_t *resamp, float *out_buffer)
{
   __m128 sum_l = _mm_setzero_ps();
   __m128 sum_r = _mm_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned phase = resamp->time >> PHASES_SHIFT;
   unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
   __m128 delta_f = _mm_set1_ps((float)delta);

   const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
   const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];

   for (unsigned i = 0; i < TAPS; i += 4)
   {
      __m128 buf_l = _mm_loadu_ps(buffer_l + i);
      __m128 buf_r = _mm_loadu_ps(buffer_r + i);

      __m128 phases = _mm_load_ps(phase_table + i);
      __m128 deltas = _mm_load_ps(delta_table + i);
      __m128 sinc = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));

      sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
      sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
   }

   // Them annoying shuffles :V
   // sum_l = { l3, l2, l1, l0 }
   // sum_r = { r3, r2, r1, r0 }
   __m128 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)),
         _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));

   // sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
   // sum = { R1, R0, L1, L0 }
   sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);

   // sum = { R1, R1, L1, L1 } + { R1, R0, L1, L0 }
   // sum = { X,  R,  X,  L }

   // Store L
   _mm_store_ss(out_buffer + 0, sum);

   // movehl { X, R, X, L } == { X, R, X, R }
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
inline vector4f haddp(const vector4f* row)
{
#if SSE_INSTR_SET >= 3  // SSE3
    return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                       _mm_hadd_ps(row[2], row[3]));
#else
    __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
    __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
    __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
    tmp0 = _mm_add_ps(tmp0, tmp1);
    tmp1 = _mm_unpacklo_ps(row[2], row[3]);
    tmp1 = _mm_add_ps(tmp1, tmp2);
    tmp2 = _mm_movehl_ps(tmp1, tmp0);
    tmp0 = _mm_movelh_ps(tmp0, tmp1);
    return _mm_add_ps(tmp0, tmp2);
#endif
}
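/* haddp reduces four vectors at once: lane i of the result holds the
 * horizontal sum of row[i], which makes it handy for four simultaneous dot
 * products. An illustrative sketch, leaning on vector4f's implicit
 * conversions to and from __m128 that the code above already uses:
 */
inline vector4f dot4x4(const vector4f* a, const vector4f* b)
{
    vector4f rows[4] = {
        _mm_mul_ps(a[0], b[0]),
        _mm_mul_ps(a[1], b[1]),
        _mm_mul_ps(a[2], b[2]),
        _mm_mul_ps(a[3], b[3])
    };
    return haddp(rows);  // lane i = dot(a[i], b[i])
}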
// ----------------------------------------------------------
//  Name:  matrix::MinValue
//  Desc:  Returns the absolute minimum element of the matrix.
// ----------------------------------------------------------
float matrix::MinValue()
{
#ifdef _M_IX86
    F32vec4 min1 = _mm_min_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2));
    F32vec4 min2 = _mm_min_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4));
    F32vec4 min = _mm_min_ps(min1, min2);
    min = _mm_min_ps(min, _mm_movehl_ps(min, min));
    min = _mm_min_ss(min, _mm_shuffle_ps(min, min, 0x01));
    return min[0];
#else
    // Scalar fallback; uses fabsf to match the absolute-value semantics of
    // the SSE path above.
    float min = fabsf(this->operator()(0, 0));
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            if (fabsf(this->operator()(i, j)) < min)
                min = fabsf(this->operator()(i, j));
    return min;
#endif // _M_IX86
}
matrix4 matrix4::transposed() const
{
#ifdef __SSE__
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(x.v, y.v);
    tmp2 = _mm_unpacklo_ps(z.v, w.v);
    tmp1 = _mm_unpackhi_ps(x.v, y.v);
    tmp3 = _mm_unpackhi_ps(z.v, w.v);
    return matrix4(_mm_movelh_ps(tmp0, tmp2),
                   _mm_movehl_ps(tmp2, tmp0),
                   _mm_movelh_ps(tmp1, tmp3),
                   _mm_movehl_ps(tmp3, tmp1));
#else
    return matrix4(float4(x.x, y.x, z.x, w.x),
                   float4(x.y, y.y, z.y, w.y),
                   float4(x.z, y.z, z.z, w.z),
                   float4(x.w, y.w, z.w, w.w));
#endif
}
// ----------------------------------------------------------
//  Name:  matrix::MaxValue
//  Desc:  Returns the absolute maximum element of the matrix.
// ----------------------------------------------------------
float matrix::MaxValue()
{
#ifdef _M_IX86
    F32vec4 max1 = _mm_max_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2));
    F32vec4 max2 = _mm_max_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4));
    F32vec4 max = _mm_max_ps(max1, max2);
    max = _mm_max_ps(max, _mm_movehl_ps(max, max));
    max = _mm_max_ss(max, _mm_shuffle_ps(max, max, 0x01));
    return max[0];
#else
    // Scalar fallback; uses fabsf to match the absolute-value semantics of
    // the SSE path above.
    float max = fabsf(this->operator()(0, 0));
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            if (fabsf(this->operator()(i, j)) > max)
                max = fabsf(this->operator()(i, j));
    return max;
#endif // _M_IX86
}
void fast(element_t * const elements, const int num_elts, const float a)
{
    element_t * elts = elements;
    v4sf va   = _mm_set1_ps(a);         // exponent a, splatted
    v4sf v1_a = _mm_set1_ps(1.0f / a);  // exponent 1/a, splatted

    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    // elts->re = powf(powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a), 1.0f / a);
    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,0
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,0
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,0
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0

        v4sf t0 = _mm_unpacklo_ps(r0, r1); // x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); // x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); // z1,z2,0, 0
        v4sf t3 = _mm_unpackhi_ps(r2, r3); // z3,0, 0, 0

        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0

        // pow(x, a) = exp(a * log(x)); log_ps is exp_ps's companion from
        // sse_mathfun.h. (The original multiplied x by log(a), which
        // computes a**x rather than the intended x**a.)
        v4sf ex0 = exp_ps(_mm_mul_ps(va, log_ps(r0))); // pow(x1, a), pow(x2, a), pow(x3, a), (lane 3 unused)
        v4sf ex1 = exp_ps(_mm_mul_ps(va, log_ps(r1))); // pow(y1, a), ...
        v4sf ex2 = exp_ps(_mm_mul_ps(va, log_ps(r2))); // pow(z1, a), ...

        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(s1, ex2);

        // pow(sum, 1/a) = exp((1/a) * log(sum))
        v4sf es = exp_ps(_mm_mul_ps(v1_a, log_ps(s2)));

        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];

        elts += 3;
    }
}
v4f step_t::operator () (float t) const
{
  // Evaluate the polynomial f by Estrin's method. Return
  //   (0 0 0 0)  if t <  t0,
  //   (f f f f)  if t0 <= t < t1,
  //   (1 1 1 1)  if t >= t1.
  v4f c4 = load4f (c);
  v4f one = { 1.0f, 1.0f, 1.0f, 1.0f };
  v4f tttt = _mm_set1_ps (t);            // t t t t
  v4f tt = _mm_unpacklo_ps (one, tttt);  // 1 t 1 t
  v4f f0 = c4 * tt;                      // c0 c1*t c2 c3*t
  v4f ha = _mm_hadd_ps (f0, f0) * tt * tt;
  v4f f = _mm_hadd_ps (ha, ha);          // f f f f
  v4f f1 = _mm_unpacklo_ps (f, one);     // f 1 f 1
  v4f tx = load4f (T);                   // t0 t1 t1 inf
  v4f lo = _mm_movelh_ps (tx, tx);       // t0 t1 t0 t1
  v4f hi = _mm_movehl_ps (tx, tx);       // t1 inf t1 inf
  v4f sel = _mm_and_ps (_mm_cmpge_ps (tttt, lo), _mm_cmplt_ps (tttt, hi));
  v4f val = _mm_and_ps (sel, f1);        // f? 1? f? 1?
  return _mm_hadd_ps (val, val);
}
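/* Scalar reference for the SIMD evaluation above (illustrative; c holds the
 * cubic's coefficients c0..c3 and [t0, t1) is the ramp interval):
 */
static inline float step_scalar(const float c[4], float t0, float t1, float t)
{
    if (t < t0) return 0.0f;
    if (t >= t1) return 1.0f;
    /* Estrin: (c0 + c1*t) + (c2 + c3*t) * t*t */
    return (c[0] + c[1] * t) + (c[2] + c[3] * t) * (t * t);
}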
void R_LocalPointToGlobal( const float modelMatrix[16], const idVec3 &in, idVec3 &out )
{
#if defined(MACOS_X) && defined(__i386__)
	__m128 m0, m1, m2, m3;
	__m128 in0, in1, in2;
	float i0, i1, i2;

	i0 = in[0];
	i1 = in[1];
	i2 = in[2];

	m0 = _mm_loadu_ps(&modelMatrix[0]);
	m1 = _mm_loadu_ps(&modelMatrix[4]);
	m2 = _mm_loadu_ps(&modelMatrix[8]);
	m3 = _mm_loadu_ps(&modelMatrix[12]);

	in0 = _mm_load1_ps(&i0);
	in1 = _mm_load1_ps(&i1);
	in2 = _mm_load1_ps(&i2);

	m0 = _mm_mul_ps(m0, in0);
	m1 = _mm_mul_ps(m1, in1);
	m2 = _mm_mul_ps(m2, in2);

	m0 = _mm_add_ps(m0, m1);
	m0 = _mm_add_ps(m0, m2);
	m0 = _mm_add_ps(m0, m3);

	_mm_store_ss(&out[0], m0);
	m1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m0), 0x55));
	_mm_store_ss(&out[1], m1);
	m2 = _mm_movehl_ps(m2, m0);
	_mm_store_ss(&out[2], m2);
#else
	out[0] = in[0] * modelMatrix[0] + in[1] * modelMatrix[4] + in[2] * modelMatrix[8] + modelMatrix[12];
	out[1] = in[0] * modelMatrix[1] + in[1] * modelMatrix[5] + in[2] * modelMatrix[9] + modelMatrix[13];
	out[2] = in[0] * modelMatrix[2] + in[1] * modelMatrix[6] + in[2] * modelMatrix[10] + modelMatrix[14];
#endif
}
static int _ccv_nnc_gemm_forw_sse2(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
{
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
	const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
	const int* bdim = (b_nd == 1) ? b->info.dim : b->info.dim + 1;
	assert(bdim[0] == bias->info.dim[0]);
	assert(bdim[0] == w->info.dim[0]);
	assert(adim[0] == w->info.dim[1]);
	const int* ainc = CCV_IS_TENSOR_VIEW(a) ? (a_nd == 1 ? a->inc : a->inc + 1) : adim;
	const int* binc = CCV_IS_TENSOR_VIEW(b) ? (b_nd == 1 ? b->inc : b->inc + 1) : bdim;
	const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
	const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
	int i;
	for (i = 0; i < batch_size; i++)
	{
		const float* const ap = a->data.f32 + i * ainc[0];
		float* const bp = b->data.f32 + i * binc[0];
		parallel_for(j, bdim[0]) {
			const float* const wp = w->data.f32 + j * winc[1];
			int k;
			__m128 v40 = _mm_set_ss(bias->data.f32[j]);
			__m128 v41 = _mm_setzero_ps();
			for (k = 0; k < adim[0] - 7; k += 8)
			{
				__m128 ap40 = _mm_load_ps(ap + k);
				__m128 ap41 = _mm_load_ps(ap + k + 4);
				__m128 w40 = _mm_load_ps(wp + k);
				__m128 w41 = _mm_load_ps(wp + k + 4);
				v40 = _mm_add_ps(_mm_mul_ps(w40, ap40), v40);
				v41 = _mm_add_ps(_mm_mul_ps(w41, ap41), v41);
			}
			v40 = _mm_add_ps(v40, v41);
			v41 = _mm_add_ps(v40, _mm_movehl_ps(v40, v40));
			v40 = _mm_add_ss(v41, _mm_shuffle_ps(v41, v41, 1));
			_mm_store_ss(bp + j, v40);
		} parallel_endfor
	}
	return CCV_NNC_EXEC_SUCCESS;
}
// ----------------------------------------------------------
//  Name:  matrix::Determinant
//  Desc:  Return the matrix determinant. A = det[M].
// ----------------------------------------------------------
float matrix::Determinant()
{
#ifdef _M_IX86
    __m128 Va, Vb, Vc;
    __m128 r1, r2, r3, t1, t2, sum;
    F32vec4 Det;

    // First, let's calculate the first four minterms of the first line.
    t1 = _L4;
    t2 = _mm_ror_ps(_L3, 1);
    // V3'·V4
    Vc = _mm_mul_ps(t2, _mm_ror_ps(t1, 0));
    // V3'·V4"
    Va = _mm_mul_ps(t2, _mm_ror_ps(t1, 2));
    // V3'·V4^
    Vb = _mm_mul_ps(t2, _mm_ror_ps(t1, 3));

    // V3"·V4^ - V3^·V4"
    r1 = _mm_sub_ps(_mm_ror_ps(Va, 1), _mm_ror_ps(Vc, 2));
    // V3^·V4' - V3'·V4^
    r2 = _mm_sub_ps(_mm_ror_ps(Vb, 2), _mm_ror_ps(Vb, 0));
    // V3'·V4" - V3"·V4'
    r3 = _mm_sub_ps(_mm_ror_ps(Va, 0), _mm_ror_ps(Vc, 1));

    Va = _mm_ror_ps(_L2, 1);
    sum = _mm_mul_ps(Va, r1);
    Vb = _mm_ror_ps(Va, 1);
    sum = _mm_add_ps(sum, _mm_mul_ps(Vb, r2));
    Vc = _mm_ror_ps(Vb, 1);
    sum = _mm_add_ps(sum, _mm_mul_ps(Vc, r3));

    // Now we can calculate the determinant:
    Det = _mm_mul_ps(sum, _L1);
    Det = _mm_add_ps(Det, _mm_movehl_ps(Det, Det));
    Det = _mm_sub_ss(Det, _mm_shuffle_ps(Det, Det, 1));
    return Det[0];
#else
    // TODO
    return 0.0f;
#endif // _M_IX86
}
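/* The matrix::MinValue/MaxValue/Determinant snippets above are in the style
 * of Intel's AP-928/AP-929 application notes, which define _mm_abs_ps and
 * _mm_ror_ps as helper macros rather than real intrinsics. Plausible
 * definitions (an assumption, shown for reference):
 */
#include <xmmintrin.h>

/* Clear the sign bit of each lane. */
#define _mm_abs_ps(v) \
    _mm_andnot_ps(_mm_set1_ps(-0.0f), (v))

/* Rotate the four lanes right by i positions (i = 0..3). */
#define _mm_ror_ps(v, i) \
    (((i) % 4) ? _mm_shuffle_ps((v), (v), \
        _MM_SHUFFLE(((unsigned char)(i) + 3) % 4, \
                    ((unsigned char)(i) + 2) % 4, \
                    ((unsigned char)(i) + 1) % 4, \
                    ((unsigned char)(i) + 0) % 4)) : (v))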