int QpskSymbolMapper::process(Buffer* const dataIn, Buffer* dataOut) { PDEBUG("QpskSymbolMapper::process" "(dataIn: %p, dataOut: %p)\n", dataIn, dataOut); dataOut->setLength(dataIn->getLength() * 4 * 2 * sizeof(float)); // 4 output complex symbols per input byte #ifdef __SSE__ const uint8_t* in = reinterpret_cast<const uint8_t*>(dataIn->getData()); __m128* out = reinterpret_cast<__m128*>(dataOut->getData()); if (dataIn->getLength() % (d_carriers / 4) != 0) { fprintf(stderr, "%zu (input size) %% (%zu (carriers) / 4) != 0\n", dataIn->getLength(), d_carriers); throw std::runtime_error( "QpskSymbolMapper::process input size not valid!"); } const static __m128 symbols[16] = { _mm_setr_ps( M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2), _mm_setr_ps( M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2), _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2), _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2), _mm_setr_ps( M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2), _mm_setr_ps( M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2), _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2), _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2), _mm_setr_ps(-M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2), _mm_setr_ps(-M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2), _mm_setr_ps(-M_SQRT1_2,- M_SQRT1_2, M_SQRT1_2, M_SQRT1_2), _mm_setr_ps(-M_SQRT1_2,- M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2), _mm_setr_ps(-M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2), _mm_setr_ps(-M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2), _mm_setr_ps(-M_SQRT1_2,- M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2), _mm_setr_ps(-M_SQRT1_2,- M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2) }; size_t inOffset = 0; size_t outOffset = 0; uint8_t tmp = 0; for (size_t i = 0; i < dataIn->getLength(); i += d_carriers / 4) { for (size_t j = 0; j < d_carriers / 8; ++j) { tmp = (in[inOffset] & 0xc0) >> 4; tmp |= (in[inOffset + (d_carriers / 8)] & 0xc0) >> 6; out[outOffset] = symbols[tmp]; tmp = (in[inOffset] & 0x30) >> 2; tmp |= (in[inOffset + (d_carriers / 8)] & 0x30) >> 4; out[outOffset + 1] = symbols[tmp]; tmp = (in[inOffset] & 0x0c); tmp |= (in[inOffset + (d_carriers / 8)] & 0x0c) >> 2; out[outOffset + 2] = symbols[tmp]; tmp = (in[inOffset] & 0x03) << 2; tmp |= (in[inOffset + (d_carriers / 8)] & 0x03); out[outOffset + 3] = symbols[tmp]; ++inOffset; outOffset += 4; } inOffset += d_carriers / 8; } #else // !__SSE__ const uint8_t* in = reinterpret_cast<const uint8_t*>(dataIn->getData()); float* out = reinterpret_cast<float*>(dataOut->getData()); if (dataIn->getLength() % (d_carriers / 4) != 0) { throw std::runtime_error( "QpskSymbolMapper::process input size not valid!"); } if (dataOut->getLength() / sizeof(float) != dataIn->getLength() * 4 * 2) { // 4 output complex symbols per input byte throw std::runtime_error( "QpskSymbolMapper::process output size not valid!"); } const static float symbols[16][4] = { { M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2}, { M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2}, { M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2}, { M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2}, { M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2}, { M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2}, { M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2}, { M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2}, {-M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2}, {-M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2}, {-M_SQRT1_2,- M_SQRT1_2, M_SQRT1_2, M_SQRT1_2}, {-M_SQRT1_2,- M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2}, {-M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2}, {-M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2}, {-M_SQRT1_2,- 
M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2}, {-M_SQRT1_2,- M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2} }; size_t inOffset = 0; size_t outOffset = 0; uint8_t tmp; for (size_t i = 0; i < dataIn->getLength(); i += d_carriers / 4) { for (size_t j = 0; j < d_carriers / 8; ++j) { tmp = (in[inOffset] & 0xc0) >> 4; tmp |= (in[inOffset + (d_carriers / 8)] & 0xc0) >> 6; memcpy(&out[outOffset], symbols[tmp], sizeof(float) * 4); tmp = (in[inOffset] & 0x30) >> 2; tmp |= (in[inOffset + (d_carriers / 8)] & 0x30) >> 4; memcpy(&out[outOffset + 4], symbols[tmp], sizeof(float) * 4); tmp = (in[inOffset] & 0x0c); tmp |= (in[inOffset + (d_carriers / 8)] & 0x0c) >> 2; memcpy(&out[outOffset + 8], symbols[tmp], sizeof(float) * 4); tmp = (in[inOffset] & 0x03) << 2; tmp |= (in[inOffset + (d_carriers / 8)] & 0x03); memcpy(&out[outOffset + 12], symbols[tmp], sizeof(float) * 4); ++inOffset; outOffset += 4*4; } inOffset += d_carriers / 8; } #endif // __SSE__ return 1; }
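For reference, the 16-entry symbol table above does not need to be written out by hand: each 4-bit index packs two bits from the current byte (bits 3:2) and two bits from the byte d_carriers/8 later (bits 1:0), and each bit simply selects the sign of one of the four M_SQRT1_2 components. A minimal scalar sketch that reconstructs one table entry (the helper name qpsk_table_entry is hypothetical):

#include <math.h>

/* Rebuilds one entry of the 16-entry QPSK table above. Bit 3 of the index
 * controls the sign of float 0, bit 1 of float 1, bit 2 of float 2, and
 * bit 0 of float 3; a set bit selects -M_SQRT1_2, a clear bit +M_SQRT1_2. */
static void qpsk_table_entry(unsigned idx, float out[4])
{
    out[0] = (idx & 0x8) ? -M_SQRT1_2 : M_SQRT1_2;
    out[1] = (idx & 0x2) ? -M_SQRT1_2 : M_SQRT1_2;
    out[2] = (idx & 0x4) ? -M_SQRT1_2 : M_SQRT1_2;
    out[3] = (idx & 0x1) ? -M_SQRT1_2 : M_SQRT1_2;
}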
static float ks_donor_acceptor(const float* xyz, const float* hcoords,
                               const int* nco_indices, int donor, int acceptor)
{
  /* Compute the Kabsch-Sander hydrogen bond energy between two residues
     in a single conformation.

     Parameters
     ----------
     xyz : array, shape=(n_atoms, 3)
         All of the atoms in this frame.
     hcoords : array, shape=(n_residues, 3)
         Coordinates of the backbone amide hydrogen of each residue.
     nco_indices : array, shape=(n_residues, 3)
         The indices of the backbone N, C, and O atoms of each residue.
     donor : int
         Index of the residue whose N-H group acts as the hydrogen bond
         proton donor (we look at its N and H).
     acceptor : int
         Index of the residue whose C=O group acts as the hydrogen bond
         acceptor (we look at its C and O).

     Returns
     -------
     energy : float
         The KS backbone hydrogen bond energy, in kcal/mol. A number under
         -0.5 is considered significant.
  */
  float energy;
  __m128 r_n, r_h, r_c, r_o, r_ho, r_nc, r_hc, r_no, d2_honchcno;
  __m128 coupling, recip_sqrt, one;

  one = _mm_set1_ps(1.0f);
  /* 332 (kcal*A/mol) * 0.42 * 0.2 * (1nm / 10 A) */
  coupling = _mm_setr_ps(-2.7888f, -2.7888f, 2.7888f, 2.7888f);
  r_n = load_float3(xyz + 3*nco_indices[3*donor]);
  r_h = load_float3(hcoords + 3*donor);
  r_c = load_float3(xyz + 3*nco_indices[3*acceptor + 1]);
  r_o = load_float3(xyz + 3*nco_indices[3*acceptor + 2]);

  /* printf("Donor Index %d\n", donor);
     printf("Acceptor Index %d\n", acceptor);
     printf("N index %d\n", 3*nco_indices[3*donor + 0]);
     printf("C index %d\n", 3*nco_indices[3*acceptor + 1]);
     printf("O index %d\n", 3*nco_indices[3*acceptor + 2]);
     printf("\nrN "); printf_m128(r_n);
     printf("rH "); printf_m128(r_h);
     printf("rC "); printf_m128(r_c);
     printf("rO "); printf_m128(r_o); */

  r_ho = _mm_sub_ps(r_h, r_o);
  r_hc = _mm_sub_ps(r_h, r_c);
  r_nc = _mm_sub_ps(r_n, r_c);
  r_no = _mm_sub_ps(r_n, r_o);

  /* compute all four dot products (each of the squared distances), and then */
  /* pack them into a single float4 using three shuffles. */
  d2_honchcno = _mm_shuffle_ps(
      _mm_shuffle_ps(_mm_dp_ps2(r_ho, r_ho, 0xF3), _mm_dp_ps2(r_nc, r_nc, 0xF3), _MM_SHUFFLE(0,1,0,1)),
      _mm_shuffle_ps(_mm_dp_ps2(r_hc, r_hc, 0xF3), _mm_dp_ps2(r_no, r_no, 0xF3), _MM_SHUFFLE(0,1,0,1)),
      _MM_SHUFFLE(2,0,2,0));

  /* rsqrt_ps is really not that accurate... */
  recip_sqrt = _mm_div_ps(one, _mm_sqrt_ps(d2_honchcno));
  energy = _mm_cvtss_f32(_mm_dp_ps2(coupling, recip_sqrt, 0xFF));
  // energy = _mm_cvtss_f32(_mm_dp_ps(coupling, _mm_rsqrt_ps(d2_honchcno), 0xFF));

  return (energy < -9.9f ? -9.9f : energy);
}
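_mm_dp_ps2 above appears to be a project-local wrapper; on SSE4.1 hardware the same squared-distance computation can be expressed with the standard _mm_dp_ps intrinsic. A minimal sketch, assuming SSE4.1 is available and that the unused w lane is zero (as a load_float3-style loader would typically leave it):

#include <smmintrin.h>  /* SSE4.1 */

/* Squared length of a 3-component vector held in an __m128 (w lane ignored).
 * The 0x71 mask multiplies lanes x, y, z and writes the sum to lane 0 only. */
static inline float sq_norm3(__m128 v)
{
    return _mm_cvtss_f32(_mm_dp_ps(v, v, 0x71));
}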
inline vector4f(float f0, float f1, float f2, float f3) : m_value(_mm_setr_ps(f0, f1, f2, f3)) { }
static inline Simd set(float a, float b, float c, float d, float e, float f, float g, float h) { Simd res; res.reg[0] = _mm_setr_ps(a, b, c, d); res.reg[1] = _mm_setr_ps(e, f, g, h); return res; }
static inline Simd set(float x, float y, float z, float w) { Simd res; res.reg = _mm_setr_ps(x, y, z, w); return res; }
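The set() wrappers above rely on _mm_setr_ps taking its arguments in memory order (lowest lane first), whereas _mm_set_ps takes them highest lane first. A small standalone check of the difference:

#include <stdio.h>
#include <xmmintrin.h>

int main(void)
{
    float r[4], s[4];
    _mm_storeu_ps(r, _mm_setr_ps(1.f, 2.f, 3.f, 4.f)); /* r = {1, 2, 3, 4} */
    _mm_storeu_ps(s, _mm_set_ps (1.f, 2.f, 3.f, 4.f)); /* s = {4, 3, 2, 1} */
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
    printf("%g %g %g %g\n", s[0], s[1], s[2], s[3]);
    return 0;
}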
namespace Demi { const ArrayFloat MathlibSSE2::HALF = _mm_set_ps1( 0.5f ); const ArrayFloat MathlibSSE2::ONE = _mm_set_ps1( 1.0f ); const ArrayFloat MathlibSSE2::THREE = _mm_set_ps1( 3.0f ); const ArrayFloat MathlibSSE2::NEG_ONE = _mm_set_ps1( -1.0f ); const ArrayFloat MathlibSSE2::fEpsilon = _mm_set_ps1( 1e-6f ); const ArrayFloat MathlibSSE2::fSqEpsilon = _mm_set_ps1( 1e-12f ); const ArrayFloat MathlibSSE2::OneMinusEpsilon= _mm_set_ps1( 1.0f - 1e-6f ); const ArrayFloat MathlibSSE2::FLOAT_MIN = _mm_set_ps1( std::numeric_limits<float>::min() ); const ArrayFloat MathlibSSE2::SIGN_MASK = _mm_set_ps1( -0.0f ); const ArrayFloat MathlibSSE2::INFINITEA = _mm_set_ps1( std::numeric_limits<float>::infinity() ); const ArrayFloat MathlibSSE2::MAX_NEG = _mm_set_ps1( -std::numeric_limits<float>::max() ); const ArrayFloat MathlibSSE2::MAX_POS = _mm_set_ps1( std::numeric_limits<float>::max() ); const ArrayFloat MathlibSSE2::LAST_AFFINE_COLUMN = _mm_setr_ps( 0, 0, 0, 1 ); static const float _PI = float( 4.0 * atan( 1.0 ) ); //We can't use Math::fDeg2Rad & Math::fRad2Deg directly because //it's not guaranteed to have been initialized first const ArrayFloat MathlibSSE2::PI = _mm_set_ps1( _PI ); const ArrayFloat MathlibSSE2::TWO_PI = _mm_set_ps1( 2.0f * _PI ); const ArrayFloat MathlibSSE2::fDeg2Rad = _mm_set_ps1( _PI / 180.0f ); const ArrayFloat MathlibSSE2::fRad2Deg = _mm_set_ps1( 180.0f / _PI ); const ArrayFloat MathlibSSE2::ONE_DIV_2PI= _mm_set_ps1( 1.0f / (2.0f * _PI) ); ArrayFloat MathlibSSE2::Sin4( ArrayFloat x ) { // Map arbitrary angle x to the range [-pi; +pi] without using division. // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON) // can replace the add, the sub, & the two muls for two mad ArrayFloat integralPart; x = _mm_add_ps( _mm_mul_ps( x, ONE_DIV_2PI ), HALF ); x = Modf4( x, integralPart ); x = _mm_sub_ps( _mm_mul_ps( x, TWO_PI ), PI ); return sin_ps( x ); } ArrayFloat MathlibSSE2::Cos4( ArrayFloat x ) { // Map arbitrary angle x to the range [-pi; +pi] without using division. // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON) // can replace the add, the sub, & the two muls for two mad ArrayFloat integralPart; x = _mm_add_ps( _mm_mul_ps( x, ONE_DIV_2PI ), HALF ); x = Modf4( x, integralPart ); x = _mm_sub_ps( _mm_mul_ps( x, TWO_PI ), PI ); return cos_ps( x ); } void MathlibSSE2::SinCos4( ArrayFloat x, ArrayFloat &outSin, ArrayFloat &outCos ) { // TODO: Improve accuracy by mapping to the range [-pi/4, pi/4] and swap // between cos & sin depending on which quadrant it fell: // Quadrant | sin | cos // n = 0 -> sin( x ), cos( x ) // n = 1 -> cos( x ), -sin( x ) // n = 2 -> -sin( x ), -cos( x ) // n = 3 -> -sin( x ), sin( x ) // See ARGUMENT REDUCTION FOR HUGE ARGUMENTS: // Good to the Last Bit // K. C. Ng and themembers of the FP group of SunPro // http://www.derekroconnor.net/Software/Ng--ArgReduction.pdf // -- Perhaps we can leave this to GSoC students? -- // Map arbitrary angle x to the range [-pi; +pi] without using division. // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. 
NEON) // can replace the add, the sub, & the two muls for two mad ArrayFloat integralPart; x = _mm_add_ps( _mm_mul_ps( x, ONE_DIV_2PI ), HALF ); x = Modf4( x, integralPart ); x = _mm_sub_ps( _mm_mul_ps( x, TWO_PI ), PI ); sincos_ps( x, &outSin, &outCos ); } const ArrayFloat BooleanMask4::mMasks[NUM_MASKS] = { _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )),//MASK_NONE _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0xffffffff )),//MASK_X _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0x00000000, 0xffffffff, 0x00000000 )),//MASK_Y _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0x00000000, 0xffffffff, 0xffffffff )),//MASK_XY _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0xffffffff, 0x00000000, 0x00000000 )),//MASK_Z _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0xffffffff, 0x00000000, 0xffffffff )),//MASK_XZ _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0xffffffff, 0xffffffff, 0x00000000 )),//MASK_YZ _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff )),//MASK_XYZ _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )),//MASK_W _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0xffffffff )),//MASK_XW _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0x00000000, 0xffffffff, 0x00000000 )),//MASK_YW _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0x00000000, 0xffffffff, 0xffffffff )),//MASK_XYW _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )),//MASK_ZW _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0xffffffff )),//MASK_XZW _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )),//MASK_YZW _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff )) //MASK_XYZW }; }
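The range reduction used by Sin4/Cos4/SinCos4 above maps an arbitrary angle into [-pi, +pi) with a multiply, a fractional-part extraction, and a multiply-subtract. A scalar sketch of the same idea, using floorf to take the fractional part (the SIMD Modf4 above is a project helper whose exact semantics are assumed here):

#include <math.h>

static float reduce_to_pi(float x)
{
    const float two_pi = 6.283185307179586f;
    float t = x * (1.0f / two_pi) + 0.5f;   /* avoid division, as above */
    t -= floorf(t);                         /* fractional part, in [0, 1) */
    return t * two_pi - 3.1415926535897932f; /* result in [-pi, +pi) */
}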
void fht_SSE2(FLOAT * fz, int n) { const FLOAT *tri = costab; int k4; FLOAT *fi, *gi; FLOAT const *fn; n <<= 1; /* to get BLKSIZE, because of 3DNow! ASM routine */ fn = fz + n; k4 = 4; do { FLOAT s1, c1; int i, k1, k2, k3, kx; kx = k4 >> 1; k1 = k4; k2 = k4 << 1; k3 = k2 + k1; k4 = k2 << 1; fi = fz; gi = fi + kx; do { FLOAT f0, f1, f2, f3; f1 = fi[0] - fi[k1]; f0 = fi[0] + fi[k1]; f3 = fi[k2] - fi[k3]; f2 = fi[k2] + fi[k3]; fi[k2] = f0 - f2; fi[0] = f0 + f2; fi[k3] = f1 - f3; fi[k1] = f1 + f3; f1 = gi[0] - gi[k1]; f0 = gi[0] + gi[k1]; f3 = SQRT2 * gi[k3]; f2 = SQRT2 * gi[k2]; gi[k2] = f0 - f2; gi[0] = f0 + f2; gi[k3] = f1 - f3; gi[k1] = f1 + f3; gi += k4; fi += k4; } while (fi < fn); c1 = tri[0]; s1 = tri[1]; for (i = 1; i < kx; i++) { __m128 v_s2; __m128 v_c2; __m128 v_c1; __m128 v_s1; FLOAT c2, s2, s1_2 = s1+s1; c2 = 1 - s1_2 * s1; s2 = s1_2 * c1; fi = fz + i; gi = fz + k1 - i; v_c1 = _mm_set_ps1(c1); v_s1 = _mm_set_ps1(s1); v_c2 = _mm_set_ps1(c2); v_s2 = _mm_set_ps1(s2); { static const vecfloat_union sign_mask = {{0x80000000,0,0,0}}; v_c1 = _mm_xor_ps(sign_mask._m128, v_c1); /* v_c1 := {-c1, +c1, +c1, +c1} */ } { static const vecfloat_union sign_mask = {{0,0x80000000,0,0}}; v_s1 = _mm_xor_ps(sign_mask._m128, v_s1); /* v_s1 := {+s1, -s1, +s1, +s1} */ } { static const vecfloat_union sign_mask = {{0,0,0x80000000,0x80000000}}; v_c2 = _mm_xor_ps(sign_mask._m128, v_c2); /* v_c2 := {+c2, +c2, -c2, -c2} */ } do { __m128 p, q, r; q = _mm_setr_ps(fi[k1], fi[k3], gi[k1], gi[k3]); /* Q := {fi_k1,fi_k3,gi_k1,gi_k3}*/ p = _mm_mul_ps(_mm_set_ps1(s2), q); /* P := s2 * Q */ q = _mm_mul_ps(v_c2, q); /* Q := c2 * Q */ q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1,0,3,2)); /* Q := {-c2*gi_k1,-c2*gi_k3,c2*fi_k1,c2*fi_k3} */ p = _mm_add_ps(p, q); r = _mm_setr_ps(gi[0], gi[k2], fi[0], fi[k2]); /* R := {gi_0,gi_k2,fi_0,fi_k2} */ q = _mm_sub_ps(r, p); /* Q := {gi_0-p0,gi_k2-p1,fi_0-p2,fi_k2-p3} */ r = _mm_add_ps(r, p); /* R := {gi_0+p0,gi_k2+p1,fi_0+p2,fi_k2+p3} */ p = _mm_shuffle_ps(q, r, _MM_SHUFFLE(2,0,2,0)); /* P := {q0,q2,r0,r2} */ p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3,1,2,0)); /* P := {q0,r0,q2,r2} */ q = _mm_shuffle_ps(q, r, _MM_SHUFFLE(3,1,3,1)); /* Q := {q1,q3,r1,r3} */ r = _mm_mul_ps(v_c1, q); q = _mm_mul_ps(v_s1, q); q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0,1,2,3)); /* Q := {q3,q2,q1,q0} */ q = _mm_add_ps(q, r); store4(_mm_sub_ps(p, q), &gi[k3], &gi[k2], &fi[k3], &fi[k2]); store4(_mm_add_ps(p, q), &gi[k1], &gi[ 0], &fi[k1], &fi[ 0]); gi += k4; fi += k4; } while (fi < fn); c2 = c1; c1 = c2 * tri[0] - s1 * tri[1]; s1 = c2 * tri[1] + s1 * tri[0]; } tri += 2; } while (k4 < n); }
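The sign_mask XOR trick above negates selected lanes of a vector by flipping only their sign bits, with no arithmetic. The original builds the mask through a union to stay SSE1-only; a sketch of the same operation using SSE2 integer intrinsics for brevity (here negating lanes 0 and 2):

#include <emmintrin.h>  /* SSE2, for _mm_setr_epi32 / _mm_castsi128_ps */

static inline __m128 negate_lanes_0_and_2(__m128 v)
{
    const __m128 mask = _mm_castsi128_ps(
        _mm_setr_epi32((int)0x80000000, 0, (int)0x80000000, 0));
    return _mm_xor_ps(v, mask);
}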
mlib_status __mlib_SignalLimit_F32S_F32S( mlib_f32 *dst, const mlib_f32 *src, const mlib_f32 *low, const mlib_f32 *high, mlib_s32 n) { mlib_s32 i, count; mlib_f32 tl0, tl1; mlib_f32 th0, th1; mlib_f32 x, x1; __m128 t_low, t_high; __m128 dx; mlib_f32 *psrc = (mlib_f32 *)src; mlib_f32 *pdst = (mlib_f32 *)dst; mlib_s32 samples = 2 * n; tl0 = low[0]; th0 = high[0]; tl1 = low[1]; th1 = high[1]; if ((tl0 > th0) || (tl1 > th1) || (n <= 0)) return (MLIB_FAILURE); count = (16 - ((mlib_addr)psrc & 15)) >> 2; if (count > samples) count = samples; for (i = 0; i < count - 1; i += 2) { x = (*psrc++); x = (x < tl0) ? tl0 : x; x = (x >= th0) ? th0 : x; (*pdst++) = x; x1 = (*psrc++); x1 = (x1 < tl1) ? tl1 : x1; x1 = (x1 >= th1) ? th1 : x1; (*pdst++) = x1; } if (count & 1) { x = (*psrc++); x = (x < tl0) ? tl0 : x; x = (x >= th0) ? th0 : x; (*pdst++) = x; } if (count & 1) { t_low = _mm_setr_ps(tl1, tl0, tl1, tl0); t_high = _mm_setr_ps(th1, th0, th1, th0); } else { t_low = _mm_setr_ps(tl0, tl1, tl0, tl1); t_high = _mm_setr_ps(th0, th1, th0, th1); } samples -= count; if ((mlib_addr)pdst & 15) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i < samples >> 2; i++) { dx = _mm_load_ps(psrc + 4 * i); dx = _mm_max_ps(dx, t_low); dx = _mm_min_ps(dx, t_high); _mm_storeu_ps(pdst + 4 * i, dx); } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i < samples >> 2; i++) { dx = _mm_load_ps(psrc + 4 * i); dx = _mm_max_ps(dx, t_low); dx = _mm_min_ps(dx, t_high); _mm_store_ps(pdst + 4 * i, dx); } } i <<= 2; psrc += i; pdst += i; if (count & 1 && i < samples) { x = (*psrc++); x = (x < tl1) ? tl1 : x; x = (x >= th1) ? th1 : x; (*pdst++) = x; i ++; } for (; i < samples - 1; i += 2) { x = (*psrc++); x = (x < tl0) ? tl0 : x; x = (x >= th0) ? th0 : x; (*pdst++) = x; x1 = (*psrc++); x1 = (x1 < tl1) ? tl1 : x1; x1 = (x1 >= th1) ? th1 : x1; (*pdst++) = x1; } return (MLIB_SUCCESS); }
void init_sse_data()
{
#ifdef HAVE_SSE
  if (A_s == 0) {
    posix_memalign ((void**)&A_s, 16, (sizeof(__m128)*12));
    A_s[0]  = _mm_setr_ps (  1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0 );
    A_s[1]  = _mm_setr_ps (  4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0 );
    A_s[2]  = _mm_setr_ps (  1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0 );
    A_s[3]  = _mm_setr_ps (  0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0 );
    A_s[4]  = _mm_setr_ps ( -0.5,  1.0, -0.5,  0.0 );
    A_s[5]  = _mm_setr_ps (  0.0, -2.0,  1.5,  0.0 );
    A_s[6]  = _mm_setr_ps (  0.5,  1.0, -1.5,  0.0 );
    A_s[7]  = _mm_setr_ps (  0.0,  0.0,  0.5,  0.0 );
    A_s[8]  = _mm_setr_ps (  1.0, -1.0,  0.0,  0.0 );
    A_s[9]  = _mm_setr_ps ( -2.0,  3.0,  0.0,  0.0 );
    A_s[10] = _mm_setr_ps (  1.0, -3.0,  0.0,  0.0 );
    A_s[11] = _mm_setr_ps (  0.0,  1.0,  0.0,  0.0 );
  }
#endif
#ifdef HAVE_SSE2
  if (A_d == 0) {
    posix_memalign ((void**)&A_d, 16, (sizeof(__m128d)*24));
    A_d[ 0] = _mm_setr_pd (  3.0/6.0, -1.0/6.0 );
    A_d[ 1] = _mm_setr_pd (  1.0/6.0, -3.0/6.0 );
    A_d[ 2] = _mm_setr_pd ( -6.0/6.0,  3.0/6.0 );
    A_d[ 3] = _mm_setr_pd (  4.0/6.0,  0.0/6.0 );
    A_d[ 4] = _mm_setr_pd (  3.0/6.0, -3.0/6.0 );
    A_d[ 5] = _mm_setr_pd (  1.0/6.0,  3.0/6.0 );
    A_d[ 6] = _mm_setr_pd (  0.0/6.0,  1.0/6.0 );
    A_d[ 7] = _mm_setr_pd (  0.0/6.0,  0.0/6.0 );
    A_d[ 8] = _mm_setr_pd ( -0.5,  0.0 );
    A_d[ 9] = _mm_setr_pd ( -0.5,  1.0 );
    A_d[10] = _mm_setr_pd (  1.5,  0.0 );
    A_d[11] = _mm_setr_pd (  0.0, -2.0 );
    A_d[12] = _mm_setr_pd ( -1.5,  0.0 );
    A_d[13] = _mm_setr_pd (  0.5,  1.0 );
    A_d[14] = _mm_setr_pd (  0.5,  0.0 );
    A_d[15] = _mm_setr_pd (  0.0,  0.0 );
    A_d[16] = _mm_setr_pd (  0.0,  0.0 );
    A_d[17] = _mm_setr_pd (  1.0, -1.0 );
    A_d[18] = _mm_setr_pd (  0.0,  0.0 );
    A_d[19] = _mm_setr_pd ( -2.0,  3.0 );
    A_d[20] = _mm_setr_pd (  0.0,  0.0 );
    A_d[21] = _mm_setr_pd (  1.0, -3.0 );
    A_d[22] = _mm_setr_pd (  0.0,  0.0 );
    A_d[23] = _mm_setr_pd (  0.0,  1.0 );
  }
#endif
}
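posix_memalign, used above with its return value ignored, reports failure through its return code rather than through errno. A small sketch of the same allocation with the result checked (the helper name alloc_m128 is hypothetical):

#include <stdlib.h>
#include <xmmintrin.h>

static __m128* alloc_m128(size_t count)
{
    void* p = NULL;
    if (posix_memalign(&p, 16, count * sizeof(__m128)) != 0)
        return NULL;          /* allocation failed; caller must handle it */
    return (__m128*)p;
}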
int main() { float *arr = get_arr(); // [4, 3, 2, 1] float *uarr = get_uarr(); // [5, 4, 3, 2] float *arr2 = get_arr2(); // [4, 3, 2, 1] float *uarr2 = get_uarr2(); // [5, 4, 3, 2] __m128 a = get_a(); // [8, 6, 4, 2] __m128 b = get_b(); // [1, 2, 3, 4] // Check that test data is like expected. Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned. Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned. // Test that aeq itself works and does not trivially return true on everything. Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false); #ifdef TEST_M64 Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false); #endif // SSE1 Load instructions: aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address. aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide. aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest. aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1 aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest. aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest. aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order. aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address. // SSE1 Set instructions: aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands. aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded. aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher. aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1 aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order. aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register. // SSE1 Move instructions: aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b. aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output. aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output. // SSE1 Store instructions: #ifdef TEST_M64 /*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value. /*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL; _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64. #endif _mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address. 
_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory. _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1 _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory. _mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output. _mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address. #ifdef TEST_M64 /*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint. #endif _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint. // SSE1 Arithmetic instructions: aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add. aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a. aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div. aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a. aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul. aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a. #ifdef TEST_M64 __m64 m1 = get_m1(); /*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts. /*M64*/aeq64( _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16. __m64 m2 = get_m2(); /*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar. /*M64*/aeq64( _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8. #endif aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub. aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a. // SSE1 Elementary Math functions: #ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass. aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x. aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged. aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x). aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged. #endif aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x). 
aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged. __m128 i1 = get_i1(); __m128 i2 = get_i2(); // SSE1 Logical instructions: #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2 aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR #endif // SSE1 Compare instructions: // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp == aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged. aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >= aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged. aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp > aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged. aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <= aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged. aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp < aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged. aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp != aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged. aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >= aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged. aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not > aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged. aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <= aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged. aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not < aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged. __m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN] __m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0] aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan. 
aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged. // Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan. #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged. #endif Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int. Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int. Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int. Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int. Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int. Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int. // The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP // exception when one of the input operands is either a QNaN or a SNaN. #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1); #endif Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0); Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0); Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1); Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1); #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0); #endif // SSE1 Convert instructions: __m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 e = get_e(); // [INF, -INF, 2.5, 3.5] __m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808] #ifdef TEST_M64 /*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128. /*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64. #endif aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128. aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss. #ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions. Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int. Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32. 
#endif #ifdef TEST_M64 /*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged. /*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float. /*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128. /*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64. /*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64. /*M64*/aeq64(_mm_cvtps_pi8(c), 0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64. /*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128. #endif aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged. Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float. Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64. #endif Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32. Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64. #endif Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64. #ifndef __EMSCRIPTEN__ // TODO: Not implemented. // SSE1 General support: unsigned int mask = _MM_GET_EXCEPTION_MASK(); _MM_SET_EXCEPTION_MASK(mask); unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE(); _MM_SET_FLUSH_ZERO_MODE(flushZeroMode); unsigned int roundingMode = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(roundingMode); unsigned int csr = _mm_getcsr(); _mm_setcsr(csr); unsigned char dummyData[4096]; _mm_prefetch(dummyData, _MM_HINT_T0); _mm_prefetch(dummyData, _MM_HINT_T1); _mm_prefetch(dummyData, _MM_HINT_T2); _mm_prefetch(dummyData, _MM_HINT_NTA); _mm_sfence(); #endif // SSE1 Misc instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64. /*M64*/Assert( _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8. #endif Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels. // SSE1 Probability/Statistics instructions: #ifdef TEST_M64 /*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s. 
/*M64*/aeq64( _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16. /*M64*/aeq64(_mm_avg_pu8(m1, m2), 0x7FEE9D4D43A23548ULL); // 8-way average uint8s. /*M64*/aeq64( _m_pavgb(m1, m2), 0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8. // SSE1 Special Math instructions: /*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way average uint16s. /*M64*/aeq64( _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16. /*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 4-way average uint16s. /*M64*/aeq64( _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8. /*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way average uint16s. /*M64*/aeq64( _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // is an alias to _mm_min_pi16. /*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 4-way average uint16s. /*M64*/aeq64( _m_pminub(m1, m2), 0xDC800110543210ULL); // is an alias to _mm_min_pu8. #endif // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max. aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged. aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min. aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged. // SSE1 Swizzle instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64. /*M64*/Assert( _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16. /*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64. /*M64*/aeq64( _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16. /*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64. /*M64*/aeq64( _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16. #endif aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f); aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f); // Transposing a matrix via the xmmintrin.h-provided intrinsic. __m128 c0 = a; // [8, 6, 4, 2] __m128 c1 = b; // [1, 2, 3, 4] __m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5] _MM_TRANSPOSE4_PS(c0, c1, c2, c3); aeq(c0, 2.5f, 4.5f, 4.f, 2.f); aeq(c1, 4.5f, 3.5f, 3.f, 4.f); aeq(c2, 6.5f, 2.5f, 2.f, 6.f); aeq(c3, 8.5f, 1.5f, 1.f, 8.f); // All done! if (numFailures == 0) printf("Success!\n"); else printf("%d tests failed!\n", numFailures); }
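_mm_movemask_ps, exercised in the tests above, gathers the sign bit of each lane into bits 0..3 of the result (lane 0 maps to bit 0). That is why _mm_set_ps(-1.f, 0.f, 1.f, NAN) yields 8: with _mm_set_ps the first argument lands in the highest lane. A standalone check:

#include <assert.h>
#include <xmmintrin.h>

int main(void)
{
    __m128 v = _mm_setr_ps(-1.f, 2.f, -0.f, 4.f);  /* lanes 0 and 2 negative */
    assert(_mm_movemask_ps(v) == 0x5);             /* bits 0 and 2 set */
    return 0;
}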
#include "AL/alc.h" #include "alMain.h" #include "alu.h" #include "alSource.h" #include "alAuxEffectSlot.h" #include "mixer_defs.h" static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2], const ALuint IrSize, ALfloat (*restrict Coeffs)[2], const ALfloat (*restrict CoeffStep)[2], ALfloat left, ALfloat right) { const __m128 lrlr = _mm_setr_ps(left, right, left, right); __m128 coeffs, deltas, imp0, imp1; __m128 vals = _mm_setzero_ps(); ALuint i; if((Offset&1)) { const ALuint o0 = Offset&HRIR_MASK; const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK; coeffs = _mm_load_ps(&Coeffs[0][0]); deltas = _mm_load_ps(&CoeffStep[0][0]); vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]); imp0 = _mm_mul_ps(lrlr, coeffs); coeffs = _mm_add_ps(coeffs, deltas); vals = _mm_add_ps(imp0, vals);
int main() { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> dis(0, 255); size_t max_iter = 20; size_t array_size = 800; size_t vector_size = array_size*4; vfloat32 *vX1, *vX2, *vY , *vY1 , *vY2; std::vector<float> vec1(vector_size) , vec2(vector_size) , vecy(vector_size , 0.) , vecy1(vector_size,0.) , vecy2(vector_size, 0.); // SIMD vectors must be 16 bits aligned vX1 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16); vX2 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16); vY =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16); vY1 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16); vY2 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16); vfloat32 vy = _mm_set_ps(0,0,0,0); int j = 0; // Initialize vectors and simd arrays for(size_t i = 0 ; i < array_size ; ++i) { float r1 = dis(gen) , r2 = dis(gen) , r3 = dis(gen) , r4 = dis(gen); float r5 = dis(gen) , r6 = dis(gen) , r7 = dis(gen) , r8 = dis(gen); vec1[j] = r1; vec1[j+1] = r2 ; vec1[j+2] = r3 ; vec1[j+3] = r4; vec2[j] = r5; vec2[j+1] = r6 ; vec2[j+2] = r7 ; vec2[j+3] = r8; vfloat32 vx1 = _mm_set_ps(r4 , r3 , r2 , r1 ); vfloat32 vx2 = _mm_set_ps(r8 , r7 , r6 , r5 ); _mm_store_ps((float*) &vX1[i], vx1); _mm_store_ps((float*) &vX2[i], vx2); _mm_store_ps((float*) &vY[i], vy); _mm_store_ps((float*) &vY1[i], vy); _mm_store_ps((float*) &vY2[i], vy); j +=4; } // test pour l'addition de vectors { auto start = std::chrono::steady_clock::now(); vectoradd_simd(vX1,vX2,vY,array_size); auto end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff = end-start; // std::cout << "vector addition time with simd: " << diff.count() << " s" << std::endl; start = std::chrono::steady_clock::now(); std::transform( vec1.begin() , vec1.end() , vec2.begin() , vecy.begin() , std::plus<float>()); end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff1 = end-start; // std::cout << "vector addition time without simd: " << diff1.count() << " s" << std::endl; j = 0; bool is_valid = true; for(size_t i = 0 ; i < array_size ; ++i) { float out[4] ; _mm_store_ps(out , vY[i]); if ( out[0] == vecy[j] && out[1] == vecy[j+1] && out[2] == vecy[j+2] && out[3] == vecy[j+3]) { j += 4;} else { is_valid = false; break; } } if(is_valid) { std::cout << "l'addition de vecteurs en simd est correcte" << std::endl; std::cout << "speedup obtained for vector addition with simd : " << diff1.count() / diff.count() << std::endl; } else { std::cout << " l'addition de vecteurs end simd est incorrecte" << std::endl; } std::cout << "\n"; } // test pour le dot product { auto start = std::chrono::steady_clock::now(); vfloat32 sres = vectordot_simd(vX1 , vX2 , array_size); auto end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff = end-start; // std::cout << "dot product time with simd: " << diff.count() << " s" << std::endl; start = std::chrono::steady_clock::now(); float res = std::inner_product( vec1.begin() , vec1.end() , vec2.begin() , 0. 
); end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff1 = end-start; // std::cout << "dot product time without simd: " << diff1.count() << " s" << std::endl; float out[4] ; _mm_store_ps( out , sres); if( std::abs(out[0] - res ) < 0.01f ) { std::cout << "le produit de vecteurs en simd est correct" << std::endl; std::cout << "speedup obtained for dot product with simd : " << diff1.count() / diff.count() << std::endl; } else {std::cout << "le produit de vecteurs en simd est incorrect : " << out[0] << " " << res << std::endl;} std::cout << "\n"; } // test for 1D filtre with rotation without border check { auto start = std::chrono::steady_clock::now(); float divide = 1./3. ; for(std::size_t i = 1 ; i < vector_size-1 ; ++i) { vecy1[i] = divide * ( vec1[i-1] + vec1[i] + vec1[i+1] ); } auto end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff1 = end-start;; start = std::chrono::steady_clock::now(); vectoravg3_simd(vX1 , vY1 , array_size); end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff = end-start; j = 4; bool is_valid = true; for(size_t i = 1 ; i < array_size-1 ; ++i) { float out[4] ; _mm_store_ps(out , vY1[i]); if ( is_valid == true && out[0] == vecy1[j] && out[1] == vecy1[j+1] && out[2] == vecy1[j+2] && out[3] == vecy1[j+3]) { j += 4;} else { is_valid = false; break; } } if(is_valid) { std::cout << "la filtre moyenneur en simd est correct" << std::endl; std::cout << "speedup obtained for average filter with simd : " << diff1.count() / diff.count() << std::endl; } else { std::cout << "la filtre moyenneur en simd est incorrect" << std::endl; } std::cout << "\n"; } bool valid_mandel = false; // test for mandelbrot { std::vector<float> mandel_test(4,0); std::vector<float> mandel_test1(4,0); std::vector<size_t> indx(4,0); vfloat32 mdt = _mm_set1_ps(0); vfloat32 mdt1 = _mm_set1_ps(0); mandel_test[0] = -0.70; mandel_test[1] = -0.80; mandel_test[2] = -0.90; mandel_test[3] = -1.00; mandel_test1[0] = +0.10; mandel_test1[1] = +0.30; mandel_test1[2] = +0.30; mandel_test1[3] = +0.40; mdt = _mm_setr_ps(-1.00, -0.90, -0.80, -0.70); mdt1 = _mm_setr_ps(+0.40, +0.30, +0.30, +0.10); auto start = std::chrono::steady_clock::now(); for(std::size_t i = 0 ; i < 4 ; ++i ) { indx[i] = mandelbrot_scalar(mandel_test[i] , mandel_test1[i] , max_iter ); } auto end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff1 = end-start;; start = std::chrono::steady_clock::now(); vuint32 res_mandel = mandelbrot_simd(mdt, mdt1 , max_iter); end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff = end-start; unsigned int out[4] __attribute__((aligned(16))) ; __m128i* po = (__m128i*) &out[0] ; _mm_store_si128(po, res_mandel); bool v1 = false , v2 = false; if( indx[0] == 20 && indx[1] == 8 && indx[2] == 10 && indx[3] == 6 ) { v1 = true; std::cout << "la fonction mandelbrot en scalaire est correcte" << std::endl; } else { std::cout << "la fonction mandelbrot en scalaire est incorrecte" << std::endl; std::cout << "le bon résultat est : 20 8 10 6 \n" << "vous avez obtenu : "; vec_display(indx,0); } if( out[3] == 20 && out[2] == 8 && out[1] == 10 && out[0] == 6 ) { v2 = true; std::cout << "la fonction mandelbrot en SIMD est correcte" << std::endl; } else { std::cout << "la fonction mandelbrot en SIMD est incorrecte" << std::endl; std::cout << "le bon résultat est 20 8 10 6 \n" << "vous avez obtenu : "; simd_display_i32(res_mandel); } if ( v1 && v2 ) { std::cout << "speedup obtained for mandelbrot : " << diff1.count() / diff.count() 
<< std::endl; valid_mandel = true; } } // test for mandelbrot function { if(valid_mandel) { std::cout << "\n-----------------------------" << std::endl; std::cout << "------ benchmandelbrot ------" << std::endl; std::cout << "-----------------------------\n" << std::endl; size_t h = SIZE , w = SIZE ; std::vector<size_t> indx(h*w,0); vfloat32 mdt = _mm_set1_ps(0); vfloat32 mdt1 = _mm_set1_ps(0); float a0 = -1.5 , a1 = +0.5; float b0 = -1.0 , b1 = +1.0; float avg_cycles_vec = 0; float avg_time_vec = 0; size_t num_iter = 200; for(size_t i =0 ; i < num_iter ; ++i) { auto start = std::chrono::steady_clock::now(); auto cycles_s = rdtsc(); calc_mandelbrot_scalar( indx , h , w , a0 , a1 , b0 , b1 , max_iter ); auto cycles_e = rdtsc(); auto end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff1 = end-start; avg_time_vec += diff1.count() ; avg_cycles_vec += cycles_e - cycles_s; } avg_time_vec /= num_iter ; avg_cycles_vec /= num_iter ; std::cout << " mandelbrot vector time : " << avg_time_vec << std::endl; std::cout << " mandelbrot vector cycles time : " << avg_cycles_vec << std::endl; vuint32 **Simd_indx = (vuint32**)_mm_malloc ((size_t)( h*sizeof(vuint32*)), 16); if (Simd_indx) { for (size_t i = 0; i < w; i++) { Simd_indx[i] = (vuint32*) _mm_malloc ((size_t) (w*sizeof(vuint32)), 16); } } float avg_cycles_simd = 0; float avg_time_simd = 0; for(size_t i = 0 ; i < num_iter ; ++i) { auto start = std::chrono::steady_clock::now(); auto cycles_s = rdtsc(); calc_mandelbrot_simd( Simd_indx , h , w , a0 , a1 , b0 , b1 , max_iter ); auto cycles_e = rdtsc(); auto end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff = end-start; avg_time_simd += diff.count() ; avg_cycles_simd += cycles_e - cycles_s; } avg_time_simd /= num_iter ; avg_cycles_simd /= num_iter ; std::cout << " mandelbrot SIMD time : " << avg_time_simd << std::endl; std::cout << " mandelbrot SIMD cycles time : " << avg_cycles_simd << std::endl; std::cout << "speedup obtained for mandelbrot : " << avg_time_vec / avg_time_simd << std::endl; std::cout << "speedup in cycles obtained for mandelbrot : " << avg_cycles_vec / avg_cycles_simd << std::endl; } } _mm_free(vX1); _mm_free(vX2); _mm_free(vY); _mm_free(vY1); _mm_free(vY2); }
void sgemm( int m, int n, float *A, float *C ) { __m128 a; __m128 b; __m128 c; int i, j, k, l; int mod = m%4; int end = m/4*4; int total = n*m; float num[4]; float* A_address; for( i = 0; i < end; i +=4 ){ for( k = 0; k < m; k++ ) { c = _mm_setzero_ps(); for( j = 0; j < total; j += m ) { a = _mm_loadu_ps(A + i + j); b = _mm_load1_ps(A + k + j); c = _mm_add_ps(c, _mm_mul_ps(a, b)); } _mm_storeu_ps(C + i + k*m, c); } }//Looks about right to me for a matrix where m is divisible by 4. if (mod != 0){ if (mod == 3){ for( i = end; i < m; i +=4 ){ for( k = 0; k < m; k++ ) { A_address = A + i; c = _mm_setzero_ps(); for( j = 0; j < total; j += m ) { a = _mm_setr_ps(*(A_address),*(A_address + 1),*(A_address + 2), 0); b = _mm_load1_ps(A + k + j); c = _mm_add_ps(c, _mm_mul_ps(a, b)); A_address += m; } _mm_storeu_ps(num, c); for (l = 0; l < 3; l ++){ *(C + i + k*m + l) = num[l]; } } } } else if (mod == 2){ for( i = end; i < m; i +=4 ){ for( k = 0; k < m; k++ ) { A_address = A + i; c = _mm_setzero_ps(); for( j = 0; j < total; j += m ) { a = _mm_setr_ps(*(A_address),*(A_address + 1),0 ,0); b = _mm_load1_ps(A + k + j); c = _mm_add_ps(c, _mm_mul_ps(a, b)); A_address += m; } _mm_storeu_ps(num, c); for (l = 0; l < 2; l ++){ *(C + i + k*m + l) = num[l]; } } } } else if (mod == 1){ for( i = end; i < m; i +=4 ){ for( k = 0; k < m; k++ ) { A_address = A + i; c = _mm_setzero_ps(); for( j = 0; j < total; j += m ) { a = _mm_setr_ps(*(A_address), 0, 0, 0); b = _mm_load1_ps(A + k + j); c = _mm_add_ps(c, _mm_mul_ps(a, b)); A_address += m; } _mm_storeu_ps(num, c); for (l = 0; l < 1; l ++){ *(C + i + k*m + l) = num[l]; } } } } } }
//---------------------------------------------------------------- // Transforms the AABB vertices to screen space once every frame // Also performs a coarse depth pre-test //---------------------------------------------------------------- PreTestResult TransformedAABBoxAVX::TransformAndPreTestAABBox(__m128 xformedPos[], const __m128 cumulativeMatrix[4], const float *pDepthSummary) { // w ends up being garbage, but it doesn't matter - we ignore it anyway. __m128 vCenter = _mm_loadu_ps(&mBBCenter.x); __m128 vHalf = _mm_loadu_ps(&mBBHalf.x); __m128 vMin = _mm_sub_ps(vCenter, vHalf); __m128 vMax = _mm_add_ps(vCenter, vHalf); // transforms __m128 xRow[2], yRow[2], zRow[2]; xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0]; xRow[1] = _mm_shuffle_ps(vMax, vMax, 0x00) * cumulativeMatrix[0]; yRow[0] = _mm_shuffle_ps(vMin, vMin, 0x55) * cumulativeMatrix[1]; yRow[1] = _mm_shuffle_ps(vMax, vMax, 0x55) * cumulativeMatrix[1]; zRow[0] = _mm_shuffle_ps(vMin, vMin, 0xaa) * cumulativeMatrix[2]; zRow[1] = _mm_shuffle_ps(vMax, vMax, 0xaa) * cumulativeMatrix[2]; __m128 zAllIn = _mm_castsi128_ps(_mm_set1_epi32(~0)); __m128 screenMin = _mm_set1_ps(FLT_MAX); __m128 screenMax = _mm_set1_ps(-FLT_MAX); for(UINT i = 0; i < AABB_VERTICES; i++) { // Transform the vertex __m128 vert = cumulativeMatrix[3]; vert += xRow[sBBxInd[i]]; vert += yRow[sBByInd[i]]; vert += zRow[sBBzInd[i]]; // We have inverted z; z is in front of near plane iff z <= w. __m128 vertZ = _mm_shuffle_ps(vert, vert, 0xaa); // vert.zzzz __m128 vertW = _mm_shuffle_ps(vert, vert, 0xff); // vert.wwww __m128 zIn = _mm_cmple_ps(vertZ, vertW); zAllIn = _mm_and_ps(zAllIn, zIn); // project xformedPos[i] = _mm_div_ps(vert, vertW); // update bounds screenMin = _mm_min_ps(screenMin, xformedPos[i]); screenMax = _mm_max_ps(screenMax, xformedPos[i]); } // if any of the verts are z-clipped, we (conservatively) say the box is in if(_mm_movemask_ps(zAllIn) != 0xf) return ePT_VISIBLE; // Clip against screen bounds screenMin = _mm_max_ps(screenMin, _mm_setr_ps(0.0f, 0.0f, 0.0f, -FLT_MAX)); screenMax = _mm_min_ps(screenMax, _mm_setr_ps((float) (SCREENW - 1), (float) (SCREENH - 1), 1.0f, FLT_MAX)); // Quick rejection test if(_mm_movemask_ps(_mm_cmplt_ps(screenMax, screenMin))) return ePT_INVISIBLE; // Prepare integer bounds __m128 minMaxXY = _mm_shuffle_ps(screenMin, screenMax, 0x44); // minX,minY,maxX,maxY __m128i minMaxXYi = _mm_cvtps_epi32(minMaxXY); __m128i minMaxXYis = _mm_srai_epi32(minMaxXYi, 3); __m128 maxZ = _mm_shuffle_ps(screenMax, screenMax, 0xaa); // Traverse all 8x8 blocks covered by 2d screen-space BBox; // if we know for sure that this box is behind the geometry we know is there, // we can stop. int rX0 = minMaxXYis.m128i_i32[0]; int rY0 = minMaxXYis.m128i_i32[1]; int rX1 = minMaxXYis.m128i_i32[2]; int rY1 = minMaxXYis.m128i_i32[3]; __m128 anyCloser = _mm_setzero_ps(); for(int by = rY0; by <= rY1; by++) { const float *srcRow = pDepthSummary + by * (SCREENW/BLOCK_SIZE); // If for any 8x8 block, maxZ is not less than (=behind) summarized // min Z, box might be visible. for(int bx = rX0; bx <= rX1; bx++) { anyCloser = _mm_or_ps(anyCloser, _mm_cmpnlt_ss(maxZ, _mm_load_ss(&srcRow[bx]))); } if(_mm_movemask_ps(anyCloser)) { return ePT_UNSURE; // okay, box might be in } } // If we get here, we know for sure that the box is fully behind the stuff in the // depth buffer. return ePT_INVISIBLE; }
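Expressions like _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0] above rely on compiler support for arithmetic operators on __m128 (a GCC/Clang vector extension, or operator overloads defined elsewhere in the project). A portable spelling of one such row product with explicit intrinsics:

#include <xmmintrin.h>

/* Broadcast lane 0 of v across all lanes and multiply by one matrix row. */
static inline __m128 broadcast_x_times_row(__m128 v, __m128 row)
{
    return _mm_mul_ps(_mm_shuffle_ps(v, v, 0x00), row);  /* v.xxxx * row */
}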
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax)
{
  assert(vertices.size() % 16 == 0);

  // Simple k-means clustering by normal direction to improve backface culling efficiency
  std::vector<__m128> quadNormals;
  for (size_t i = 0; i < vertices.size(); i += 4)
  {
    auto v0 = vertices[i + 0];
    auto v1 = vertices[i + 1];
    auto v2 = vertices[i + 2];
    auto v3 = vertices[i + 3];

    quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3))));
  }

  std::vector<__m128> centroids;
  std::vector<uint32_t> centroidAssignment;
  centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f));

  centroidAssignment.resize(vertices.size() / 4);

  bool anyChanged = true;
  for (int iter = 0; iter < 10 && anyChanged; ++iter)
  {
    anyChanged = false;

    for (size_t j = 0; j < quadNormals.size(); ++j)
    {
      __m128 normal = quadNormals[j];

      __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
      int bestCentroid = -1;
      for (size_t k = 0; k < centroids.size(); ++k)
      {
        __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
        if (_mm_comige_ss(distance, bestDistance))
        {
          bestDistance = distance;
          bestCentroid = int(k);
        }
      }

      if (centroidAssignment[j] != uint32_t(bestCentroid))
      {
        centroidAssignment[j] = uint32_t(bestCentroid);
        anyChanged = true;
      }
    }

    for (size_t k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = _mm_setzero_ps();
    }

    for (size_t j = 0; j < quadNormals.size(); ++j)
    {
      uint32_t k = centroidAssignment[j];
      centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]);
    }

    for (size_t k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = normalize(centroids[k]);
    }
  }

  std::vector<__m128> orderedVertices;
  for (size_t k = 0; k < centroids.size(); ++k)
  {
    for (size_t j = 0; j < vertices.size() / 4; ++j)
    {
      if (centroidAssignment[j] == k)
      {
        orderedVertices.push_back(vertices[4 * j + 0]);
        orderedVertices.push_back(vertices[4 * j + 1]);
        orderedVertices.push_back(vertices[4 * j + 2]);
        orderedVertices.push_back(vertices[4 * j + 3]);
      }
    }
  }

  auto occluder = std::make_unique<Occluder>();

  __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin));

  __m128 scalingX = _mm_set1_ps(2047.0f);
  __m128 scalingY = _mm_set1_ps(2047.0f);
  __m128 scalingZ = _mm_set1_ps(1023.0f);

  __m128 half = _mm_set1_ps(0.5f);

  for (size_t i = 0; i < orderedVertices.size(); i += 16)
  {
    for (int j = 0; j < 4; ++j)
    {
      // Transform into [0,1] space relative to bounding box
      __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0],  refMin), invExtents);
      __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4],  refMin), invExtents);
      __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8],  refMin), invExtents);
      __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);

      // Transpose into [xxxx][yyyy][zzzz][wwww]
      _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

      // Scale and truncate to int
      v0 = _mm_fmadd_ps(v0, scalingX, half);
      v1 = _mm_fmadd_ps(v1, scalingY, half);
      v2 = _mm_fmadd_ps(v2, scalingZ, half);

      __m128i X = _mm_cvttps_epi32(v0);
      __m128i Y = _mm_cvttps_epi32(v1);
      __m128i Z = _mm_cvttps_epi32(v2);

      // Pack to 11/11/10 format
      __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));

      occluder->m_vertexData.push_back(XYZ);
    }
  }

  occluder->m_refMin = refMin;
  occluder->m_refMax = refMax;

  __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity());
  __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity());

  for (size_t i = 0; i < orderedVertices.size(); ++i)
  {
    min = _mm_min_ps(vertices[i], min);
    max = _mm_max_ps(vertices[i], max);
  }

  // Set W = 1 - this is expected by frustum culling code
  min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000);
  max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000);

  occluder->m_boundsMin = min;
  occluder->m_boundsMax = max;

  occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f));

  return occluder;
}
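// For reference, a minimal sketch of undoing the 11/11/10 packing above and
// mapping the quantized coordinates back into the reference bounding box.
// unpackVertex is a hypothetical helper for illustration, not part of the
// Occluder interface shown here.
static void unpackVertex(uint32_t packed, __m128 refMin, __m128 refMax, float out[3])
{
  float extents[4], mins[4];
  _mm_storeu_ps(extents, _mm_sub_ps(refMax, refMin));
  _mm_storeu_ps(mins, refMin);

  uint32_t x = (packed >> 21) & 0x7FF; // 11 bits
  uint32_t y = (packed >> 10) & 0x7FF; // 11 bits
  uint32_t z =  packed        & 0x3FF; // 10 bits

  // Inverse of the scale-and-truncate step: map back to [0,1], then to world space
  out[0] = mins[0] + (float(x) / 2047.0f) * extents[0];
  out[1] = mins[1] + (float(y) / 2047.0f) * extents[1];
  out[2] = mins[2] + (float(z) / 1023.0f) * extents[2];
}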
float t[4];
uint32_t offset = 0;

float const d[2] = {
    line[2] - line[0],
    line[3] - line[1],
};
float const length_inv = 1.0f/sqrtf(d[0]*d[0] + d[1]*d[1]);
float const n[2] = {
    d[1]*length_inv,
    -d[0]*length_inv,
};
float const distance = line[0]*n[0] + line[1]*n[1];

/* TODO: investigate integer registers */
__m128 const distance_4 = _mm_set1_ps(distance);
__m128 const n0_4 = _mm_set1_ps(n[0]);
__m128 const n1_4 = _mm_set1_ps(n[1]);
__m128 const mask1_4 = _mm_set1_ps(1);
__m128 const mask2_4 = _mm_set1_ps(2);
__m128 const shift_4 = _mm_setr_ps(1, 3, 9, 27);

/* process cell ids */
for (uint32_t ii = cells_offset; ii < cells_count; ++ii) {
    uint32_t const id = cells[ii];
    if (id > dims[0]*dims[1]*dims[2]) {
        printf("big id %u\n", id);
        assert(0);
    }

    /* get coordinates of cell */
    /* cell grid is one less than data grid */
    uint32_t const i = id%dims[0];
    uint32_t const j = (id/dims[0])%dims[1];
    uint32_t const k = id/(dims[0]*dims[1]);
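/* Standalone sketch, separate from the truncated loop above: the broadcast
 * constants distance_4, n0_4 and n1_4 set up a four-wide signed-distance test
 * against the line, since dist(x, y) = x*n[0] + y*n[1] - distance. How the
 * result is combined with mask1_4/mask2_4/shift_4 is not shown in this
 * excerpt, so this helper is only an assumed usage. */
static __m128 signed_distance_4(__m128 x_4, __m128 y_4,
                                __m128 n0_4, __m128 n1_4, __m128 distance_4)
{
    return _mm_sub_ps(_mm_add_ps(_mm_mul_ps(x_4, n0_4),
                                 _mm_mul_ps(y_4, n1_4)),
                      distance_4);
}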
/* Computes C = A * A^T, where A is an m-by-n matrix stored column-major
 * (leading dimension m) and C is the resulting m-by-m matrix, also stored
 * column-major. The main loops are blocked 4x4 and unrolled with SSE;
 * the trailing code handles the m % 4 leftover rows. */
void sgemm( int m, int n, float *A, float *C )
{
    __m128 a;
    __m128 a1; __m128 a2; __m128 a3; __m128 a4;
    __m128 b;
    __m128 b1;  __m128 b2;  __m128 b3;  __m128 b4;
    __m128 b5;  __m128 b6;  __m128 b7;  __m128 b8;
    __m128 b9;  __m128 b10; __m128 b11; __m128 b12;
    __m128 b13; __m128 b14; __m128 b15; __m128 b16;
    __m128 c;
    __m128 c1; __m128 c2; __m128 c3; __m128 c4;

    int i, j, k, l;
    int mod = m%4;
    int end = m/4 * 4;
    int total = n*m;
    float num[4];
    float* A_address;
    float* C_address;
    int m4 = 4 * m;
    int m3 = 3 * m;
    int m2 = 2 * m;
    int end1 = total/m4 * m4;

    for( i = 0; i < end; i += 4 ){
        for( k = 0; k < end; k += 4 ) {
            c1 = _mm_setzero_ps();
            c2 = _mm_setzero_ps();
            c3 = _mm_setzero_ps();
            c4 = _mm_setzero_ps();

            float* A_address1 = A + i;
            float* A_address2 = A + k;
            float* A_address21 = A + k + 1;

            for( j = 0; j < end1; j += m4, A_address1 += m4, A_address2 += m4, A_address21 += m4){
                a1 = _mm_loadu_ps(A_address1);
                a2 = _mm_loadu_ps(A_address1 + m);
                a3 = _mm_loadu_ps(A_address1 + m2);
                a4 = _mm_loadu_ps(A_address1 + m3);

                b1 = _mm_load1_ps(A_address2);
                b2 = _mm_load1_ps(A_address2 + m);
                b3 = _mm_load1_ps(A_address2 + m2);
                b4 = _mm_load1_ps(A_address2 + m3);

                b5 = _mm_load1_ps(A_address21);
                b6 = _mm_load1_ps(A_address21 + m);
                b7 = _mm_load1_ps(A_address21 + m2);
                b8 = _mm_load1_ps(A_address21 + m3);

                b9  = _mm_load1_ps(A + k + 2 + j);
                b10 = _mm_load1_ps(A + k + 2 + j + m);
                b11 = _mm_load1_ps(A + k + 2 + j + m2);
                b12 = _mm_load1_ps(A + k + 2 + j + m3);

                b13 = _mm_load1_ps(A + k + 3 + j);
                b14 = _mm_load1_ps(A + k + 3 + j + m);
                b15 = _mm_load1_ps(A + k + 3 + j + m2);
                b16 = _mm_load1_ps(A + k + 3 + j + m3);

                c1 = _mm_add_ps(c1, _mm_mul_ps(a1, b1));
                c1 = _mm_add_ps(c1, _mm_mul_ps(a2, b2));
                c1 = _mm_add_ps(c1, _mm_mul_ps(a3, b3));
                c1 = _mm_add_ps(c1, _mm_mul_ps(a4, b4));

                c2 = _mm_add_ps(c2, _mm_mul_ps(a1, b5));
                c2 = _mm_add_ps(c2, _mm_mul_ps(a2, b6));
                c2 = _mm_add_ps(c2, _mm_mul_ps(a3, b7));
                c2 = _mm_add_ps(c2, _mm_mul_ps(a4, b8));

                c3 = _mm_add_ps(c3, _mm_mul_ps(a1, b9));
                c3 = _mm_add_ps(c3, _mm_mul_ps(a2, b10));
                c3 = _mm_add_ps(c3, _mm_mul_ps(a3, b11));
                c3 = _mm_add_ps(c3, _mm_mul_ps(a4, b12));

                c4 = _mm_add_ps(c4, _mm_mul_ps(a1, b13));
                c4 = _mm_add_ps(c4, _mm_mul_ps(a2, b14));
                c4 = _mm_add_ps(c4, _mm_mul_ps(a3, b15));
                c4 = _mm_add_ps(c4, _mm_mul_ps(a4, b16));
            }

            for( j = end1; j < total; j += m){
                a = _mm_loadu_ps(A + i + j);
                b1 = _mm_load1_ps(A + k + j);
                b2 = _mm_load1_ps(A + k + 1 + j);
                b3 = _mm_load1_ps(A + k + 2 + j);
                b4 = _mm_load1_ps(A + k + 3 + j);
                c1 = _mm_add_ps(c1, _mm_mul_ps(a, b1));
                c2 = _mm_add_ps(c2, _mm_mul_ps(a, b2));
                c3 = _mm_add_ps(c3, _mm_mul_ps(a, b3));
                c4 = _mm_add_ps(c4, _mm_mul_ps(a, b4));
            }

            _mm_storeu_ps(C + i + (k)*m,   c1);
            _mm_storeu_ps(C + i + (k+1)*m, c2);
            _mm_storeu_ps(C + i + (k+2)*m, c3);
            _mm_storeu_ps(C + i + (k+3)*m, c4);
        }

        for(k = end; k < m; k++){
            float* A_address1 = A + i;
            float* A_address2 = A + k;
            c = _mm_setzero_ps();
            for( j = 0; j < end1; j += m4, A_address1 += m4, A_address2 += m4){
                a1 = _mm_loadu_ps(A_address1);
                a2 = _mm_loadu_ps(A + i + j + m);
                a3 = _mm_loadu_ps(A + i + j + m2);
                a4 = _mm_loadu_ps(A + i + j + m3);
                b1 = _mm_load1_ps(A_address2);
                b2 = _mm_load1_ps(A + k + j + m);
                b3 = _mm_load1_ps(A + k + j + m2);
                b4 = _mm_load1_ps(A + k + j + m3);
                c = _mm_add_ps(c, _mm_mul_ps(a1, b1));
                c = _mm_add_ps(c, _mm_mul_ps(a2, b2));
                c = _mm_add_ps(c, _mm_mul_ps(a3, b3));
                c = _mm_add_ps(c, _mm_mul_ps(a4, b4));
            }
            for( j = end1; j < total; j += m){
                a = _mm_loadu_ps(A + i + j);
                b = _mm_load1_ps(A + k + j);
                c = _mm_add_ps(c, _mm_mul_ps(a, b));
            }
            _mm_storeu_ps(C + i + k*m, c);
        }
    }

    if (mod != 0){
        if (mod == 3){
            for( i = end; i < m; i += 4 ){
                for( k = 0; k < m; k++ ) {
                    A_address = A + i;
                    c = _mm_setzero_ps();
                    for( j = 0; j < total; j += m ) {
                        a = _mm_setr_ps(*(A_address), *(A_address + 1), *(A_address + 2), 0);
                        b = _mm_load1_ps(A + k + j);
                        c = _mm_add_ps(c, _mm_mul_ps(a, b));
                        A_address += m;
                    }
                    _mm_storeu_ps(num, c);
                    for (l = 0; l < 3; l++){
                        *(C + i + k*m + l) = num[l];
                    }
                }
            }
        }
        else if (mod == 2){
            for( i = end; i < m; i += 4 ){
                for( k = 0; k < m; k++ ) {
                    A_address = A + i;
                    c = _mm_setzero_ps();
                    for( j = 0; j < total; j += m ) {
                        a = _mm_setr_ps(*(A_address), *(A_address + 1), 0, 0);
                        b = _mm_load1_ps(A + k + j);
                        c = _mm_add_ps(c, _mm_mul_ps(a, b));
                        A_address += m;
                    }
                    _mm_storeu_ps(num, c);
                    for (l = 0; l < 2; l++){
                        *(C + i + k*m + l) = num[l];
                    }
                }
            }
        }
        else if (mod == 1){
            for( i = end; i < m; i += 4 ){
                for( k = 0; k < m; k++ ) {
                    A_address = A + i;
                    c = _mm_setzero_ps();
                    for( j = 0; j < total; j += m ) {
                        a = _mm_setr_ps(*(A_address), 0, 0, 0);
                        b = _mm_load1_ps(A + k + j);
                        c = _mm_add_ps(c, _mm_mul_ps(a, b));
                        A_address += m;
                    }
                    _mm_storeu_ps(num, c);
                    for (l = 0; l < 1; l++){
                        *(C + i + k*m + l) = num[l];
                    }
                }
            }
        }
    }
}
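/* A naive reference implementation, useful for checking the blocked SSE
 * version above against known results. Same (assumed) convention: A is
 * m-by-n, column-major with leading dimension m, and C = A * A^T is m-by-m,
 * also column-major. sgemm_naive is an illustrative name, not part of the
 * original code. */
void sgemm_naive( int m, int n, float *A, float *C )
{
    for (int k = 0; k < m; k++) {
        for (int i = 0; i < m; i++) {
            float sum = 0.0f;
            for (int col = 0; col < n; col++) {
                sum += A[i + col*m] * A[k + col*m];
            }
            C[i + k*m] = sum;
        }
    }
}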
void CAEUtil::ClampArray(float *data, uint32_t count)
{
#if !defined(HAVE_SSE) || !defined(__SSE__)
  for (uint32_t i = 0; i < count; ++i)
    data[i] = SoftClamp(data[i]);
#else
  const __m128 c1 = _mm_set_ps1(27.0f);
  const __m128 c2 = _mm_set_ps1(27.0f + 9.0f);

  /* work around invalid alignment */
  while (((uintptr_t)data & 0xF) && count > 0)
  {
    data[0] = SoftClamp(data[0]);
    ++data;
    --count;
  }

  uint32_t even = count & ~0x3;
  for (uint32_t i = 0; i < even; i += 4, data += 4)
  {
    /* tanh approx clamp */
    __m128 dt  = _mm_load_ps(data);
    __m128 tmp = _mm_mul_ps(dt, dt);
    *(__m128*)data = _mm_div_ps(
      _mm_mul_ps(dt, _mm_add_ps(c1, tmp)),
      _mm_add_ps(c2, tmp)
    );
  }

  if (even != count)
  {
    uint32_t odd = count - even;
    if (odd == 1)
      data[0] = SoftClamp(data[0]);
    else
    {
      __m128 dt;
      __m128 tmp;
      __m128 out;
      if (odd == 2)
      {
        /* tanh approx clamp */
        dt  = _mm_setr_ps(data[0], data[1], 0, 0);
        tmp = _mm_mul_ps(dt, dt);
        out = _mm_div_ps(
          _mm_mul_ps(dt, _mm_add_ps(c1, tmp)),
          _mm_add_ps(c2, tmp)
        );

        data[0] = ((float*)&out)[0];
        data[1] = ((float*)&out)[1];
      }
      else
      {
        /* tanh approx clamp */
        dt  = _mm_setr_ps(data[0], data[1], data[2], 0);
        tmp = _mm_mul_ps(dt, dt);
        out = _mm_div_ps(
          _mm_mul_ps(dt, _mm_add_ps(c1, tmp)),
          _mm_add_ps(c2, tmp)
        );

        data[0] = ((float*)&out)[0];
        data[1] = ((float*)&out)[1];
        data[2] = ((float*)&out)[2];
      }
    }
  }
#endif
}
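// For reference, a scalar clamp consistent with the vectorized math above:
// the SSE path evaluates x * (27 + x^2) / (27 + 9*x^2), a rational
// approximation of tanh(x). This sketch only mirrors that formula; it is an
// assumption about SoftClamp's behaviour, not the project's implementation.
static inline float SoftClampApprox(float x)
{
  return x * (27.0f + x * x) / (27.0f + 9.0f * x * x);
}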