int QpskSymbolMapper::process(Buffer* const dataIn, Buffer* dataOut)
{
    PDEBUG("QpskSymbolMapper::process"
            "(dataIn: %p, dataOut: %p)\n",
            dataIn, dataOut);

    dataOut->setLength(dataIn->getLength() * 4 * 2 * sizeof(float));   // 4 output complex symbols per input byte
#ifdef __SSE__
    const uint8_t* in = reinterpret_cast<const uint8_t*>(dataIn->getData());
    __m128* out = reinterpret_cast<__m128*>(dataOut->getData());

    if (dataIn->getLength() % (d_carriers / 4) != 0) {
        fprintf(stderr, "%zu (input size) %% (%zu (carriers) / 4) != 0\n",
                dataIn->getLength(), d_carriers);
        throw std::runtime_error(
                "QpskSymbolMapper::process input size not valid!");
    }

    const static __m128 symbols[16] = {
        _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
        _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
        _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
        _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
        _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
        _mm_setr_ps( M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2),
        _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
        _mm_setr_ps( M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2),
        _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
        _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
        _mm_setr_ps(-M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2),
        _mm_setr_ps(-M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2),
        _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
        _mm_setr_ps(-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2),
        _mm_setr_ps(-M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2),
        _mm_setr_ps(-M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2)
    };
    size_t inOffset = 0;
    size_t outOffset = 0;
    uint8_t tmp = 0;
    for (size_t i = 0; i < dataIn->getLength(); i += d_carriers / 4) {
        for (size_t j = 0; j < d_carriers / 8; ++j) {
            tmp =  (in[inOffset] & 0xc0) >> 4;
            tmp |= (in[inOffset + (d_carriers / 8)] & 0xc0) >> 6;
            out[outOffset] = symbols[tmp];
            tmp =  (in[inOffset] & 0x30) >> 2;
            tmp |= (in[inOffset + (d_carriers / 8)] & 0x30) >> 4;
            out[outOffset + 1] = symbols[tmp];
            tmp =  (in[inOffset] & 0x0c);
            tmp |= (in[inOffset + (d_carriers / 8)] & 0x0c) >> 2;
            out[outOffset + 2] = symbols[tmp];
            tmp =  (in[inOffset] & 0x03) << 2;
            tmp |= (in[inOffset + (d_carriers / 8)] & 0x03);
            out[outOffset + 3] = symbols[tmp];
            ++inOffset;
            outOffset += 4;
        }
        inOffset += d_carriers / 8;
    }
#else // !__SSE__
    const uint8_t* in = reinterpret_cast<const uint8_t*>(dataIn->getData());
    float* out = reinterpret_cast<float*>(dataOut->getData());
    if (dataIn->getLength() % (d_carriers / 4) != 0) {
        throw std::runtime_error(
                "QpskSymbolMapper::process input size not valid!");
    }
    if (dataOut->getLength() / sizeof(float) != dataIn->getLength() * 4 * 2) {    // 4 output complex symbols per input byte
        throw std::runtime_error(
                "QpskSymbolMapper::process output size not valid!");
    }

    const static float symbols[16][4] = {
        { M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
        { M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
        { M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
        { M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
        { M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
        { M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2},
        { M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
        { M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2},
        {-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
        {-M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
        {-M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2,  M_SQRT1_2},
        {-M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2},
        {-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
        {-M_SQRT1_2,  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2},
        {-M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2},
        {-M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2}
    };
    size_t inOffset = 0;
    size_t outOffset = 0;
    uint8_t tmp;
    for (size_t i = 0; i < dataIn->getLength(); i += d_carriers / 4) {
        for (size_t j = 0; j < d_carriers / 8; ++j) {
            tmp =  (in[inOffset] & 0xc0) >> 4;
            tmp |= (in[inOffset + (d_carriers / 8)] & 0xc0) >> 6;
            memcpy(&out[outOffset], symbols[tmp], sizeof(float) * 4);
            tmp =  (in[inOffset] & 0x30) >> 2;
            tmp |= (in[inOffset + (d_carriers / 8)] & 0x30) >> 4;
            memcpy(&out[outOffset + 4], symbols[tmp], sizeof(float) * 4);
            tmp =  (in[inOffset] & 0x0c);
            tmp |= (in[inOffset + (d_carriers / 8)] & 0x0c) >> 2;
            memcpy(&out[outOffset + 8], symbols[tmp], sizeof(float) * 4);
            tmp =  (in[inOffset] & 0x03) << 2;
            tmp |= (in[inOffset + (d_carriers / 8)] & 0x03);
            memcpy(&out[outOffset + 12], symbols[tmp], sizeof(float) * 4);
            ++inOffset;
            outOffset += 4*4;
        }
        inOffset += d_carriers / 8;
    }
#endif // __SSE__

    return 1;
}
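/*
 * For orientation, a hedged scalar sketch of the rule the 16-entry symbol table
 * above encodes (hypothetical helper, not part of QpskSymbolMapper): a 0 bit maps
 * to +1/sqrt(2) and a 1 bit to -1/sqrt(2); the I component of each output symbol
 * is taken from a bit of in[n] and the Q component from the same bit position of
 * in[n + d_carriers/8], yielding the four QPSK symbols per input byte noted above.
 */
static inline void qpsk_map_bits(uint8_t i_bit, uint8_t q_bit, float& re, float& im)
{
    re = i_bit ? -float(M_SQRT1_2) : float(M_SQRT1_2);
    im = q_bit ? -float(M_SQRT1_2) : float(M_SQRT1_2);
}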
Example 2
static float ks_donor_acceptor(const float* xyz, const float* hcoords,
                               const int* nco_indices, int donor, int acceptor)
{
  /* Compute the Kabsch-Sander hydrogen bond energy between two residues
     in a single conformation.

     Parameters
     ----------
     xyz : array, shape=(n_atoms, 3)
         All of the atoms in this frame.
     hcoords : array, shape=(n_residues, 3)
         The coordinates of the backbone amide hydrogen of each residue.
     nco_indices : array, shape=(n_residues, 3)
         The indices of the backbone N, C, and O atoms of each residue.
     donor : int
         Index of the residue whose N-H group donates the hydrogen bond.
     acceptor : int
         Index of the residue whose C=O group accepts the hydrogen bond.

     Returns
     -------
     energy : float
         The KS backbone hydrogen bond energy, in kcal/mol. A value below -0.5
         is considered significant.
  */
  float energy;
  __m128 r_n, r_h, r_c, r_o, r_ho, r_nc, r_hc, r_no, d2_honchcno;
  __m128 coupling, recip_sqrt, one;
  one = _mm_set1_ps(1.0f);

  /* 332 (kcal*A/mol) * 0.42 * 0.2 * (1nm / 10 A) */
  coupling = _mm_setr_ps(-2.7888f, -2.7888f, 2.7888f, 2.7888f);
  r_n = load_float3(xyz + 3*nco_indices[3*donor]);
  r_h = load_float3(hcoords + 3*donor);
  r_c = load_float3(xyz + 3*nco_indices[3*acceptor + 1]);
  r_o = load_float3(xyz + 3*nco_indices[3*acceptor + 2]);

  /*
  printf("Donor Index %d\n", donor);
  printf("Acceptor Index %d\n", acceptor);
  printf("N index %d\n", 3*nco_indices[3*donor + 0]);
  printf("C index %d\n", 3*nco_indices[3*acceptor + 1]);
  printf("O index %d\n", 3*nco_indices[3*acceptor + 2]);
  printf("\nrN ");
  printf_m128(r_n);
  printf("rH ");
  printf_m128(r_h);
  printf("rC ");
  printf_m128(r_c);
  printf("rO ");
  printf_m128(r_o);
  */

  r_ho = _mm_sub_ps(r_h, r_o);
  r_hc = _mm_sub_ps(r_h, r_c);
  r_nc = _mm_sub_ps(r_n, r_c);
  r_no = _mm_sub_ps(r_n, r_o);

  /* compute all four dot products (each of the squared distances), and then */
  /* pack them into a single float4 using three shuffles. */
  d2_honchcno = _mm_shuffle_ps(_mm_shuffle_ps(_mm_dp_ps2(r_ho, r_ho, 0xF3), _mm_dp_ps2(r_nc, r_nc, 0xF3), _MM_SHUFFLE(0,1,0,1)),
                               _mm_shuffle_ps(_mm_dp_ps2(r_hc, r_hc, 0xF3), _mm_dp_ps2(r_no, r_no, 0xF3), _MM_SHUFFLE(0,1,0,1)),
                               _MM_SHUFFLE(2,0,2,0));

  /* rsqrt_ps is really not that accurate... */
  recip_sqrt = _mm_div_ps(one, _mm_sqrt_ps(d2_honchcno));
  energy = _mm_cvtss_f32(_mm_dp_ps2(coupling, recip_sqrt, 0xFF));
  // energy = _mm_cvtss_f32(_mm_dp_ps(coupling, _mm_rsqrt_ps(d2_honchcno), 0xFF));
  return (energy < -9.9f ? -9.9f : energy);
}
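/* A minimal scalar sketch of the same energy term, for reference (an
   illustration, not part of this code). With distances in Angstroms the
   Kabsch-Sander formula is
       E = 0.42 * 0.2 * 332 * (1/r_NO + 1/r_HC - 1/r_HO - 1/r_NC)  [kcal/mol];
   the SSE path above works in nm, hence the extra 0.1 folded into coupling. */
static float ks_energy_scalar_nm(float r_ho, float r_hc, float r_nc, float r_no)
{
  const float q = 0.42f * 0.20f * 332.0f * 0.1f;  /* == 2.7888 */
  float e = q * (1.0f / r_no + 1.0f / r_hc - 1.0f / r_ho - 1.0f / r_nc);
  return (e < -9.9f ? -9.9f : e);
}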
Example 3
 inline vector4f(float f0, float f1, float f2, float f3) : m_value(_mm_setr_ps(f0, f1, f2, f3)) {
 }
Example 4
	static inline Simd set(float a, float b, float c, float d, float e, float f, float g, float h) {
		Simd res;
		res.reg[0] = _mm_setr_ps(a, b, c, d);
		res.reg[1] = _mm_setr_ps(e, f, g, h);
		return res;
	}
Example 5
	static inline Simd set(float x, float y, float z, float w) {
		Simd res;
		res.reg = _mm_setr_ps(x, y, z, w);
		return res;
	}
Example 6
namespace Demi
{
    const ArrayFloat MathlibSSE2::HALF       = _mm_set_ps1( 0.5f );
    const ArrayFloat MathlibSSE2::ONE        = _mm_set_ps1( 1.0f );
    const ArrayFloat MathlibSSE2::THREE      = _mm_set_ps1( 3.0f );
    const ArrayFloat MathlibSSE2::NEG_ONE    = _mm_set_ps1( -1.0f );
    const ArrayFloat MathlibSSE2::fEpsilon   = _mm_set_ps1( 1e-6f );
    const ArrayFloat MathlibSSE2::fSqEpsilon = _mm_set_ps1( 1e-12f );
    const ArrayFloat MathlibSSE2::OneMinusEpsilon= _mm_set_ps1( 1.0f - 1e-6f );
    const ArrayFloat MathlibSSE2::FLOAT_MIN  = _mm_set_ps1( std::numeric_limits<float>::min() );
    const ArrayFloat MathlibSSE2::SIGN_MASK  = _mm_set_ps1( -0.0f );
    const ArrayFloat MathlibSSE2::INFINITEA  = _mm_set_ps1( std::numeric_limits<float>::infinity() );
    const ArrayFloat MathlibSSE2::MAX_NEG    = _mm_set_ps1( -std::numeric_limits<float>::max() );
    const ArrayFloat MathlibSSE2::MAX_POS    = _mm_set_ps1( std::numeric_limits<float>::max() );
    const ArrayFloat MathlibSSE2::LAST_AFFINE_COLUMN = _mm_setr_ps( 0, 0, 0, 1 );

    static const float _PI = float( 4.0 * atan( 1.0 ) );
    //We can't use Math::fDeg2Rad & Math::fRad2Deg directly because
    //it's not guaranteed to have been initialized first
    const ArrayFloat MathlibSSE2::PI         = _mm_set_ps1( _PI );
    const ArrayFloat MathlibSSE2::TWO_PI     = _mm_set_ps1( 2.0f * _PI );
    const ArrayFloat MathlibSSE2::fDeg2Rad   = _mm_set_ps1( _PI / 180.0f );
    const ArrayFloat MathlibSSE2::fRad2Deg   = _mm_set_ps1( 180.0f / _PI );

    const ArrayFloat MathlibSSE2::ONE_DIV_2PI= _mm_set_ps1( 1.0f / (2.0f * _PI) );

    
    ArrayFloat MathlibSSE2::Sin4( ArrayFloat x )
    {
        // Map arbitrary angle x to the range [-pi; +pi] without using division.
        // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON)
        // can replace the add, the sub, & the two muls for two mad
        ArrayFloat integralPart;
        x = _mm_add_ps( _mm_mul_ps( x, ONE_DIV_2PI ), HALF );
        x = Modf4( x, integralPart );
        x = _mm_sub_ps( _mm_mul_ps( x, TWO_PI ), PI );

        return sin_ps( x );
    }
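    /*
     * Scalar sketch of the range-reduction idea used above: fold x/(2*pi) + 0.5
     * into [0, 1) and map back, landing in [-pi, +pi) without a division. Here
     * floorf takes the place of the Modf4 helper used by the SIMD code; this is
     * an illustration, not part of MathlibSSE2.
     */
    static inline float ReduceToPi( float x )
    {
        const float twoPi = 6.2831853f;
        float t = x * (1.0f / twoPi) + 0.5f;   // x / (2*pi) + 0.5
        t -= floorf( t );                      // keep only the fractional part, in [0, 1)
        return t * twoPi - 3.1415927f;         // back to [-pi, +pi)
    }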
    
    ArrayFloat MathlibSSE2::Cos4( ArrayFloat x )
    {
        // Map arbitrary angle x to the range [-pi; +pi] without using division.
        // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON)
        // can replace the add, the sub, & the two muls for two mad
        ArrayFloat integralPart;
        x = _mm_add_ps( _mm_mul_ps( x, ONE_DIV_2PI ), HALF );
        x = Modf4( x, integralPart );
        x = _mm_sub_ps( _mm_mul_ps( x, TWO_PI ), PI );

        return cos_ps( x );
    }
    
    void MathlibSSE2::SinCos4( ArrayFloat x, ArrayFloat &outSin, ArrayFloat &outCos )
    {
        // TODO: Improve accuracy by mapping to the range [-pi/4, pi/4] and swap
        // between cos & sin depending on which quadrant it fell:
        // Quadrant | sin     |  cos
        // n = 0 ->  sin( x ),  cos( x )
        // n = 1 ->  cos( x ), -sin( x )
        // n = 2 -> -sin( x ), -cos( x )
        // n = 3 -> -cos( x ),  sin( x )
        // See ARGUMENT REDUCTION FOR HUGE ARGUMENTS:
        // Good to the Last Bit
        // K. C. Ng and the members of the FP group of SunPro
        // http://www.derekroconnor.net/Software/Ng--ArgReduction.pdf

        // -- Perhaps we can leave this to GSoC students? --

        // Map arbitrary angle x to the range [-pi; +pi] without using division.
        // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON)
        // can replace the add, the sub, & the two muls for two mad
        ArrayFloat integralPart;
        x = _mm_add_ps( _mm_mul_ps( x, ONE_DIV_2PI ), HALF );
        x = Modf4( x, integralPart );
        x = _mm_sub_ps( _mm_mul_ps( x, TWO_PI ), PI );

        sincos_ps( x, &outSin, &outCos );
    }

    const ArrayFloat BooleanMask4::mMasks[NUM_MASKS] =
    {
        _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )),//MASK_NONE
        _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0xffffffff )),//MASK_X
        _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0x00000000, 0xffffffff, 0x00000000 )),//MASK_Y
        _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0x00000000, 0xffffffff, 0xffffffff )),//MASK_XY
        _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0xffffffff, 0x00000000, 0x00000000 )),//MASK_Z
        _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0xffffffff, 0x00000000, 0xffffffff )),//MASK_XZ
        _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0xffffffff, 0xffffffff, 0x00000000 )),//MASK_YZ
        _mm_castsi128_ps(_mm_set_epi32( 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff )),//MASK_XYZ
        _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )),//MASK_W
        _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0xffffffff )),//MASK_XW
        _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0x00000000, 0xffffffff, 0x00000000 )),//MASK_YW
        _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0x00000000, 0xffffffff, 0xffffffff )),//MASK_XYW
        _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )),//MASK_ZW
        _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0xffffffff )),//MASK_XZW
        _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )),//MASK_YZW
        _mm_castsi128_ps(_mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff )) //MASK_XYZW
    };
}
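/* A hypothetical usage sketch for the masks above (not part of the library,
   and assuming mMasks is accessible from here): AND-ing a vector with one of
   the entries zeroes the unselected lanes. */
static inline __m128 KeepXY( __m128 v )
{
    /* index 3 is MASK_XY in the table above: keep x and y, zero z and w */
    return _mm_and_ps( v, Demi::BooleanMask4::mMasks[3] );
}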
Example 7
void
fht_SSE2(FLOAT * fz, int n)
{
    const FLOAT *tri = costab;
    int     k4;
    FLOAT  *fi, *gi;
    FLOAT const *fn;

    n <<= 1;            /* to get BLKSIZE, because of 3DNow! ASM routine */
    fn = fz + n;
    k4 = 4;
    do {
        FLOAT   s1, c1;
        int     i, k1, k2, k3, kx;
        kx = k4 >> 1;
        k1 = k4;
        k2 = k4 << 1;
        k3 = k2 + k1;
        k4 = k2 << 1;
        fi = fz;
        gi = fi + kx;
        do {
            FLOAT   f0, f1, f2, f3;
            f1 = fi[0] - fi[k1];
            f0 = fi[0] + fi[k1];
            f3 = fi[k2] - fi[k3];
            f2 = fi[k2] + fi[k3];
            fi[k2] = f0 - f2;
            fi[0] = f0 + f2;
            fi[k3] = f1 - f3;
            fi[k1] = f1 + f3;
            f1 = gi[0] - gi[k1];
            f0 = gi[0] + gi[k1];
            f3 = SQRT2 * gi[k3];
            f2 = SQRT2 * gi[k2];
            gi[k2] = f0 - f2;
            gi[0] = f0 + f2;
            gi[k3] = f1 - f3;
            gi[k1] = f1 + f3;
            gi += k4;
            fi += k4;
        } while (fi < fn);
        c1 = tri[0];
        s1 = tri[1];
        for (i = 1; i < kx; i++) {
            __m128 v_s2;
            __m128 v_c2;
            __m128 v_c1;
            __m128 v_s1;
            FLOAT   c2, s2, s1_2 = s1+s1;
            c2 = 1 - s1_2 * s1;
            s2 = s1_2 * c1;
            fi = fz + i;
            gi = fz + k1 - i;
            v_c1 = _mm_set_ps1(c1);
            v_s1 = _mm_set_ps1(s1);
            v_c2 = _mm_set_ps1(c2);
            v_s2 = _mm_set_ps1(s2);
            {
                static const vecfloat_union sign_mask = {{0x80000000,0,0,0}};
                v_c1 = _mm_xor_ps(sign_mask._m128, v_c1); /* v_c1 := {-c1, +c1, +c1, +c1} */
            }
            {
                static const vecfloat_union sign_mask = {{0,0x80000000,0,0}};
                v_s1 = _mm_xor_ps(sign_mask._m128, v_s1); /* v_s1 := {+s1, -s1, +s1, +s1} */
            }
            {
                static const vecfloat_union sign_mask = {{0,0,0x80000000,0x80000000}};
                v_c2 = _mm_xor_ps(sign_mask._m128, v_c2); /* v_c2 := {+c2, +c2, -c2, -c2} */
            }
            do {
                __m128 p, q, r;

                q = _mm_setr_ps(fi[k1], fi[k3], gi[k1], gi[k3]); /* Q := {fi_k1,fi_k3,gi_k1,gi_k3}*/
                p = _mm_mul_ps(_mm_set_ps1(s2), q);              /* P := s2 * Q */
                q = _mm_mul_ps(v_c2, q);                         /* Q := c2 * Q */
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1,0,3,2));  /* Q := {-c2*gi_k1,-c2*gi_k3,c2*fi_k1,c2*fi_k3} */
                p = _mm_add_ps(p, q);
                
                r = _mm_setr_ps(gi[0], gi[k2], fi[0], fi[k2]);   /* R := {gi_0,gi_k2,fi_0,fi_k2} */
                q = _mm_sub_ps(r, p);                            /* Q := {gi_0-p0,gi_k2-p1,fi_0-p2,fi_k2-p3} */
                r = _mm_add_ps(r, p);                            /* R := {gi_0+p0,gi_k2+p1,fi_0+p2,fi_k2+p3} */
                p = _mm_shuffle_ps(q, r, _MM_SHUFFLE(2,0,2,0));  /* P := {q0,q2,r0,r2} */
                p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3,1,2,0));  /* P := {q0,r0,q2,r2} */
                q = _mm_shuffle_ps(q, r, _MM_SHUFFLE(3,1,3,1));  /* Q := {q1,q3,r1,r3} */
                r = _mm_mul_ps(v_c1, q);
                q = _mm_mul_ps(v_s1, q);
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0,1,2,3));  /* Q := {q3,q2,q1,q0} */
                q = _mm_add_ps(q, r);

                store4(_mm_sub_ps(p, q), &gi[k3], &gi[k2], &fi[k3], &fi[k2]);
                store4(_mm_add_ps(p, q), &gi[k1], &gi[ 0], &fi[k1], &fi[ 0]);

                gi += k4;
                fi += k4;
            } while (fi < fn);
            c2 = c1;
            c1 = c2 * tri[0] - s1 * tri[1];
            s1 = c2 * tri[1] + s1 * tri[0];
        }
        tri += 2;
    } while (k4 < n);
}
Example 8
mlib_status
__mlib_SignalLimit_F32S_F32S(
    mlib_f32 *dst,
    const mlib_f32 *src,
    const mlib_f32 *low,
    const mlib_f32 *high,
    mlib_s32 n)
{
	mlib_s32 i, count;
	mlib_f32 tl0, tl1;
	mlib_f32 th0, th1;
	mlib_f32 x, x1;
	__m128 t_low, t_high;
	__m128 dx;
	mlib_f32 *psrc = (mlib_f32 *)src;
	mlib_f32 *pdst = (mlib_f32 *)dst;
	mlib_s32 samples = 2 * n;

	tl0 = low[0];
	th0 = high[0];
	tl1 = low[1];
	th1 = high[1];

	if ((tl0 > th0) || (tl1 > th1) || (n <= 0))
		return (MLIB_FAILURE);

	count = (16 - ((mlib_addr)psrc & 15)) >> 2;
	if (count > samples) count = samples;

	for (i = 0; i < count - 1; i += 2) {
		x = (*psrc++);
		x = (x < tl0) ? tl0 : x;
		x = (x >= th0) ? th0 : x;
		(*pdst++) = x;
		x1 = (*psrc++);
		x1 = (x1 < tl1) ? tl1 : x1;
		x1 = (x1 >= th1) ? th1 : x1;
		(*pdst++) = x1;
	}

	if (count & 1) {
		x = (*psrc++);
		x = (x < tl0) ? tl0 : x;
		x = (x >= th0) ? th0 : x;
		(*pdst++) = x;
	}

	if (count & 1) {
		t_low = _mm_setr_ps(tl1, tl0, tl1, tl0);
		t_high = _mm_setr_ps(th1, th0, th1, th0);
	} else {
		t_low = _mm_setr_ps(tl0, tl1, tl0, tl1);
		t_high = _mm_setr_ps(th0, th1, th0, th1);
	}

	samples -= count;

	if ((mlib_addr)pdst & 15) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (i = 0; i < samples >> 2; i++) {
			dx = _mm_load_ps(psrc + 4 * i);
			dx = _mm_max_ps(dx, t_low);
			dx = _mm_min_ps(dx, t_high);
			_mm_storeu_ps(pdst + 4 * i, dx);
		}
	} else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (i = 0; i < samples >> 2; i++) {
			dx = _mm_load_ps(psrc + 4 * i);
			dx = _mm_max_ps(dx, t_low);
			dx = _mm_min_ps(dx, t_high);
			_mm_store_ps(pdst + 4 * i, dx);
		}
	}

	i <<= 2;
	psrc += i;
	pdst += i;

	if (count & 1 && i < samples) {
		x = (*psrc++);
		x = (x < tl1) ? tl1 : x;
		x = (x >= th1) ? th1 : x;
		(*pdst++) = x;

		i ++;
	}

	for (; i < samples - 1; i += 2) {
		x = (*psrc++);
		x = (x < tl0) ? tl0 : x;
		x = (x >= th0) ? th0 : x;
		(*pdst++) = x;
		x1 = (*psrc++);
		x1 = (x1 < tl1) ? tl1 : x1;
		x1 = (x1 >= th1) ? th1 : x1;
		(*pdst++) = x1;
	}

	return (MLIB_SUCCESS);
}
Example 9
void init_sse_data()
{
#ifdef HAVE_SSE
  if (A_s == 0) {
    posix_memalign ((void**)&A_s, 16, (sizeof(__m128)*12));
    A_s[0]  = _mm_setr_ps ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0 );
    A_s[1]  = _mm_setr_ps ( 4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0 );	  
    A_s[2]  = _mm_setr_ps ( 1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0 );	  
    A_s[3]  = _mm_setr_ps ( 0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0 );	  
    A_s[4]  = _mm_setr_ps ( -0.5,  1.0, -0.5, 0.0  );		  
    A_s[5]  = _mm_setr_ps (  0.0, -2.0,  1.5, 0.0  );		  
    A_s[6]  = _mm_setr_ps (  0.5,  1.0, -1.5, 0.0  );		  
    A_s[7]  = _mm_setr_ps (  0.0,  0.0,  0.5, 0.0  );		  
    A_s[8]  = _mm_setr_ps (  1.0, -1.0,  0.0, 0.0  );		  
    A_s[9]  = _mm_setr_ps ( -2.0,  3.0,  0.0, 0.0  );		  
    A_s[10] = _mm_setr_ps (  1.0, -3.0,  0.0, 0.0  );		  
    A_s[11] = _mm_setr_ps (  0.0,  1.0,  0.0, 0.0  );                  
  }
                 
#endif
#ifdef HAVE_SSE2
  if (A_d == 0) {
    posix_memalign ((void**)&A_d, 16, (sizeof(__m128d)*24));
    A_d[ 0] = _mm_setr_pd (  3.0/6.0, -1.0/6.0 );	   
    A_d[ 1] = _mm_setr_pd (  1.0/6.0, -3.0/6.0 );	   
    A_d[ 2] = _mm_setr_pd ( -6.0/6.0,  3.0/6.0 );	   
    A_d[ 3] = _mm_setr_pd (  4.0/6.0,  0.0/6.0 );	   
    A_d[ 4] = _mm_setr_pd (  3.0/6.0, -3.0/6.0 );	   
    A_d[ 5] = _mm_setr_pd (  1.0/6.0,  3.0/6.0 );	   
    A_d[ 6] = _mm_setr_pd (  0.0/6.0,  1.0/6.0 );	   
    A_d[ 7] = _mm_setr_pd (  0.0/6.0,  0.0/6.0 );	   
    A_d[ 8] = _mm_setr_pd ( -0.5,  0.0 );		   
    A_d[ 9] = _mm_setr_pd ( -0.5,  1.0 );		   
    A_d[10] = _mm_setr_pd (  1.5,  0.0 );		   
    A_d[11] = _mm_setr_pd (  0.0, -2.0 );		   
    A_d[12] = _mm_setr_pd ( -1.5,  0.0 );		   
    A_d[13] = _mm_setr_pd (  0.5,  1.0 );		   
    A_d[14] = _mm_setr_pd (  0.5,  0.0 );		   
    A_d[15] = _mm_setr_pd (  0.0,  0.0 );		   
    A_d[16] = _mm_setr_pd (  0.0,  0.0 );		   
    A_d[17] = _mm_setr_pd (  1.0, -1.0 );		   
    A_d[18] = _mm_setr_pd (  0.0,  0.0 );		   
    A_d[19] = _mm_setr_pd ( -2.0,  3.0 );		   
    A_d[20] = _mm_setr_pd (  0.0,  0.0 );		   
    A_d[21] = _mm_setr_pd (  1.0, -3.0 );		   
    A_d[22] = _mm_setr_pd (  0.0,  0.0 );		   
    A_d[23] = _mm_setr_pd (  0.0,  1.0 );   
  }                
#endif
}
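/* Hedged reading of the single-precision tables above: A_s[0..3] look like the
   power-basis coefficients of the four cubic B-spline blending functions
   (e.g. A_s[0] encodes b0(t) = (1-t)^3 / 6), with A_s[4..7] and A_s[8..11]
   holding their first and second derivatives. Under that assumption, and
   assuming A_s is visible here, a sketch of evaluating the four weights at
   parameter t by dotting each row with (1, t, t^2, t^3): */
#ifdef HAVE_SSE
static inline void eval_bspline_weights (float t, float w[4])
{
  __m128 tp = _mm_setr_ps (1.0f, t, t*t, t*t*t);
  int i;
  for (i = 0; i < 4; i++) {
    float tmp[4];
    _mm_storeu_ps (tmp, _mm_mul_ps (A_s[i], tp));
    w[i] = tmp[0] + tmp[1] + tmp[2] + tmp[3];   /* horizontal sum */
  }
}
#endif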
Example 10
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f);
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f);

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
Example 11
#include "AL/alc.h"
#include "alMain.h"
#include "alu.h"

#include "alSource.h"
#include "alAuxEffectSlot.h"
#include "mixer_defs.h"


static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
                                   const ALuint IrSize,
                                   ALfloat (*restrict Coeffs)[2],
                                   const ALfloat (*restrict CoeffStep)[2],
                                   ALfloat left, ALfloat right)
{
    const __m128 lrlr = _mm_setr_ps(left, right, left, right);
    __m128 coeffs, deltas, imp0, imp1;
    __m128 vals = _mm_setzero_ps();
    ALuint i;

    if((Offset&1))
    {
        const ALuint o0 = Offset&HRIR_MASK;
        const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;

        coeffs = _mm_load_ps(&Coeffs[0][0]);
        deltas = _mm_load_ps(&CoeffStep[0][0]);
        vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
        imp0 = _mm_mul_ps(lrlr, coeffs);
        coeffs = _mm_add_ps(coeffs, deltas);
        vals = _mm_add_ps(imp0, vals);
Example 12
int main()
{
	std::random_device rd;
	std::mt19937 gen(rd());
	std::uniform_real_distribution<float> dis(0, 255);

	size_t max_iter = 20;
	size_t array_size = 800;
	size_t vector_size = array_size*4;

	vfloat32 *vX1, *vX2, *vY , *vY1 , *vY2;
	std::vector<float> vec1(vector_size) , vec2(vector_size) , vecy(vector_size , 0.) , vecy1(vector_size,0.)
	, vecy2(vector_size, 0.);

	// SIMD vectors must be 16-byte aligned
	vX1 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);
	vX2 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);
	vY  =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);
	vY1 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);
	vY2 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);

	vfloat32 vy = _mm_set_ps(0,0,0,0);

	int j = 0;

	// Initialize vectors and simd arrays
	for(size_t i = 0 ; i < array_size ; ++i)
	{
		float r1 = dis(gen) , r2 = dis(gen) , r3 = dis(gen) , r4 = dis(gen);
		float r5 = dis(gen) , r6 = dis(gen) , r7 = dis(gen) , r8 = dis(gen);

		vec1[j] = r1; vec1[j+1] = r2 ; vec1[j+2] = r3 ; vec1[j+3] = r4;
		vec2[j] = r5; vec2[j+1] = r6 ; vec2[j+2] = r7 ; vec2[j+3] = r8;

		vfloat32 vx1 = _mm_set_ps(r4 , r3 , r2 , r1  );
		vfloat32 vx2 = _mm_set_ps(r8 , r7 , r6 , r5  );

		_mm_store_ps((float*) &vX1[i], vx1);
		_mm_store_ps((float*) &vX2[i], vx2);
		_mm_store_ps((float*) &vY[i], vy);
		_mm_store_ps((float*) &vY1[i], vy);
		_mm_store_ps((float*) &vY2[i], vy);

		j +=4;
	}

	// test for vector addition
	{
		auto start = std::chrono::steady_clock::now();
		vectoradd_simd(vX1,vX2,vY,array_size);
		auto end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff = end-start;
		// std::cout << "vector addition time with simd: " << diff.count() << " s" << std::endl;

		start = std::chrono::steady_clock::now();
		std::transform( vec1.begin() , vec1.end() , vec2.begin() , vecy.begin() , std::plus<float>());

		end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff1 = end-start;
		// std::cout << "vector addition time without simd: " << diff1.count() << " s" << std::endl;

		j = 0;
		bool is_valid = true;
		for(size_t i = 0 ; i < array_size ; ++i)
		{
			float out[4] ;
			_mm_store_ps(out , vY[i]);

			if ( out[0] == vecy[j] && out[1] == vecy[j+1] && out[2] == vecy[j+2] && out[3] == vecy[j+3])
				{ j += 4;}
			else
			{
				is_valid = false;
				break;
			}
		}

		if(is_valid)
		{
			std::cout << "l'addition de vecteurs en simd est correcte" << std::endl;
			std::cout << "speedup obtained for vector addition with simd : " << diff1.count() / diff.count() << std::endl;
		}
		else
		{
			std::cout << " l'addition de vecteurs end simd est incorrecte" << std::endl;
		}

		std::cout << "\n";
	}

	// test for the dot product
	{
		auto start = std::chrono::steady_clock::now();
		vfloat32 sres = vectordot_simd(vX1 , vX2 , array_size);
		auto end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff = end-start;
		// std::cout << "dot product time with simd: " << diff.count() << " s" << std::endl;

		start = std::chrono::steady_clock::now();
		float res = std::inner_product( vec1.begin() , vec1.end() , vec2.begin() , 0. );
		end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff1 = end-start;
		// std::cout << "dot product time without simd: " << diff1.count() << " s" << std::endl;

		float out[4] ;
		_mm_store_ps( out , sres);

		if(  std::abs(out[0] - res ) < 0.01f )
		{
			std::cout << "le produit de vecteurs en simd est correct" << std::endl;
			std::cout << "speedup obtained for dot product with simd : " << diff1.count() / diff.count() << std::endl;
		}
		else {std::cout << "le produit de vecteurs en simd est incorrect : " << out[0] << "  " << res << std::endl;}

		std::cout << "\n";
	}

	// test for 1D filter with rotation, without border check
	{
		auto start = std::chrono::steady_clock::now();
		float divide = 1./3. ;
		for(std::size_t i = 1 ; i < vector_size-1 ; ++i)
		{
			vecy1[i] = divide * ( vec1[i-1] + vec1[i] + vec1[i+1] );
		}
		auto end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff1 = end-start;

		start = std::chrono::steady_clock::now();
		vectoravg3_simd(vX1 , vY1 , array_size);
		end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff = end-start;

		j = 4;
		bool is_valid = true;

		for(size_t i = 1 ; i < array_size-1 ; ++i)
		{
			float out[4] ;
			_mm_store_ps(out , vY1[i]);

			if ( is_valid == true && out[0] == vecy1[j] && out[1] == vecy1[j+1] && out[2] == vecy1[j+2] && out[3] == vecy1[j+3])
				{ j += 4;}
			else
			{
				is_valid = false;
				break;
			}
		}

		if(is_valid)
		{
			std::cout << "la filtre moyenneur en simd est correct" << std::endl;
			std::cout << "speedup obtained for average filter with simd : " << diff1.count() / diff.count() << std::endl;
		}
		else
		{
			std::cout << "la filtre moyenneur en simd est incorrect" << std::endl;
		}

		std::cout << "\n";
	}

	bool valid_mandel = false;
	// test for mandelbrot
	{
		std::vector<float> mandel_test(4,0);
		std::vector<float> mandel_test1(4,0);
		std::vector<size_t> indx(4,0);
		vfloat32 mdt = _mm_set1_ps(0);
		vfloat32 mdt1 = _mm_set1_ps(0);

		mandel_test[0] = -0.70;
		mandel_test[1] = -0.80;
		mandel_test[2] = -0.90;
		mandel_test[3] = -1.00;

		mandel_test1[0] = +0.10;
		mandel_test1[1] = +0.30;
		mandel_test1[2] = +0.30;
		mandel_test1[3] = +0.40;

		mdt  = _mm_setr_ps(-1.00, -0.90, -0.80, -0.70);
		mdt1 = _mm_setr_ps(+0.40, +0.30, +0.30, +0.10);

		auto start = std::chrono::steady_clock::now();
		for(std::size_t i = 0 ; i < 4 ; ++i )
		{

			indx[i] = mandelbrot_scalar(mandel_test[i] , mandel_test1[i] , max_iter );
		}
		auto end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff1 = end-start;

		start = std::chrono::steady_clock::now();
		vuint32 res_mandel = mandelbrot_simd(mdt, mdt1 , max_iter);
		end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff = end-start;

		unsigned int out[4] __attribute__((aligned(16))) ;

		__m128i* po = (__m128i*) &out[0] ;

		_mm_store_si128(po, res_mandel);

		bool v1 = false , v2 = false;

		if( indx[0] == 20 && indx[1] == 8 && indx[2] == 10 && indx[3] == 6 )
		{
			v1 = true;
			std::cout << "la fonction mandelbrot en scalaire est correcte" << std::endl;
		}
		else
		{
			std::cout << "la fonction mandelbrot en scalaire est incorrecte" << std::endl;
			std::cout << "le bon résultat est : 20 8 10 6 \n" << "vous avez obtenu : ";
			vec_display(indx,0);
		}


		if( out[3] == 20 && out[2] == 8 && out[1] == 10 && out[0] == 6 )
		{
			v2 = true;
			std::cout << "la fonction mandelbrot en SIMD est correcte" << std::endl;
		}
		else
		{
			std::cout << "la fonction mandelbrot en SIMD est incorrecte" << std::endl;
			std::cout << "le bon résultat est 20 8 10 6 \n" << "vous avez obtenu :  ";
			simd_display_i32(res_mandel);
		}


		if ( v1 && v2 )
		{
			std::cout << "speedup obtained for mandelbrot : " << diff1.count() / diff.count() << std::endl;
			valid_mandel = true;
		}
	}

	// test for mandelbrot function

	{
		if(valid_mandel)
		{

			std::cout << "\n-----------------------------" << std::endl;
			std::cout << "------ benchmandelbrot ------" << std::endl;
			std::cout << "-----------------------------\n" << std::endl;

			size_t h = SIZE , w = SIZE ;
			std::vector<size_t> indx(h*w,0);
			vfloat32 mdt = _mm_set1_ps(0);
			vfloat32 mdt1 = _mm_set1_ps(0);

			float a0 = -1.5 , a1 = +0.5;
			float b0 = -1.0 , b1 = +1.0;

			float avg_cycles_vec = 0;
			float avg_time_vec  = 0;

			size_t num_iter = 200;

			for(size_t i =0 ; i < num_iter ; ++i)
			{
				auto start = std::chrono::steady_clock::now();
				auto cycles_s = rdtsc();
				calc_mandelbrot_scalar( indx , h , w , a0 , a1 , b0 , b1  , max_iter );
				auto cycles_e = rdtsc();
				auto end = std::chrono::steady_clock::now();
				std::chrono::duration<double> diff1 = end-start;

				avg_time_vec += diff1.count() ;

				avg_cycles_vec += cycles_e - cycles_s;
			}

			avg_time_vec /= num_iter ;
			avg_cycles_vec /= num_iter ;

			std::cout << " mandelbrot vector time : " << avg_time_vec << std::endl;
			std::cout << " mandelbrot vector cycles time : " << avg_cycles_vec << std::endl;

			vuint32 **Simd_indx = (vuint32**)_mm_malloc ((size_t)( h*sizeof(vuint32*)), 16);
			if (Simd_indx)
			{
				for (size_t i = 0; i < w; i++)
				{
					Simd_indx[i] = (vuint32*) _mm_malloc ((size_t) (w*sizeof(vuint32)), 16);
				}
			}

			float avg_cycles_simd = 0;
			float avg_time_simd  = 0;

			for(size_t i = 0 ; i < num_iter ; ++i)
			{
				auto start = std::chrono::steady_clock::now();
				auto cycles_s = rdtsc();
				calc_mandelbrot_simd( Simd_indx , h , w , a0 , a1 , b0 , b1  , max_iter );
				auto cycles_e = rdtsc();
				auto end = std::chrono::steady_clock::now();
				std::chrono::duration<double> diff = end-start;

				avg_time_simd += diff.count() ;
				avg_cycles_simd += cycles_e - cycles_s;
			}

			avg_time_simd /= num_iter ;
			avg_cycles_simd /= num_iter ;

			std::cout << " mandelbrot SIMD time : " << avg_time_simd << std::endl;
			std::cout << " mandelbrot SIMD cycles time : " << avg_cycles_simd << std::endl;

			std::cout << "speedup obtained for mandelbrot : " << avg_time_vec / avg_time_simd << std::endl;
			std::cout << "speedup in cycles obtained for mandelbrot : " <<  avg_cycles_vec / avg_cycles_simd << std::endl;
		}

	}


	_mm_free(vX1);
	_mm_free(vX2);
	_mm_free(vY);
	_mm_free(vY1);
	_mm_free(vY2);


}
Example 13
void sgemm( int m, int n, float *A, float *C )
{
	__m128 a;
	__m128 b;
	__m128 c;
	int i, j, k, l;
	int mod = m%4;
	int end = m/4*4;
	int total = n*m;
	float num[4];
	float* A_address;
	for( i = 0; i < end; i +=4 ){
		for( k = 0; k < m; k++ ) {
			c = _mm_setzero_ps();
			for( j = 0; j < total; j += m ) {
				a = _mm_loadu_ps(A + i + j);
				b = _mm_load1_ps(A + k + j);
				c = _mm_add_ps(c, _mm_mul_ps(a, b));
			}
			_mm_storeu_ps(C + i + k*m, c);
		}
	}//Looks about right to me for a matrix where m is divisible by 4.
	if (mod != 0){
		if (mod == 3){
			for( i = end; i < m; i +=4 ){
				for( k = 0; k < m; k++ ) {
					A_address = A + i;
					c = _mm_setzero_ps();
					for( j = 0; j < total; j += m ) {
						a = _mm_setr_ps(*(A_address),*(A_address + 1),*(A_address + 2), 0);
						b = _mm_load1_ps(A + k + j);
						c = _mm_add_ps(c, _mm_mul_ps(a, b));
						A_address += m;
					}
					_mm_storeu_ps(num, c);
					for (l = 0; l < 3; l ++){
						*(C + i + k*m + l) = num[l];
					}
				}
			}
		}
		else if (mod == 2){
			for( i = end; i < m; i +=4 ){
				for( k = 0; k < m; k++ ) {
					A_address = A + i;
					c = _mm_setzero_ps();
					for( j = 0; j < total; j += m ) {
						a = _mm_setr_ps(*(A_address),*(A_address + 1),0 ,0);
						b = _mm_load1_ps(A + k + j);
						c = _mm_add_ps(c, _mm_mul_ps(a, b));
						A_address += m;
					}
					_mm_storeu_ps(num, c);
					for (l = 0; l < 2; l ++){
						*(C + i + k*m + l) = num[l];
					}
				}
			}
		}
		else if (mod == 1){
			for( i = end; i < m; i +=4 ){
				for( k = 0; k < m; k++ ) {
					A_address = A + i;
					c = _mm_setzero_ps();
					for( j = 0; j < total; j += m ) {
						a = _mm_setr_ps(*(A_address), 0, 0, 0);
						b = _mm_load1_ps(A + k + j);
						c = _mm_add_ps(c, _mm_mul_ps(a, b));
						A_address += m;
					}
					_mm_storeu_ps(num, c);
					for (l = 0; l < 1; l ++){
						*(C + i + k*m + l) = num[l];
					}
				}
			}
		}
	}
}
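/*
 * Hypothetical scalar reference for the kernel above, assuming A is an m x n
 * matrix stored column-major and C is m x m; the SIMD loops then compute
 * C = A * A^T, i.e. C[i + k*m] = sum over j of A[i + j*m] * A[k + j*m].
 */
void sgemm_scalar( int m, int n, const float *A, float *C )
{
	int i, j, k;
	for( k = 0; k < m; k++ ) {
		for( i = 0; i < m; i++ ) {
			float acc = 0.0f;
			for( j = 0; j < n; j++ )
				acc += A[i + j*m] * A[k + j*m];
			C[i + k*m] = acc;
		}
	}
}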
Example 14
//----------------------------------------------------------------
// Transforms the AABB vertices to screen space once every frame
// Also performs a coarse depth pre-test
//----------------------------------------------------------------
PreTestResult TransformedAABBoxAVX::TransformAndPreTestAABBox(__m128 xformedPos[], const __m128 cumulativeMatrix[4], const float *pDepthSummary)
{
	// w ends up being garbage, but it doesn't matter - we ignore it anyway.
	__m128 vCenter = _mm_loadu_ps(&mBBCenter.x);
	__m128 vHalf   = _mm_loadu_ps(&mBBHalf.x);

	__m128 vMin    = _mm_sub_ps(vCenter, vHalf);
	__m128 vMax    = _mm_add_ps(vCenter, vHalf);

	// transforms
	__m128 xRow[2], yRow[2], zRow[2];
	xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0];
	xRow[1] = _mm_shuffle_ps(vMax, vMax, 0x00) * cumulativeMatrix[0];
	yRow[0] = _mm_shuffle_ps(vMin, vMin, 0x55) * cumulativeMatrix[1];
	yRow[1] = _mm_shuffle_ps(vMax, vMax, 0x55) * cumulativeMatrix[1];
	zRow[0] = _mm_shuffle_ps(vMin, vMin, 0xaa) * cumulativeMatrix[2];
	zRow[1] = _mm_shuffle_ps(vMax, vMax, 0xaa) * cumulativeMatrix[2];

	__m128 zAllIn = _mm_castsi128_ps(_mm_set1_epi32(~0));
	__m128 screenMin = _mm_set1_ps(FLT_MAX);
	__m128 screenMax = _mm_set1_ps(-FLT_MAX);

	for(UINT i = 0; i < AABB_VERTICES; i++)
	{
		// Transform the vertex
		__m128 vert = cumulativeMatrix[3];
		vert += xRow[sBBxInd[i]];
		vert += yRow[sBByInd[i]];
		vert += zRow[sBBzInd[i]];

		// We have inverted z; z is in front of near plane iff z <= w.
		__m128 vertZ = _mm_shuffle_ps(vert, vert, 0xaa); // vert.zzzz
		__m128 vertW = _mm_shuffle_ps(vert, vert, 0xff); // vert.wwww
		__m128 zIn = _mm_cmple_ps(vertZ, vertW);
		zAllIn = _mm_and_ps(zAllIn, zIn);

		// project
		xformedPos[i] = _mm_div_ps(vert, vertW);
		
		// update bounds
		screenMin = _mm_min_ps(screenMin, xformedPos[i]);
		screenMax = _mm_max_ps(screenMax, xformedPos[i]);
	}

	// if any of the verts are z-clipped, we (conservatively) say the box is in
	if(_mm_movemask_ps(zAllIn) != 0xf)
		return ePT_VISIBLE;

	// Clip against screen bounds
	screenMin = _mm_max_ps(screenMin, _mm_setr_ps(0.0f, 0.0f, 0.0f, -FLT_MAX));
	screenMax = _mm_min_ps(screenMax, _mm_setr_ps((float) (SCREENW - 1), (float) (SCREENH - 1), 1.0f, FLT_MAX));

	// Quick rejection test
	if(_mm_movemask_ps(_mm_cmplt_ps(screenMax, screenMin)))
		return ePT_INVISIBLE;

	// Prepare integer bounds
	__m128 minMaxXY = _mm_shuffle_ps(screenMin, screenMax, 0x44); // minX,minY,maxX,maxY
	__m128i minMaxXYi = _mm_cvtps_epi32(minMaxXY);
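	// Arithmetic shift right by 3 divides by 8, converting pixel coordinates to 8x8-block coordinates (matching BLOCK_SIZE below)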
	__m128i minMaxXYis = _mm_srai_epi32(minMaxXYi, 3);

	__m128 maxZ = _mm_shuffle_ps(screenMax, screenMax, 0xaa);

	// Traverse all 8x8 blocks covered by 2d screen-space BBox;
	// if we know for sure that this box is behind the geometry we know is there,
	// we can stop.
	int rX0 = minMaxXYis.m128i_i32[0];
	int rY0 = minMaxXYis.m128i_i32[1];
	int rX1 = minMaxXYis.m128i_i32[2];
	int rY1 = minMaxXYis.m128i_i32[3];

	__m128 anyCloser = _mm_setzero_ps();
	for(int by = rY0; by <= rY1; by++)
	{
		const float *srcRow = pDepthSummary + by * (SCREENW/BLOCK_SIZE);

		// If for any 8x8 block, maxZ is not less than (=behind) summarized
		// min Z, box might be visible.
		for(int bx = rX0; bx <= rX1; bx++)
		{
			anyCloser = _mm_or_ps(anyCloser, _mm_cmpnlt_ss(maxZ, _mm_load_ss(&srcRow[bx])));
		}

		if(_mm_movemask_ps(anyCloser))
		{
			return ePT_UNSURE; // okay, box might be in
		}
	}

	// If we get here, we know for sure that the box is fully behind the stuff in the
	// depth buffer.
	return ePT_INVISIBLE;
}
Example no. 15
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax)
{
  assert(vertices.size() % 16 == 0);

  // Simple k-means clustering by normal direction to improve backface culling efficiency
  std::vector<__m128> quadNormals;
  for (auto i = 0; i < vertices.size(); i += 4)
  {
    auto v0 = vertices[i + 0];
    auto v1 = vertices[i + 1];
    auto v2 = vertices[i + 2];
    auto v3 = vertices[i + 3];

    quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3))));
  }

  std::vector<__m128> centroids;
  std::vector<uint32_t> centroidAssignment;
  centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f));

  centroidAssignment.resize(vertices.size() / 4);

  bool anyChanged = true;
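  // Lloyd iterations: assign each quad normal to the nearest centroid (by dot product),
  // then recompute each centroid as the normalized mean of its assigned normals.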
  for (int iter = 0; iter < 10 && anyChanged; ++iter)
  {
    anyChanged = false;

    for (auto j = 0; j < quadNormals.size(); ++j)
    {
      __m128 normal = quadNormals[j];

      __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
      int bestCentroid = -1;
      for (int k = 0; k < centroids.size(); ++k)
      {
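        // dp mask 0x7F: multiply/sum only the x, y and z lanes and broadcast the dot product to all four output lanes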
        __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
        if (_mm_comige_ss(distance, bestDistance))
        {
          bestDistance = distance;
          bestCentroid = k;
        }
      }

      if (centroidAssignment[j] != bestCentroid)
      {
        centroidAssignment[j] = bestCentroid;
        anyChanged = true;
      }
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = _mm_setzero_ps();
    }

    for (int j = 0; j < quadNormals.size(); ++j)
    {
      int k = centroidAssignment[j];

      centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]);
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = normalize(centroids[k]);
    }
  }

  std::vector<__m128> orderedVertices;
  for (int k = 0; k < centroids.size(); ++k)
  {
    for (int j = 0; j < vertices.size() / 4; ++j)
    {
      if (centroidAssignment[j] == k)
      {
        orderedVertices.push_back(vertices[4 * j + 0]);
        orderedVertices.push_back(vertices[4 * j + 1]);
        orderedVertices.push_back(vertices[4 * j + 2]);
        orderedVertices.push_back(vertices[4 * j + 3]);
      }
    }
  }

  auto occluder = std::make_unique<Occluder>();

  __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin));

  __m128 scalingX = _mm_set1_ps(2047.0f);
  __m128 scalingY = _mm_set1_ps(2047.0f);
  __m128 scalingZ = _mm_set1_ps(1023.0f);

  __m128 half = _mm_set1_ps(0.5f);

  for (size_t i = 0; i < orderedVertices.size(); i += 16)
  {
    for (auto j = 0; j < 4; ++j)
    {
      // Transform into [0,1] space relative to bounding box
      __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
      __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
      __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
      __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);

      // Transpose into [xxxx][yyyy][zzzz][wwww]
      _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

      // Scale and truncate to int
      v0 = _mm_fmadd_ps(v0, scalingX, half);
      v1 = _mm_fmadd_ps(v1, scalingY, half);
      v2 = _mm_fmadd_ps(v2, scalingZ, half);

      __m128i X = _mm_cvttps_epi32(v0);
      __m128i Y = _mm_cvttps_epi32(v1);
      __m128i Z = _mm_cvttps_epi32(v2);

      // Pack to 11/11/10 format
      __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));

      occluder->m_vertexData.push_back(XYZ);
    }
  }

  occluder->m_refMin = refMin;
  occluder->m_refMax = refMax;

  __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity());
  __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity());

  for (size_t i = 0; i < vertices.size(); ++i)
  {
    min = _mm_min_ps(vertices[i], min);
    max = _mm_max_ps(vertices[i], max);
  }

  // Set W = 1 - this is expected by frustum culling code
  min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000);
  max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000);

  occluder->m_boundsMin = min;
  occluder->m_boundsMax = max;

  occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f));

  return occluder;
}
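
The loop above quantizes each coordinate to [0,1] bounding-box space and packs it as 11/11/10 bits (X in bits 21-31, Y in bits 10-20, Z in bits 0-9). A small sketch of the inverse transform, for reference only (decode_vertex is a hypothetical helper, not part of the original source):

#include <stdint.h>

/* Unpack one 11/11/10 vertex back into [0,1] bounding-box space.
 * decode_vertex is a hypothetical helper, not from the original example. */
static inline void decode_vertex(uint32_t packed, float out[3])
{
  uint32_t x = (packed >> 21) & 0x7FF;  /* 11 bits */
  uint32_t y = (packed >> 10) & 0x7FF;  /* 11 bits */
  uint32_t z =  packed        & 0x3FF;  /* 10 bits */
  out[0] = x / 2047.0f;
  out[1] = y / 2047.0f;
  out[2] = z / 1023.0f;
}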
Example no. 16
        float t[4];
        uint32_t offset = 0;

        float const d[2] = { line[2] - line[0], line[3] - line[1], };
        float const length_inv = 1.0f/sqrtf(d[0]*d[0] + d[1]*d[1]);
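        /* n = (dy, -dx) / |d| is the unit normal of the input segment; distance = p0 . n is the line's signed offset from the origin */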
        float const n[2] = { d[1]*length_inv, -d[0]*length_inv, };
        float const distance = line[0]*n[0] + line[1]*n[1];

        /* TODO: investigate integer registers */
        __m128 const distance_4 = _mm_set1_ps(distance);
        __m128 const n0_4 = _mm_set1_ps(n[0]);
        __m128 const n1_4 = _mm_set1_ps(n[1]);

        __m128 const mask1_4 = _mm_set1_ps(1);
        __m128 const mask2_4 = _mm_set1_ps(2);
        __m128 const shift_4 = _mm_setr_ps(1, 3, 9, 27);

        /* process cell ids */
        for (uint32_t ii = cells_offset; ii < cells_count; ++ii) {
                uint32_t const id = cells[ii];

                if (id > dims[0]*dims[1]*dims[2]) {
                        printf("big id %u\n", id);
                        assert(0);
                }

                /* get coordinates of cell */
                /* cell grid is one less than data grid */
                uint32_t const i = id%dims[0];
                uint32_t const j = (id/dims[0])%dims[1];
                uint32_t const k = id/(dims[0]*dims[1]);
Example no. 17
void sgemm( int m, int n, float *A, float *C )
{
    __m128 a;
    __m128 a1;
    __m128 a2;
    __m128 a3;
    __m128 a4;
    
    __m128 b;
    __m128 b1;
    __m128 b2;
    __m128 b3;
    __m128 b4;
    __m128 b5;
    __m128 b6;
    __m128 b7;
    __m128 b8;
    __m128 b9;
    __m128 b10;
    __m128 b11;
    __m128 b12;
    __m128 b13;
    __m128 b14;
    __m128 b15;
    __m128 b16;
    
    __m128 c;
    __m128 c1;
    __m128 c2;
    __m128 c3;
    __m128 c4;
    
    int i, j, k, l;
    int mod = m%4;
    int end = m/4 * 4;
    int total = n*m;
    float num[4];
    float* A_address;
    float* C_address;
    int m4 = 4 * m;
    int m3 = 3 * m;
    int m2 = 2 * m;
    int end1 = total/m4 * m4;
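    /* Main path: 4x4 register blocking. Each (i,k) iteration accumulates a 4x4 tile of C
       in c1..c4, and the inner j loop is unrolled over four columns of A at a time (stride m4). */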
    for( i = 0; i < end; i += 4 ){
	for( k = 0; k < end; k += 4 ) {
	    c1 = _mm_setzero_ps();
	    c2 = _mm_setzero_ps();
	    c3 = _mm_setzero_ps();
	    c4 = _mm_setzero_ps();
	    float* A_address1 = A + i;
	    float* A_address2 = A + k;
	    float* A_address21 = A + k + 1;
	    for( j = 0; j < end1; j += m4, A_address1 += m4, A_address2 += m4, A_address21 += m4){
		a1 = _mm_loadu_ps(A_address1);
		a2 = _mm_loadu_ps(A_address1 + m);
		a3 = _mm_loadu_ps(A_address1 + m2);
		a4 = _mm_loadu_ps(A_address1 + m3);
		
		b1 = _mm_load1_ps(A_address2);
		b2 = _mm_load1_ps(A_address2 + m);
		b3 = _mm_load1_ps(A_address2 + m2);
		b4 = _mm_load1_ps(A_address2 + m3);
		
		b5 = _mm_load1_ps(A_address21);
		b6 = _mm_load1_ps(A_address21 + m);
		b7 = _mm_load1_ps(A_address21 + m2);
		b8 = _mm_load1_ps(A_address21 + m3);
		
		b9 = _mm_load1_ps(A + k + 2 + j);
		b10 = _mm_load1_ps(A + k + 2 + j + m);
		b11 = _mm_load1_ps(A + k + 2 + j + m2);
		b12 = _mm_load1_ps(A + k + 2 + j + m3);
		
		b13 = _mm_load1_ps(A + k + 3 + j);
		b14 = _mm_load1_ps(A + k + 3 + j + m);
		b15 = _mm_load1_ps(A + k + 3 + j + m2);
		b16 = _mm_load1_ps(A + k + 3 + j + m3);
		
		c1 = _mm_add_ps(c1, _mm_mul_ps(a1, b1));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a2, b2));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a3, b3));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a4, b4));
		
		c2 = _mm_add_ps(c2, _mm_mul_ps(a1, b5));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a2, b6));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a3, b7));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a4, b8));
		
		c3 = _mm_add_ps(c3, _mm_mul_ps(a1, b9));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a2, b10));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a3, b11));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a4, b12));
		
		c4 = _mm_add_ps(c4, _mm_mul_ps(a1, b13));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a2, b14));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a3, b15));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a4, b16));
		
	    }
	    for( j = end1; j < total; j += m){
		a = _mm_loadu_ps(A + i + j);
		
		b1 = _mm_load1_ps(A + k + j);
		b2 = _mm_load1_ps(A + k + 1 + j);
		b3 = _mm_load1_ps(A + k + 2 + j);
		b4 = _mm_load1_ps(A + k + 3 + j);
		
		c1 = _mm_add_ps(c1, _mm_mul_ps(a, b1));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a, b2));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a, b3));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a, b4));
	    }
	    _mm_storeu_ps(C + i + (k)*m, c1);
	    _mm_storeu_ps(C + i + (k+1)*m, c2);
	    _mm_storeu_ps(C + i + (k+2)*m, c3);
	    _mm_storeu_ps(C + i + (k+3)*m, c4);
	}
	for(k = end; k < m; k++){
	    float* A_address1 = A + i;
	    float* A_address2 = A + k;
	    c = _mm_setzero_ps();
	    for( j = 0; j < end1; j += m4, A_address1 += m4, A_address2 += m4){
		a1 = _mm_loadu_ps(A_address1);
		a2 = _mm_loadu_ps(A + i + j + m);
		a3 = _mm_loadu_ps(A + i + j + m2);
		a4 = _mm_loadu_ps(A + i + j + m3);
		
		b1 = _mm_load1_ps(A_address2);
		b2 = _mm_load1_ps(A + k + j + m);
		b3 = _mm_load1_ps(A + k + j + m2);
		b4 = _mm_load1_ps(A + k + j + m3);
		
		c = _mm_add_ps(c, _mm_mul_ps(a1, b1));
		c = _mm_add_ps(c, _mm_mul_ps(a2, b2));
		c = _mm_add_ps(c, _mm_mul_ps(a3, b3));
		c = _mm_add_ps(c, _mm_mul_ps(a4, b4));
	    }
	    for( j = end1; j < total; j += m){
		a = _mm_loadu_ps(A + i + j);
		
		b = _mm_load1_ps(A + k + j);
		
		c = _mm_add_ps(c, _mm_mul_ps(a, b));
	    }
	    _mm_storeu_ps(C + i + k*m, c);
	}
    }
    if (mod != 0){
	if (mod == 3){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address),*(A_address + 1),*(A_address + 2), 0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 3; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
	else if (mod == 2){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address),*(A_address + 1),0 ,0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 2; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
	else if (mod == 1){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address), 0, 0, 0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 1; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
    }
}
Example no. 18
void CAEUtil::ClampArray(float *data, uint32_t count)
{
#if !defined(HAVE_SSE) || !defined(__SSE__)
  for (uint32_t i = 0; i < count; ++i)
    data[i] = SoftClamp(data[i]);

#else
  const __m128 c1 = _mm_set_ps1(27.0f);
  const __m128 c2 = _mm_set_ps1(27.0f + 9.0f);

  /* work around invalid alignment */
  while (((uintptr_t)data & 0xF) && count > 0)
  {
    data[0] = SoftClamp(data[0]);
    ++data;
    --count;
  }

  uint32_t even = count & ~0x3;
  for (uint32_t i = 0; i < even; i+=4, data+=4)
  {
    /* tanh approx clamp */
    __m128 dt = _mm_load_ps(data);
    __m128 tmp     = _mm_mul_ps(dt, dt);
    *(__m128*)data = _mm_div_ps(
      _mm_mul_ps(
        dt,
        _mm_add_ps(c1, tmp)
      ),
      _mm_add_ps(c2, tmp)
    );
  }

  if (even != count)
  {
    uint32_t odd = count - even;
    if (odd == 1)
      data[0] = SoftClamp(data[0]);
    else
    {
      __m128 dt;
      __m128 tmp;
      __m128 out;
      if (odd == 2)
      {
        /* tanh approx clamp */
        dt  = _mm_setr_ps(data[0], data[1], 0, 0);
        tmp = _mm_mul_ps(dt, dt);
        out = _mm_div_ps(
          _mm_mul_ps(
            dt,
            _mm_add_ps(c1, tmp)
          ),
          _mm_add_ps(c2, tmp)
        );

        data[0] = ((float*)&out)[0];
        data[1] = ((float*)&out)[1];
      }
      else
      {
        /* tanh approx clamp */
        dt  = _mm_setr_ps(data[0], data[1], data[2], 0);
        tmp = _mm_mul_ps(dt, dt);
        out = _mm_div_ps(
          _mm_mul_ps(
            dt,
            _mm_add_ps(c1, tmp)
          ),
          _mm_add_ps(c2, tmp)
        );

        data[0] = ((float*)&out)[0];
        data[1] = ((float*)&out)[1];
        data[2] = ((float*)&out)[2];
      }
    }
  }
#endif
}
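
The SSE path evaluates the rational tanh approximation y = x * (27 + x^2) / (27 + 9 * x^2) four samples at a time. A scalar sketch of the same expression, shown only to mirror the vector math (soft_clamp_ref is a hypothetical name, not Kodi's actual SoftClamp):

/* Scalar mirror of the vectorized clamp above: the classic rational
 * approximation of tanh(x). Hypothetical helper, not from the original source. */
static inline float soft_clamp_ref(float x)
{
  float x2 = x * x;
  return x * (27.0f + x2) / (27.0f + 9.0f * x2);
}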