void OptimizedSelfAdjointMatrix6x6f::rankUpdate(const Eigen::Matrix<float, 6, 1>& u, const float& alpha)
{
  __m128 s = _mm_set1_ps(alpha);
  __m128 v1234 = _mm_loadu_ps(u.data());
  __m128 v56xx = _mm_loadu_ps(u.data() + 4); // lanes 2-3 load past u's six floats and are never used

  __m128 v1212 = _mm_movelh_ps(v1234, v1234);
  __m128 v3434 = _mm_movehl_ps(v1234, v1234);
  __m128 v5656 = _mm_movelh_ps(v56xx, v56xx);

  __m128 v1122 = _mm_mul_ps(s, _mm_unpacklo_ps(v1212, v1212));

  _mm_store_ps(data + 0, _mm_add_ps(_mm_load_ps(data + 0), _mm_mul_ps(v1122, v1212)));
  _mm_store_ps(data + 4, _mm_add_ps(_mm_load_ps(data + 4), _mm_mul_ps(v1122, v3434)));
  _mm_store_ps(data + 8, _mm_add_ps(_mm_load_ps(data + 8), _mm_mul_ps(v1122, v5656)));

  __m128 v3344 = _mm_mul_ps(s, _mm_unpacklo_ps(v3434, v3434));

  _mm_store_ps(data + 12, _mm_add_ps(_mm_load_ps(data + 12), _mm_mul_ps(v3344, v3434)));
  _mm_store_ps(data + 16, _mm_add_ps(_mm_load_ps(data + 16), _mm_mul_ps(v3344, v5656)));

  __m128 v5566 = _mm_mul_ps(s, _mm_unpacklo_ps(v5656, v5656));

  _mm_store_ps(data + 20, _mm_add_ps(_mm_load_ps(data + 20), _mm_mul_ps(v5566, v5656)));
}
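Every snippet on this page leans on the same two lane-moving primitives, so their semantics are worth pinning down once: _mm_movelh_ps(a, b) concatenates the low halves of a and b, while _mm_movehl_ps(a, b) concatenates the high halves, b's first. A minimal self-contained check (our own demo, not from any of the examples):

#include <xmmintrin.h>
#include <cstdio>

int main()
{
    __m128 a = _mm_setr_ps(0.f, 1.f, 2.f, 3.f); // lanes 0..3
    __m128 b = _mm_setr_ps(4.f, 5.f, 6.f, 7.f);

    float lo[4], hi[4];
    _mm_storeu_ps(lo, _mm_movelh_ps(a, b)); // a0 a1 b0 b1 -> 0 1 4 5
    _mm_storeu_ps(hi, _mm_movehl_ps(a, b)); // b2 b3 a2 a3 -> 6 7 2 3
    std::printf("%g %g %g %g\n", lo[0], lo[1], lo[2], lo[3]);
    std::printf("%g %g %g %g\n", hi[0], hi[1], hi[2], hi[3]);
}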
Example 2
void bumps_t::initialize (
  const bump_specifier_t & b0, const bump_specifier_t & b1)
{
  // Precompute the coefficients of four cubic polynomials in t, giving
  // the two smoothstep regions of each of the two bump functions.
  v4f b0t = load4f (& b0.t0); // b0.t0 b0.t1 b0.t2 b0.t3
  v4f b1t = load4f (& b1.t0); // b1.t0 b1.t1 b1.t2 b1.t3
  v4f b0v = _mm_movelh_ps (load4f (& b0.v0), _mm_setzero_ps ()); // b0.v0 b0.v1
  v4f b1v = _mm_movelh_ps (load4f (& b1.v0), _mm_setzero_ps ()); // b1.v0 b1.v1
  v4f S = SHUFPS (b0t, b1t, (0, 2, 0, 2)); // b0.t0 b0.t2 b1.t0 b1.t2
  v4f T = SHUFPS (b0t, b1t, (1, 3, 1, 3)); // b0.t1 b0.t3 b1.t1 b1.t3
  v4f U = SHUFPS (b0v, b1v, (0, 2, 0, 2)); // b0.v0   0   b1.v0   0
  v4f V1 = SHUFPS (b0v, b1v, (1, 0, 1, 0)); // b0.v1 b0.v0 b1.v1 b1.v0
  v4f V2 = SHUFPS (b0v, b1v, (2, 1, 2, 1)); //   0   b0.v1   0   b1.v1
  v4f V = V1 - V2;
  v4f d = T - S;
  v4f a = T + S;
  v4f m = (V - U) / (d * d * d);
  store4f (c [0], U + m * S * S * (a + d + d));
  store4f (c [1], _mm_set1_ps (-6.0f) * m * S * T);
  store4f (c [2], _mm_set1_ps (+3.0f) * m * a);
  store4f (c [3], _mm_set1_ps (-2.0f) * m);
  store4f (S0, S);
  store4f (T0, T);
  store4f (U0, U);
  store4f (V0, V);
}
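For reference, the four stores above are the power-basis expansion of a smoothstep from (S, U) to (T, V). A scalar cross-check of the same precomputation (cubic_coeffs_check is our own name, not part of the original):

// Smoothstep from (S,U) to (T,V): f(t) = U + (V-U) * s^2 * (3 - 2s),
// with s = (t-S)/d, d = T-S. Writing m = (V-U)/d^3 and a = T+S and
// expanding in powers of t gives exactly the c0..c3 stored above:
//   c0 = U + m*S*S*(a + 2d)   (note a + 2d = 3T - S)
//   c1 = -6*m*S*T
//   c2 = +3*m*a
//   c3 = -2*m
float cubic_coeffs_check (float S, float T, float U, float V, float t)
{
  float d = T - S, a = T + S, m = (V - U) / (d * d * d);
  float c0 = U + m * S * S * (a + d + d);
  float c1 = -6.0f * m * S * T;
  float c2 = 3.0f * m * a;
  float c3 = -2.0f * m;
  return c0 + t * (c1 + t * (c2 + t * c3));
}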
Example 3
/* a naive first attempt, kept for reference */
void mulMatrix1(Matrix4x4 ret, Matrix4x4 mat1, Matrix4x4 mat2)
{
    /* for some reason the unaligned matrix segfaults,
     * but aligning it deadlocks the program */
    /* this lends itself well to SSE:
     * 1. transpose mat2 (done in place -- this clobbers the caller's copy)
     * 2. dot-product the rows */

    /* 1. transpose mat2 */
    __m128 row0, row1, row2, row3;
    __m128 tmp0, tmp1, tmp2, tmp3;

    /* Load 4x4 mat2 from memory into four SSE registers. */
    row0 = _mm_load_ps( mat2[0] );
    row1 = _mm_load_ps( mat2[1] );
    row2 = _mm_load_ps( mat2[2] );
    row3 = _mm_load_ps( mat2[3] );

    /* Interleave the bottom/top two elements from two SSE registers
     * into a single SSE register. */
    tmp0 = _mm_unpacklo_ps( row0, row1 );
    tmp2 = _mm_unpacklo_ps( row2, row3 );
    tmp1 = _mm_unpackhi_ps( row0, row1 );
    tmp3 = _mm_unpackhi_ps( row2, row3 );

    /* Move the bottom/top two elements from two SSE registers into one SSE register. */
    row0 = _mm_movelh_ps( tmp0, tmp2 );
    row1 = _mm_movehl_ps( tmp2, tmp0 );
    row2 = _mm_movelh_ps( tmp1, tmp3 );
    row3 = _mm_movehl_ps( tmp3, tmp1 );

    /* Store 4x4 matrix from all four SSE registers into memory. */
    _mm_store_ps( mat2[0], row0 );
    _mm_store_ps( mat2[1], row1 );
    _mm_store_ps( mat2[2], row2 );
    _mm_store_ps( mat2[3], row3 );

    /* 2. dotproduct the rows */
    /* OMG 16 DOT PRODUCTS */
    ret[0][0] = mul_asm(mat1[0], mat2[0]);
    ret[0][1] = mul_asm(mat1[0], mat2[1]);
    ret[0][2] = mul_asm(mat1[0], mat2[2]);
    ret[0][3] = mul_asm(mat1[0], mat2[3]);
    ret[1][0] = mul_asm(mat1[1], mat2[0]);
    ret[1][1] = mul_asm(mat1[1], mat2[1]);
    ret[1][2] = mul_asm(mat1[1], mat2[2]);
    ret[1][3] = mul_asm(mat1[1], mat2[3]);
    ret[2][0] = mul_asm(mat1[2], mat2[0]);
    ret[2][1] = mul_asm(mat1[2], mat2[1]);
    ret[2][2] = mul_asm(mat1[2], mat2[2]);
    ret[2][3] = mul_asm(mat1[2], mat2[3]);
    ret[3][0] = mul_asm(mat1[3], mat2[0]);
    ret[3][1] = mul_asm(mat1[3], mat2[1]);
    ret[3][2] = mul_asm(mat1[3], mat2[2]);
    ret[3][3] = mul_asm(mat1[3], mat2[3]);

    return;
}
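One caveat with the version above: mat2 is transposed in place, so the caller's matrix is modified as a side effect. A sketch of the same multiply that copies first, using the stock _MM_TRANSPOSE4_PS macro from <xmmintrin.h> (assumes the same Matrix4x4 rows-of-four-floats layout and the same mul_asm dot-product helper):

void mulMatrix2(Matrix4x4 ret, Matrix4x4 mat1, Matrix4x4 mat2)
{
    /* Transpose a copy of mat2 so the caller's data survives. */
    float t[4][4] __attribute__ ((aligned (16)));
    __m128 c0 = _mm_load_ps( mat2[0] );
    __m128 c1 = _mm_load_ps( mat2[1] );
    __m128 c2 = _mm_load_ps( mat2[2] );
    __m128 c3 = _mm_load_ps( mat2[3] );
    _MM_TRANSPOSE4_PS( c0, c1, c2, c3 ); /* expands to the unpack/movelh/movehl dance above */
    _mm_store_ps( t[0], c0 );
    _mm_store_ps( t[1], c1 );
    _mm_store_ps( t[2], c2 );
    _mm_store_ps( t[3], c3 );

    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            ret[i][j] = mul_asm(mat1[i], t[j]);
}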
Example 4
/// Transform this box using the specified transform matrix.
///
/// @param[in] rTransform  Matrix by which to transform.
void Helium::Simd::AaBox::TransformBy( const Matrix44& rTransform )
{
    // Expand each corner position.
    Register minVec = m_minimum.GetSimdVector();
    Register maxVec = m_maximum.GetSimdVector();

    Vector3Soa corners0;
    corners0.m_x = _mm_shuffle_ps( minVec, minVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners0.m_y = _mm_shuffle_ps( minVec, maxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    corners0.m_z = _mm_unpackhi_ps( minVec, maxVec );
    corners0.m_z = _mm_movelh_ps( corners0.m_z, corners0.m_z );

    Vector3Soa corners1;
    corners1.m_x = _mm_shuffle_ps( maxVec, maxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners1.m_y = corners0.m_y;
    corners1.m_z = corners0.m_z;

    // Transform all corners by the provided transformation matrix.
    Matrix44Soa transformSplat( rTransform );
    transformSplat.TransformPoint( corners0, corners0 );
    transformSplat.TransformPoint( corners1, corners1 );

    // Compute the minimum.
    Register minX = Simd::MinF32( corners0.m_x, corners1.m_x );
    Register minY = Simd::MinF32( corners0.m_y, corners1.m_y );
    Register minXYLo = _mm_unpacklo_ps( minX, minY );
    Register minXYHi = _mm_unpackhi_ps( minX, minY );
    Register minXY = Simd::MinF32( minXYLo, minXYHi );

    Register minZ = Simd::MinF32( corners0.m_z, corners1.m_z );
    Register minZLo = _mm_unpacklo_ps( minZ, minZ );
    Register minZHi = _mm_unpackhi_ps( minZ, minZ );
    minZ = Simd::MinF32( minZLo, minZHi );

    Register minLo = _mm_movelh_ps( minXY, minZ );
    Register minHi = _mm_movehl_ps( minZ, minXY );

    m_minimum.SetSimdVector( Simd::MinF32( minLo, minHi ) );

    // Compute the maximum.
    Register maxX = Simd::MaxF32( corners0.m_x, corners1.m_x );
    Register maxY = Simd::MaxF32( corners0.m_y, corners1.m_y );
    Register maxXYLo = _mm_unpacklo_ps( maxX, maxY );
    Register maxXYHi = _mm_unpackhi_ps( maxX, maxY );
    Register maxXY = Simd::MaxF32( maxXYLo, maxXYHi );

    Register maxZ = Simd::MaxF32( corners0.m_z, corners1.m_z );
    Register maxZLo = _mm_unpacklo_ps( maxZ, maxZ );
    Register maxZHi = _mm_unpackhi_ps( maxZ, maxZ );
    maxZ = Simd::MaxF32( maxZLo, maxZHi );

    Register maxLo = _mm_movelh_ps( maxXY, maxZ );
    Register maxHi = _mm_movehl_ps( maxZ, maxXY );

    m_maximum.SetSimdVector( Simd::MaxF32( maxLo, maxHi ) );
}
Example 5
float calcCubicNoiseValSSE(const vec3 p)    
{
    int ix, iy, iz;
    __m128 fx, fy;
    float fz;

    ix = (int)floor(p[0]);
    fx = _mm_set_ps1(p[0] - ix);    
    iy = (int)floor(p[1]);
    fy = _mm_set_ps1(p[1] - iy);
    iz = (int)floor(p[2]);
    fz = p[2] - iz;

    uSIMD k0, k1, k2, k3;
    __m128 out0, out1, out2, out3;

    for(int k = -1; k <= 2; k++)
    {
        for(int j = -1; j <= 2; j++)
        {
            k0.a[j+1] = getLatticeVal(ix-1, iy + j, iz + k);
            k1.a[j+1] = getLatticeVal(ix+0, iy + j, iz + k);
            k2.a[j+1] = getLatticeVal(ix+1, iy + j, iz + k);
            k3.a[j+1] = getLatticeVal(ix+2, iy + j, iz + k);            
        }
        switch(k)
        {
        case -1:
            out0 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
            break;
        case 0:
            out1 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));            
            break;
        case 1:
            out2 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));            
            break;
        case 2:
            out3 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));            
            break;
        }
    }
    // Transpose the matrix formed by the out vectors.
    __m128 t1 = _mm_movelh_ps(out1, out0);
    __m128 t2 = _mm_movehl_ps(out0, out1);
    __m128 t3 = _mm_movelh_ps(out3, out2);
    __m128 t4 = _mm_movehl_ps(out2, out3);
    k0.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(0, 2, 0, 2));
    k1.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 3, 1, 3));
    k2.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(0, 2, 0, 2));
    k3.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 3, 1, 3));                    

    uSIMD final_knots;
    final_knots.m  = fourKnotSplineSSE(&fy, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
    return clamp(fourKnotSpline(fz, final_knots.a), -1.0f, 1.0f);
}
Example 6
Quat MUST_USE_RESULT Quat::RotateFromTo(const float4 &sourceDirection, const float4 &targetDirection)
{
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 12.289 nsecs / 33.144 ticks, Avg: 12.489 nsecs, Worst: 14.210 nsecs
	simd4f cosAngle = dot4_ps(sourceDirection.v, targetDirection.v);
	cosAngle = negate3_ps(cosAngle); // [+ - - -]
	// XYZ channels use the trigonometric formula sin(x/2) = +/-sqrt(0.5-0.5*cosx))
	// The W channel uses the trigonometric formula cos(x/2) = +/-sqrt(0.5+0.5*cosx))
	simd4f half = set1_ps(0.5f);
	simd4f cosSinHalfAngle = sqrt_ps(add_ps(half, mul_ps(half, cosAngle))); // [cos(x/2), sin(x/2), sin(x/2), sin(x/2)]
	simd4f axis = cross_ps(sourceDirection.v, targetDirection.v);
	simd4f recipLen = rsqrt_ps(dot4_ps(axis, axis));
	axis = mul_ps(axis, recipLen); // [0 z y x]
	// Set the w component to one.
	simd4f one = add_ps(half, half); // [1 1 1 1]
	simd4f highPart = _mm_unpackhi_ps(axis, one); // [_ _ 1 z]
	axis = _mm_movelh_ps(axis, highPart); // [1 z y x]
	Quat q;
	q.q = mul_ps(axis, cosSinHalfAngle);
	return q;
#else
	// Best: 19.970 nsecs / 53.632 ticks, Avg: 20.197 nsecs, Worst: 21.122 nsecs
	assume(EqualAbs(sourceDirection.w, 0.f));
	assume(EqualAbs(targetDirection.w, 0.f));
	return Quat::RotateFromTo(sourceDirection.xyz(), targetDirection.xyz());
#endif
}
Example 7
matrix4 matrix4::transposed() const
{
#ifdef __SSE__
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(x.v, y.v);
    tmp2 = _mm_unpacklo_ps(z.v, w.v);
    tmp1 = _mm_unpackhi_ps(x.v, y.v);
    tmp3 = _mm_unpackhi_ps(z.v, w.v);

    return matrix4(_mm_movelh_ps(tmp0, tmp2), _mm_movehl_ps(tmp2, tmp0), _mm_movelh_ps(tmp1, tmp3), _mm_movehl_ps(tmp3, tmp1));
#else
    return matrix4(float4(x.x, y.x, z.x, w.x),
                   float4(x.y, y.y, z.y, w.y),
                   float4(x.z, y.z, z.z, w.z),
                   float4(x.w, y.w, z.w, w.w));
#endif
}
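The SSE branch above is exactly the sequence that <xmmintrin.h> ships as _MM_TRANSPOSE4_PS; an equivalent formulation via the macro, as a sketch assuming the same matrix4 members used in the snippet:

// _MM_TRANSPOSE4_PS expands to the same unpacklo/unpackhi + movelh/movehl
// sequence as the hand-written version above.
matrix4 transposed_via_macro(const matrix4 &m)
{
    __m128 r0 = m.x.v, r1 = m.y.v, r2 = m.z.v, r3 = m.w.v;
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    return matrix4(r0, r1, r2, r3);
}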
Example 8
/// Compute the corners of this view frustum.
///
/// A view frustum can have either four or eight corners depending on whether a far clip plane exists (eight
/// corners) or whether an infinite far clip plane is used (four corners).
///
/// Note that this assumes that the frustum is always properly defined, with each possible combination of
/// neighboring clip planes intersecting at a valid point.
///
/// @param[out] pCorners  Array in which the frustum corners will be stored.  This must point to a region of memory
///                       large enough for four points if this frustum has an infinite far clip plane, or eight
///                       points if this frustum has a normal far clip plane.
///
/// @return  Number of corners computed (either four or eight).
size_t Helium::Simd::Frustum::ComputeCorners( Vector3* pCorners ) const
{
    HELIUM_ASSERT( pCorners );

    // Compute the corners in struct-of-arrays format.
    HELIUM_SIMD_ALIGN_PRE float32_t cornersX[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersY[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersZ[ 8 ] HELIUM_SIMD_ALIGN_POST;

    size_t cornerCount = ComputeCornersSoa( cornersX, cornersY, cornersZ );
    HELIUM_ASSERT( cornerCount == 4 || cornerCount == 8 );

    // Swizzle the results and store in the output array.
    Helium::Simd::Register cornerXVec = Helium::Simd::LoadAligned( cornersX );
    Helium::Simd::Register cornerYVec = Helium::Simd::LoadAligned( cornersY );
    Helium::Simd::Register cornerZVec = Helium::Simd::LoadAligned( cornersZ );

    Helium::Simd::Register xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec );
    Helium::Simd::Register xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec );
    Helium::Simd::Register zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec );
    Helium::Simd::Register zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec );

    pCorners[ 0 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) );
    pCorners[ 1 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) );
    pCorners[ 2 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) );
    pCorners[ 3 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) );

    if( cornerCount == 8 )
    {
        cornerXVec = Helium::Simd::LoadAligned( cornersX + 4 );
        cornerYVec = Helium::Simd::LoadAligned( cornersY + 4 );
        cornerZVec = Helium::Simd::LoadAligned( cornersZ + 4 );

        xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec );
        xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec );
        zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec );
        zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec );

        pCorners[ 4 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) );
        pCorners[ 5 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) );
        pCorners[ 6 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) );
        pCorners[ 7 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) );
    }

    return cornerCount;
}
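The four SetSimdVector lines are the standard SoA-to-AoS swizzle that several other examples on this page repeat. Isolated, with our own names (the w lane ends up duplicating z, which Vector3 ignores):

// x = [x0 x1 x2 x3], y = [y0 y1 y2 y3], z = [z0 z1 z2 z3]
// out[i] = [xi yi zi zi]
static inline void SoaToAos4( __m128 x, __m128 y, __m128 z, __m128 out[ 4 ] )
{
    __m128 xy01 = _mm_unpacklo_ps( x, y ); // x0 y0 x1 y1
    __m128 xy23 = _mm_unpackhi_ps( x, y ); // x2 y2 x3 y3
    __m128 zz01 = _mm_unpacklo_ps( z, z ); // z0 z0 z1 z1
    __m128 zz23 = _mm_unpackhi_ps( z, z ); // z2 z2 z3 z3
    out[ 0 ] = _mm_movelh_ps( xy01, zz01 );
    out[ 1 ] = _mm_movehl_ps( zz01, xy01 );
    out[ 2 ] = _mm_movelh_ps( xy23, zz23 );
    out[ 3 ] = _mm_movehl_ps( zz23, xy23 );
}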
Example 9
void fast(element_t * const elements, const int num_elts, const float a) {
    element_t * elts = elements;
    float inv_a = 1.0f / a;
    v4sf va   = _mm_load1_ps(&a);
    v4sf v1_a = _mm_load1_ps(&inv_a);
    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    // elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,0
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,0
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,0
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); //  x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); //  x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); //  z1,z2,0, 0
        v4sf t3 = _mm_unpackhi_ps(r2, r3); //  z3,0, 0, 0
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0
        // perform pow(x,a),.. using the identity pow(x,a) = exp(a * log(x))
        // (log_ps comes from sse_mathfun, like exp_ps)
        v4sf r0a = _mm_mul_ps(log_ps(r0), va); // a*log(x1), a*log(x2), a*log(x3), junk
        v4sf r1a = _mm_mul_ps(log_ps(r1), va); // a*log(y1), a*log(y2), a*log(y3), junk
        v4sf r2a = _mm_mul_ps(log_ps(r2), va); // a*log(z1), a*log(z2), a*log(z3), junk
        v4sf ex0 = exp_ps(r0a); // pow(x1, a), ..., junk
        v4sf ex1 = exp_ps(r1a); // pow(y1, a), ..., junk
        v4sf ex2 = exp_ps(r2a); // pow(z1, a), ..., junk
        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(s1, ex2);
        // pow(sum, 1/a) = exp(log(sum) / a)
        v4sf ps = _mm_mul_ps(log_ps(s2), v1_a);
        v4sf es = exp_ps(ps);
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];
        elts += 3;
    }
}
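A scalar reference for one element, handy for validating the SIMD path above (our own helper; inputs must be positive since it goes through logf):

#include <math.h>

/* re = (x^a + y^a + z^a)^(1/a), via pow(v, p) = exp(p * log(v)) */
static float pnorm3_ref(float x, float y, float z, float a) {
    float s = expf(a * logf(x)) + expf(a * logf(y)) + expf(a * logf(z));
    return expf(logf(s) / a);
}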
Example 10
void matrix3_transpose(struct matrix3 *dst, const struct matrix3 *m)
{
	__m128 tmp1, tmp2;
	vec3_transform(&dst->t, &m->t, m);
	vec3_neg(&dst->t, &dst->t);

	tmp1 = _mm_movelh_ps(m->x.m, m->y.m);
	tmp2 = _mm_movehl_ps(m->y.m, m->x.m);
	dst->x.m = _mm_shuffle_ps(tmp1, m->z.m, _MM_SHUFFLE(3, 0, 2, 0));
	dst->y.m = _mm_shuffle_ps(tmp1, m->z.m, _MM_SHUFFLE(3, 1, 3, 1));
	dst->z.m = _mm_shuffle_ps(tmp2, m->z.m, _MM_SHUFFLE(3, 2, 2, 0));
}
Example 11
BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    const __m128 one = _mm_set_ss(1.f);
    __m128 minPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), one));
    __m128 maxPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), one));
    __m128 centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
    __m128 halfSize = _mm_sub_ps(centerPoint, minPt);
    __m128 m0 = _mm_loadu_ps(&transform.m00_);
    __m128 m1 = _mm_loadu_ps(&transform.m10_);
    __m128 m2 = _mm_loadu_ps(&transform.m20_);
    __m128 r0 = _mm_mul_ps(m0, centerPoint);
    __m128 r1 = _mm_mul_ps(m1, centerPoint);
    __m128 t0 = _mm_add_ps(_mm_unpacklo_ps(r0, r1), _mm_unpackhi_ps(r0, r1));
    __m128 r2 = _mm_mul_ps(m2, centerPoint);
    const __m128 zero = _mm_setzero_ps();
    __m128 t2 = _mm_add_ps(_mm_unpacklo_ps(r2, zero), _mm_unpackhi_ps(r2, zero));
    __m128 newCenter = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
    const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 x = _mm_and_ps(absMask, _mm_mul_ps(m0, halfSize));
    __m128 y = _mm_and_ps(absMask, _mm_mul_ps(m1, halfSize));
    __m128 z = _mm_and_ps(absMask, _mm_mul_ps(m2, halfSize));
    t0 = _mm_add_ps(_mm_unpacklo_ps(x, y), _mm_unpackhi_ps(x, y));
    t2 = _mm_add_ps(_mm_unpacklo_ps(z, zero), _mm_unpackhi_ps(z, zero));
    __m128 newDir = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
    return BoundingBox(_mm_sub_ps(newCenter, newDir), _mm_add_ps(newCenter, newDir));
#else
    Vector3 newCenter = transform * Center();
    Vector3 oldEdge = Size() * 0.5f;
    Vector3 newEdge = Vector3(
        Abs(transform.m00_) * oldEdge.x_ + Abs(transform.m01_) * oldEdge.y_ + Abs(transform.m02_) * oldEdge.z_,
        Abs(transform.m10_) * oldEdge.x_ + Abs(transform.m11_) * oldEdge.y_ + Abs(transform.m12_) * oldEdge.z_,
        Abs(transform.m20_) * oldEdge.x_ + Abs(transform.m21_) * oldEdge.y_ + Abs(transform.m22_) * oldEdge.z_
    );

    return BoundingBox(newCenter - newEdge, newCenter + newEdge);
#endif
}
Example 12
/** transform vector by rigid transform */
inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec)
{
#ifdef SIMPLE_GL_USE_SSE4
    __m128 res;
    __m128 dotProd;

    // _mm_dp_ps mask: the high nibble selects the lanes to multiply-and-sum,
    // the low nibble selects the output lanes that receive the result.
    res      = _mm_dp_ps(mat[0].m128, vec.m128, 0xF1); // dot in lane 0
    dotProd  = _mm_dp_ps(mat[1].m128, vec.m128, 0xF2); // dot in lane 1
    res      = _mm_blend_ps( res, dotProd, 0x2 );
    dotProd  = _mm_dp_ps(mat[2].m128, vec.m128, 0xF4); // dot in lane 2
    res      = _mm_blend_ps( res, dotProd, 0x4 );
    dotProd  = _mm_dp_ps(mat[3].m128, vec.m128, 0xF8); // dot in lane 3
    res      = _mm_blend_ps( res, dotProd, 0x8 );

    return Matrix<float, 4, 1>(res);
#elif defined(SIMPLE_GL_USE_SSE3)
    __m128 res;

    __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);

    __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);

    __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);

    __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);

    __m128 vec01    = _mm_unpacklo_ps(dotProd0, dotProd1);
    __m128 vec23    = _mm_unpackhi_ps(dotProd2, dotProd3);
    res             = _mm_movelh_ps(vec01, vec23);

    return Matrix<float, 4, 1>(res);
#else // SSE2
    // TODO: Think about good sse optimization
    Matrix<float, 4, 1> res;
    res[0] = mat[0][0] * vec[0] + mat[0][1] * vec[1] + mat[0][2] * vec[2] + mat[0][3] * vec[3];
    res[1] = mat[1][0] * vec[0] + mat[1][1] * vec[1] + mat[1][2] * vec[2] + mat[1][3] * vec[3];
    res[2] = mat[2][0] * vec[0] + mat[2][1] * vec[1] + mat[2][2] * vec[2] + mat[2][3] * vec[3];
    res[3] = mat[3][0] * vec[0] + mat[3][1] * vec[1] + mat[3][2] * vec[2] + mat[3][3] * vec[3];
    return res;
#endif
}
Example 13
    inline vector4f haddp(const vector4f* row)
    {
#if SSE_INSTR_SET >= 3  // SSE3
        return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                           _mm_hadd_ps(row[2], row[3]));
#else
        __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
        __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
        __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
        tmp0 = _mm_add_ps(tmp0, tmp1);
        tmp1 = _mm_unpacklo_ps(row[2], row[3]);
        tmp1 = _mm_add_ps(tmp1, tmp2);
        tmp2 = _mm_movehl_ps(tmp1, tmp0);
        tmp0 = _mm_movelh_ps(tmp0, tmp1);
        return _mm_add_ps(tmp0, tmp2);
#endif
    }
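Typical use of haddp is turning four registers of per-lane products into four dot products at once; a sketch assuming vector4f supports elementwise operator* as in the surrounding wrapper:

    // returns { dot(a0,b0), dot(a1,b1), dot(a2,b2), dot(a3,b3) }
    inline vector4f dot4(const vector4f* a, const vector4f* b)
    {
        vector4f prod[4] = { a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3] };
        return haddp(prod);
    }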
Example 14
inline __m128 CalcWeights(float x, float y)
{
 __m128 ssx = _mm_set_ss(x);
 __m128 ssy = _mm_set_ss(y);
 __m128 psXY = _mm_unpacklo_ps(ssx, ssy);      // 0 0 y x

 //__m128 psXYfloor = _mm_floor_ps(psXY); // use this line instead if you have SSE4
 __m128 psXYfloor = _mm_cvtepi32_ps(_mm_cvtps_epi32(psXY)); // rounds to nearest, so not a true floor for all inputs
 __m128 psXYfrac = _mm_sub_ps(psXY, psXYfloor); // = frac(psXY)
 
 __m128 psXYfrac1 = _mm_sub_ps(CONST_1111, psXYfrac); // ? ? (1-y) (1-x)
 __m128 w_x = _mm_unpacklo_ps(psXYfrac1, psXYfrac);   // ? ?     x (1-x)
        w_x = _mm_movelh_ps(w_x, w_x);      // x (1-x) x (1-x)
 __m128 w_y = _mm_shuffle_ps(psXYfrac1, psXYfrac, _MM_SHUFFLE(1, 1, 1, 1)); // y y (1-y) (1-y)

 // complete weight vector
 return _mm_mul_ps(w_x, w_y);
}
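The returned vector holds the four bilinear weights in the order [(1-x)(1-y), x(1-y), (1-x)y, xy], i.e. top-left, top-right, bottom-left, bottom-right. A hedged usage sketch for a single-channel float image (names and layout are ours; note _mm_cvtps_epi32 above rounds to nearest, so a production version should floor x and y consistently with the weights):

inline float SampleBilinear(const float* img, int width, float x, float y)
{
 int ix = (int)x, iy = (int)y; // assumes x, y >= 0
 __m128 w = CalcWeights(x, y);
 __m128 px = _mm_setr_ps(img[iy*width + ix],     img[iy*width + ix + 1],
                         img[(iy+1)*width + ix], img[(iy+1)*width + ix + 1]);
 __m128 d = _mm_mul_ps(px, w);
 d = _mm_add_ps(d, _mm_movehl_ps(d, d));      // lanes 0,1 = pairwise sums
 d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));  // lane 0 = total
 return _mm_cvtss_f32(d);
}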
Example 15
void Quat::ToAxisAngle(float4 &axis, float &angle) const
{
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 35.332 nsecs / 94.328 ticks, Avg: 35.870 nsecs, Worst: 57.607 nsecs
	assume2(this->IsNormalized(), *this, this->Length());
	simd4f cosAngle = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
	simd4f rcpSinAngle = rsqrt_ps(sub_ps(set1_ps(1.f), mul_ps(cosAngle, cosAngle)));
	angle = Acos(s4f_x(cosAngle)) * 2.f;
	simd4f a = mul_ps(q, rcpSinAngle);

	// Set the w component to zero.
	simd4f highPart = _mm_unpackhi_ps(a, zero_ps()); // [_ _ 0 z]
	axis.v = _mm_movelh_ps(a, highPart); // [0 z y x]
#else
	// Best: 85.258 nsecs / 227.656 ticks, Avg: 85.492 nsecs, Worst: 86.410 nsecs
	ToAxisAngle(reinterpret_cast<float3&>(axis), angle);
	axis.w = 0.f;
#endif
}
Example 16
void Quat::SetFromAxisAngle(const float4 &axis, float angle)
{
	assume1(EqualAbs(axis.w, 0.f), axis);
	assume2(axis.IsNormalized(1e-4f), axis, axis.Length4());
	assume1(MATH_NS::IsFinite(angle), angle);

#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE2)
	// Best: 26.499 nsecs / 71.024 ticks, Avg: 26.856 nsecs, Worst: 27.651 nsecs
	simd4f halfAngle = set1_ps(0.5f*angle);
	simd4f sinAngle, cosAngle;
	sincos_ps(halfAngle, &sinAngle, &cosAngle);
	simd4f quat = mul_ps(axis, sinAngle);

	// Set the w component to cosAngle.
	simd4f highPart = _mm_unpackhi_ps(quat, cosAngle); // [_ _ cos(angle/2) z]
	q = _mm_movelh_ps(quat, highPart); // [cos(angle/2) z y x]
#else
	// Best: 36.868 nsecs / 98.312 ticks, Avg: 36.980 nsecs, Worst: 41.477 nsecs
	SetFromAxisAngle(axis.xyz(), angle);
#endif
}
Example 17
v4f step_t::operator () (float t) const
{
  // Evaluate the polynomial f by Estrin's method. Return
  //   (0 0 0 0)  if t < t0,
  //   (f f f f)  if t0 <= t < t1,
  //   (1 1 1 1)  if t1 <= t.
  v4f c4 = load4f (c);
  v4f one = { 1.0f, 1.0f, 1.0f, 1.0f };
  v4f tttt = _mm_set1_ps (t);           // t t t t
  v4f tt = _mm_unpacklo_ps (one, tttt); // 1 t 1 t
  v4f f0 = c4 * tt;                     // c0 c1*t c2 c3*t
  v4f ha = _mm_hadd_ps (f0, f0) * tt * tt;
  v4f f = _mm_hadd_ps (ha, ha);         // f f f f
  v4f f1 = _mm_unpacklo_ps (f, one);    // f 1 f 1
  v4f tx = load4f (T);                  // t0  t1 t1 inf
  v4f lo = _mm_movelh_ps (tx, tx);      // t0  t1 t0  t1
  v4f hi = _mm_movehl_ps (tx, tx);      // t1 inf t1 inf
  v4f sel = _mm_and_ps (_mm_cmpge_ps (tttt, lo), _mm_cmplt_ps (tttt, hi));
  v4f val = _mm_and_ps (sel, f1);       // f? 1? f? 1?
  return _mm_hadd_ps (val, val);
}
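The Estrin split used above evaluates the cubic as two independent pairs, (c0 + c1*t) + (c2 + c3*t)*t^2, which shortens the dependency chain relative to Horner's rule and maps naturally onto the hadd lanes. Scalar equivalent (our own name):

float estrin_cubic (const float c [4], float t)
{
  float t2 = t * t; // independent of the two pair evaluations
  return (c [0] + c [1] * t) + (c [2] + c [3] * t) * t2;
}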
void FastResampler_FirFilter2_Cn_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	for(unsigned int c = 0; c < channels; ++c) {
		__m128 sum = _mm_setzero_ps();
		__m128 v_frac = _mm_set1_ps(frac);
		float *input2 = input + c;
		for(unsigned int i = 0; i < filter_length / 4; ++i) {
			__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
			coef1 += 4; coef2 += 4;
			__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
			__m128 v_input1 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input2 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input3 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input4 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input = _mm_movelh_ps(_mm_unpacklo_ps(v_input1, v_input2), _mm_unpacklo_ps(v_input3, v_input4));
			sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
		}
		__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
		__m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
		_mm_store_ss(output + c, sum3);
	}
}
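The four _mm_load_ss plus unpacklo/movelh lines in the inner loop are a generic strided gather, worth keeping as a helper (our own name):

/* Gather p[0], p[stride], p[2*stride], p[3*stride] into one register:
 * movelh(unpacklo(a,b), unpacklo(c,d)) = [a0 b0 c0 d0]. */
static inline __m128 gather4_strided(const float* p, int stride) {
	__m128 a = _mm_load_ss(p);
	__m128 b = _mm_load_ss(p + stride);
	__m128 c = _mm_load_ss(p + 2 * stride);
	__m128 d = _mm_load_ss(p + 3 * stride);
	return _mm_movelh_ps(_mm_unpacklo_ps(a, b), _mm_unpacklo_ps(c, d));
}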
Esempio n. 19
/// Test whether this frustum intersects a given axis-aligned bounding box in world space.
///
/// @param[in] rBox  Box to test.
///
/// @return  True if the box intersects this frustum, false if not.
bool Helium::Simd::Frustum::Intersects( const AaBox& rBox ) const
{
    Helium::Simd::Register boxMinVec = rBox.GetMinimum().GetSimdVector();
    Helium::Simd::Register boxMaxVec = rBox.GetMaximum().GetSimdVector();

    Helium::Simd::Register boxX0 = _mm_shuffle_ps( boxMinVec, boxMinVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    Helium::Simd::Register boxX1 = _mm_shuffle_ps( boxMaxVec, boxMaxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    Helium::Simd::Register boxY = _mm_shuffle_ps( boxMinVec, boxMaxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    Helium::Simd::Register boxZ = _mm_unpackhi_ps( boxMinVec, boxMaxVec );
    boxZ = _mm_movelh_ps( boxZ, boxZ );

    PlaneSoa plane;
    Vector3Soa points( boxX0, boxY, boxZ );
    Helium::Simd::Register zeroVec = Helium::Simd::LoadZeros();

    size_t planeCount = ( m_bInfiniteFarClip ? PLANE_FAR : PLANE_MAX );
    for( size_t planeIndex = 0; planeIndex < planeCount; ++planeIndex )
    {
        plane.Load1Splat(
            m_planeA + planeIndex,
            m_planeB + planeIndex,
            m_planeC + planeIndex,
            m_planeD + planeIndex );

        points.m_x = boxX0;
        Helium::Simd::Mask containsPoints0 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec );

        points.m_x = boxX1;
        Helium::Simd::Mask containsPoints1 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec );

        int resultMask = _mm_movemask_ps( Helium::Simd::Or( containsPoints0, containsPoints1 ) );
        if( resultMask == 0 )
        {
            return false;
        }
    }

    return true;
}
Esempio n. 20
0
vec Quat::Axis() const
{
	assume2(this->IsNormalized(), *this, this->Length());
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 6.145 nsecs / 16.88 ticks, Avg: 6.367 nsecs, Worst: 6.529 nsecs
	simd4f cosAngle = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
	simd4f rcpSinAngle = rsqrt_ps(sub_ps(set1_ps(1.f), mul_ps(cosAngle, cosAngle)));
	simd4f a = mul_ps(q, rcpSinAngle);

	// Set the w component to zero.
	simd4f highPart = _mm_unpackhi_ps(a, zero_ps()); // [_ _ 0 z]
	a = _mm_movelh_ps(a, highPart); // [0 z y x]
	return FLOAT4_TO_DIR(a);
#else
	// Best: 6.529 nsecs / 18.152 ticks, Avg: 6.851 nsecs, Worst: 8.065 nsecs

	// Convert cos to sin via the identity sin^2 + cos^2 = 1, and fuse reciprocal and square root to the same instruction,
	// since we are about to divide by it.
	float rcpSinAngle = RSqrt(1.f - w*w);
	return DIR_VEC(x, y, z) * rcpSinAngle;
#endif
}
void decomp_gamma2_plus( spinor_array src, halfspinor_array dst) 
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;


  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero 
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);


  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);

  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);

  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);

#endif
 
  /* Swap the lower components */
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0xb1);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0xb1);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0xb1);

  xmm9 = _mm_xor_ps(xmm6, signs14.vector);
  xmm10 = _mm_xor_ps(xmm7, signs14.vector);
  xmm11 = _mm_xor_ps(xmm8, signs14.vector);

  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);


}
void decomp_gamma0_minus( spinor_array src, halfspinor_array dst) 
{

  /* c <-> color, s <-> spin */

  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;



  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero 
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);


  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

 
  /* Swap the lower components and multiply by -i */
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0x1b);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0x1b);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0x1b);


  xmm9 = _mm_xor_ps(xmm6, signs24.vector);
  xmm10 = _mm_xor_ps(xmm7, signs24.vector);
  xmm11 = _mm_xor_ps(xmm8, signs24.vector);


  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
  
  
}
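The movelh/movehl pair at the top of both decomp_ routines is a merge rather than a swap: it splices the high half of one load onto the low half of another. Isolated (our own name):

/* result = [ a2, a3, b0, b1 ]: high half of a, then low half of b.
 * This is what xmm1 = movelh(zeroed, xmm6); xmm1 = movehl(xmm1, xmm0)
 * computes above, with a = xmm0 and b = xmm6. */
static inline __m128 merge_hi_lo(__m128 a, __m128 b)
{
  __m128 t = _mm_movelh_ps(_mm_setzero_ps(), b); /* [ 0, 0, b0, b1 ] */
  return _mm_movehl_ps(t, a);                    /* [ a2, a3, b0, b1 ] */
}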
Example 23
  template<class Dummy>
  struct call<tag::sort_(tag::simd_<tag::type32_, tag::sse_> ),
              tag::cpu_, Dummy> : callable
  {
    template<class Sig> struct result;
    template<class This,class A0>
    struct result<This(A0)>
      : meta::strip<A0>{};//

    NT2_FUNCTOR_CALL(1)
    {
      typedef typename meta::as_real<A0>::type flt;
      A0 a =  {a0};
      A0 b =  {NT2_CAST(A0, _mm_movehl_ps(NT2_CAST(flt, a0), NT2_CAST(flt, a0)))};
      comp(a, b);
      a = NT2_CAST(A0, _mm_movelh_ps(NT2_CAST(flt, a), NT2_CAST(flt, b)));
      b = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, a), NT2_CAST(flt, b), NT2_SH(1, 3, 1, 3)));
      comp(a, b);
      A0 c = {NT2_CAST(A0, _mm_movelh_ps(NT2_CAST(flt, b), NT2_CAST(flt, b)))};
      A0 d = {a};
      comp(c, d);
      a = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, c), NT2_CAST(flt, a), NT2_SH(3, 2, 0, 0)));
      b = NT2_CAST(A0, _mm_movehl_ps(NT2_CAST(flt, b), NT2_CAST(flt, d)));
      b = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, a), NT2_CAST(flt, b), NT2_SH(3, 1, 0, 2)));
      return b;
    }
  private :
    template < class T > static inline void comp(T & a,T & b)
    {
      T c =  nt2::min(a, b);
      b = nt2::max(a, b);
      a = c;
    }
  };
Example 24
int main()
{
#ifndef __EMSCRIPTEN__
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif

	printf ("{ \"workload\": %u, \"results\": [\n", N);
	assert(N%4 == 0); // Don't care about the tail for now.
	float *src = get_src();//(float*)aligned_alloc(16, N*sizeof(float));
	for(int i = 0; i < N; ++i)
		src[i] = (float)rand() / RAND_MAX;
	float *src2 = get_src2();//(float*)aligned_alloc(16, N*sizeof(float));
	for(int i = 0; i < N; ++i)
		src2[i] = (float)rand() / RAND_MAX;
	float *dst = get_dst();//(float*)aligned_alloc(16, N*sizeof(float));

	float scalarTime;
	SETCHART("load");
	START();
		for(int i = 0; i < N; ++i)
			dst[i] = src[i];
	ENDSCALAR(checksum_dst(dst), "scalar");

	LS_TEST("_mm_load_ps", _mm_load_ps, 0, _mm_store_ps, 0);
	LS_TEST("_mm_load_ps1", _mm_load_ps1, 1, _mm_store_ps, 0);
	LS_TEST("_mm_load_ss", _mm_load_ss, 1, _mm_store_ps, 0);
	LS_TEST("_mm_load1_ps", _mm_load1_ps, 1, _mm_store_ps, 0);
	// _mm_loadh_pi
	// _mm_loadl_pi
	LS_TEST("_mm_loadr_ps", _mm_loadr_ps, 0, _mm_store_ps, 0);
	LS_TEST("_mm_loadu_ps", _mm_loadu_ps, 1, _mm_store_ps, 0);

	SETCHART("set");
	SS_TEST("_mm_set_ps", _mm_set_ps(src[i+2], src[i+1], src[i+5], src[i+0]));
	SS_TEST("_mm_set_ps1", _mm_set_ps1(src[i]));
	SS_TEST("_mm_set_ss", _mm_set_ss(src[i]));
	SS_TEST("_mm_set1_ps", _mm_set1_ps(src[i]));
	SS_TEST("_mm_setr_ps", _mm_set_ps(src[i+2], src[i+1], src[i+5], src[i+0]));
	SS_TEST("_mm_setzero_ps", _mm_setzero_ps());

	SETCHART("move");
	SS_TEST("_mm_move_ss", _mm_move_ss(_mm_load_ps(src+i), _mm_load_ps(src2+i)));
	SS_TEST("_mm_movehl_ps", _mm_movehl_ps(_mm_load_ps(src+i), _mm_load_ps(src2+i)));
	SS_TEST("_mm_movelh_ps", _mm_movelh_ps(_mm_load_ps(src+i), _mm_load_ps(src2+i)));

	SETCHART("store");
	LS_TEST("_mm_store_ps", _mm_load_ps, 0, _mm_store_ps, 0);
	LS_TEST("_mm_store_ps1", _mm_load_ps, 0, _mm_store_ps1, 0);
	LS_TEST("_mm_store_ss", _mm_load_ps, 0, _mm_store_ss, 1);
	LS64_TEST("_mm_storeh_pi", _mm_load_ps, 0, _mm_storeh_pi, 1);
	LS64_TEST("_mm_storel_pi", _mm_load_ps, 0, _mm_storel_pi, 1);
	LS_TEST("_mm_storer_ps", _mm_load_ps, 0, _mm_storer_ps, 0);
	LS_TEST("_mm_storeu_ps", _mm_load_ps, 0, _mm_storeu_ps, 1);
	LS_TEST("_mm_stream_ps", _mm_load_ps, 0, _mm_stream_ps, 0);

	SETCHART("arithmetic");
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] += src2[0]; dst[1] += src2[1]; dst[2] += src2[2]; dst[3] += src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar add");
	BINARYOP_TEST("_mm_add_ps", _mm_add_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_add_ss", _mm_add_ss, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] /= src2[0]; dst[1] /= src2[1]; dst[2] /= src2[2]; dst[3] /= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar div");
	BINARYOP_TEST("_mm_div_ps", _mm_div_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_div_ss", _mm_div_ss, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] *= src2[0]; dst[1] *= src2[1]; dst[2] *= src2[2]; dst[3] *= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar mul");
	BINARYOP_TEST("_mm_mul_ps", _mm_mul_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_mul_ss", _mm_mul_ss, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] -= src2[0]; dst[1] -= src2[1]; dst[2] -= src2[2]; dst[3] -= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar sub");
	BINARYOP_TEST("_mm_sub_ps", _mm_sub_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_sub_ss", _mm_sub_ss, _mm_load_ps(src), _mm_load_ps(src2));

	SETCHART("roots");
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = 1.f / dst[0]; dst[1] = 1.f / dst[1]; dst[2] = 1.f / dst[2]; dst[3] = 1.f / dst[3]; } ENDSCALAR(checksum_dst(dst), "scalar rcp");
	UNARYOP_TEST("_mm_rcp_ps", _mm_rcp_ps, _mm_load_ps(src));
	UNARYOP_TEST("_mm_rcp_ss", _mm_rcp_ss, _mm_load_ps(src));

	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = 1.f / sqrtf(dst[0]); dst[1] = 1.f / sqrtf(dst[1]); dst[2] = 1.f / sqrtf(dst[2]); dst[3] = 1.f / sqrtf(dst[3]); } ENDSCALAR(checksum_dst(dst), "scalar rsqrt");
	UNARYOP_TEST("_mm_rsqrt_ps", _mm_rsqrt_ps, _mm_load_ps(src));
	UNARYOP_TEST("_mm_rsqrt_ss", _mm_rsqrt_ss, _mm_load_ps(src));

	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = sqrtf(dst[0]); dst[1] = sqrtf(dst[1]); dst[2] = sqrtf(dst[2]); dst[3] = sqrtf(dst[3]); } ENDSCALAR(checksum_dst(dst), "scalar sqrt");
	UNARYOP_TEST("_mm_sqrt_ps", _mm_sqrt_ps, _mm_load_ps(src));
	UNARYOP_TEST("_mm_sqrt_ss", _mm_sqrt_ss, _mm_load_ps(src));

	SETCHART("logical");
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastf(fcastu(dst[0]) & fcastu(src2[0])); dst[1] = ucastf(fcastu(dst[1]) & fcastu(src2[1])); dst[2] = ucastf(fcastu(dst[2]) & fcastu(src2[2])); dst[3] = ucastf(fcastu(dst[3]) & fcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar and");
	BINARYOP_TEST("_mm_and_ps", _mm_and_ps, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastf((~fcastu(dst[0])) & fcastu(src2[0])); dst[1] = ucastf((~fcastu(dst[1])) & fcastu(src2[1])); dst[2] = ucastf((~fcastu(dst[2])) & fcastu(src2[2])); dst[3] = ucastf((~fcastu(dst[3])) & fcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar andnot");
	BINARYOP_TEST("_mm_andnot_ps", _mm_andnot_ps, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastf(fcastu(dst[0]) | fcastu(src2[0])); dst[1] = ucastf(fcastu(dst[1]) | fcastu(src2[1])); dst[2] = ucastf(fcastu(dst[2]) | fcastu(src2[2])); dst[3] = ucastf(fcastu(dst[3]) | fcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar or");
	BINARYOP_TEST("_mm_or_ps", _mm_or_ps, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastf(fcastu(dst[0]) ^ fcastu(src2[0])); dst[1] = ucastf(fcastu(dst[1]) ^ fcastu(src2[1])); dst[2] = ucastf(fcastu(dst[2]) ^ fcastu(src2[2])); dst[3] = ucastf(fcastu(dst[3]) ^ fcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar xor");
	BINARYOP_TEST("_mm_xor_ps", _mm_xor_ps, _mm_load_ps(src), _mm_load_ps(src2));

	SETCHART("cmp");
#ifndef __EMSCRIPTEN__ // TODO: Disabled due to https://github.com/kripken/emscripten/issues/2841
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] == src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] == src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] == src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] == src2[3]) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp==");
	BINARYOP_TEST("_mm_cmpeq_ps", _mm_cmpeq_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_cmpeq_ss", _mm_cmpeq_ss, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] >= src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] >= src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] >= src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] >= src2[3]) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>=");
	BINARYOP_TEST("_mm_cmpge_ps", _mm_cmpge_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_cmpge_ss", _mm_cmpge_ss, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] > src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] > src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] > src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] > src2[3]) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>");
	BINARYOP_TEST("_mm_cmpgt_ps", _mm_cmpgt_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_cmpgt_ss", _mm_cmpgt_ss, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] <= src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] <= src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] <= src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] <= src2[3]) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<=");
	BINARYOP_TEST("_mm_cmple_ps", _mm_cmple_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_cmple_ss", _mm_cmple_ss, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] < src2[0]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] < src2[1]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] < src2[2]) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] < src2[3]) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<");
	BINARYOP_TEST("_mm_cmplt_ps", _mm_cmplt_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_cmplt_ss", _mm_cmplt_ss, _mm_load_ps(src), _mm_load_ps(src2));
#endif

	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (!Isnan(dst[0]) && !Isnan(src2[0])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (!Isnan(dst[1]) && !Isnan(src2[1])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (!Isnan(dst[2]) && !Isnan(src2[2])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (!Isnan(dst[3]) && !Isnan(src2[3])) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpord");
	BINARYOP_TEST("_mm_cmpord_ps", _mm_cmpord_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_cmpord_ss", _mm_cmpord_ss, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (Isnan(dst[0]) || Isnan(src2[0])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[1] = (Isnan(dst[1]) || Isnan(src2[1])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[2] = (Isnan(dst[2]) || Isnan(src2[2])) ? ucastf(0xFFFFFFFFU) : 0.f; dst[3] = (Isnan(dst[3]) || Isnan(src2[3])) ? ucastf(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpunord");
	BINARYOP_TEST("_mm_cmpunord_ps", _mm_cmpunord_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_cmpunord_ss", _mm_cmpunord_ss, _mm_load_ps(src), _mm_load_ps(src2));

	SETCHART("max");
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Max(dst[0], src2[0]); dst[1] = Max(dst[1], src2[1]); dst[2] = Max(dst[2], src2[2]); dst[3] = Max(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar max");
	BINARYOP_TEST("_mm_max_ps", _mm_max_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_max_ss", _mm_max_ss, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Min(dst[0], src2[0]); dst[1] = Min(dst[1], src2[1]); dst[2] = Min(dst[2], src2[2]); dst[3] = Min(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar min");
	BINARYOP_TEST("_mm_min_ps", _mm_min_ps, _mm_load_ps(src), _mm_load_ps(src2));
	BINARYOP_TEST("_mm_min_ss", _mm_min_ss, _mm_load_ps(src), _mm_load_ps(src2));

	SETCHART("shuffle");
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[3] = dst[1]; dst[2] = dst[0]; dst[1] = src2[3]; dst[0] = src2[2]; } ENDSCALAR(checksum_dst(dst), "scalar shuffle");
//	BINARYOP_TEST("_mm_shuffle_ps", _mm_shuffle_ps, _mm_load_ps(src), _mm_load_ps(src2));
	START();
		__m128 o0 = _mm_load_ps(src);
		__m128 o1 = _mm_load_ps(src2);
		for(int i = 0; i < N; i += 4)
			o0 = _mm_shuffle_ps(o0, o1, _MM_SHUFFLE(1, 0, 3, 2));
		_mm_store_ps(dst, o0);
	END(checksum_dst(dst), "_mm_shuffle_ps");

	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = dst[2]; dst[1] = src2[2]; dst[2] = dst[3]; dst[3] = src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar unpackhi_ps");
	BINARYOP_TEST("_mm_unpackhi_ps", _mm_unpackhi_ps, _mm_load_ps(src), _mm_load_ps(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[2] = dst[1]; dst[1] = dst[0]; dst[0] = src2[0]; dst[3] = src2[1]; } ENDSCALAR(checksum_dst(dst), "scalar unpacklo_ps");
	BINARYOP_TEST("_mm_unpacklo_ps", _mm_unpacklo_ps, _mm_load_ps(src), _mm_load_ps(src2));
	printf("]}\n");
/*
	printf("Finished!\n");
	printf("Total time spent in scalar intrinsics: %f msecs.\n", (double)scalarTotalTicks * 1000.0 / ticks_per_sec());
	printf("Total time spent in SSE1 intrinsics: %f msecs.\n", (double)simdTotalTicks * 1000.0 / ticks_per_sec());
	if (scalarTotalTicks > simdTotalTicks)
		printf("SSE1 was %.3fx faster than scalar!\n", (double)scalarTotalTicks / simdTotalTicks);
	else
		printf("SSE1 was %.3fx slower than scalar!\n", (double)simdTotalTicks / scalarTotalTicks);
*/
#ifdef __EMSCRIPTEN__
	fprintf(stderr,"User Agent: %s\n", emscripten_run_script_string("navigator.userAgent"));
	printf("/*Test finished! Now please close Firefox to continue with benchmark_sse1.py.*/\n");
#endif
	exit(0);
}
Example 25
/** process, all real work is done here. */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  // this is called for preview and full pipe separately, each with its own pixelpipe piece.
  // get our data struct:
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;

  // adjust to zoom size:
  const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size
  const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood
  if(P <= 1)
  {
    // nothing to do from this distance:
    memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);
    return;
  }

  // adjust to Lab, make L more important
  // float max_L = 100.0f, max_C = 256.0f;
  // float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
  float max_L = 120.0f, max_C = 512.0f;
  float nL = 1.0f/max_L, nC = 1.0f/max_C;
  const float norm2[4] = { nL*nL, nC*nC, nC*nC, 1.0f };

  float *Sa = dt_alloc_align(64, sizeof(float)*roi_out->width*dt_get_num_threads());
  // we want to sum up weights in col[3], so need to init to 0:
  memset(ovoid, 0x0, sizeof(float)*roi_out->width*roi_out->height*4);

  // for each shift vector
  for(int kj=-K;kj<=K;kj++)
  {
    for(int ki=-K;ki<=K;ki++)
    {
      int inited_slide = 0;
      // don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else we will add up errors)
      // do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more memory
#ifdef _OPENMP
#  pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, ivoid, ovoid, Sa)
#endif
      for(int j=0; j<roi_out->height; j++)
      {
        if(j+kj < 0 || j+kj >= roi_out->height) continue;
        float *S = Sa + dt_get_thread_num() * roi_out->width;
        const float *ins = ((float *)ivoid) + 4*(roi_in->width *(j+kj) + ki);
        float *out = ((float *)ovoid) + 4*roi_out->width*j;

        const int Pm = MIN(MIN(P, j+kj), j);
        const int PM = MIN(MIN(P, roi_out->height-1-j-kj), roi_out->height-1-j);
        // first line of every thread
        // TODO: also every once in a while to assert numerical precision!
        if(!inited_slide)
        {
          // sum up a line 
          memset(S, 0x0, sizeof(float)*roi_out->width);
          for(int jj=-Pm;jj<=PM;jj++)
          {
            int i = MAX(0, -ki);
            float *s = S + i;
            const float *inp  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+jj);
            const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+jj+kj) + ki);
            const int last = roi_out->width + MIN(0, -ki);
            for(; i<last; i++, inp+=4, inps+=4, s++)
            {
              for(int k=0;k<3;k++)
                s[0] += (inp[k] - inps[k])*(inp[k] - inps[k]) * norm2[k];
            }
          }
          // only reuse this if we had a full stripe
          if(Pm == P && PM == P) inited_slide = 1;
        }

        // sliding window for this line:
        float *s = S;
        float slide = 0.0f;
        // sum up the first -P..P
        for(int i=0;i<2*P+1;i++) slide += s[i];
        for(int i=0; i<roi_out->width; i++)
        {
          if(i-P > 0 && i+P<roi_out->width)
            slide += s[P] - s[-P-1];
          if(i+ki >= 0 && i+ki < roi_out->width)
          {
            const __m128 iv = { ins[0], ins[1], ins[2], 1.0f };
            _mm_store_ps(out, _mm_load_ps(out) + iv * _mm_set1_ps(gh(slide)));
          }
          s   ++;
          ins += 4;
          out += 4;
        }
        if(inited_slide && j+P+1+MAX(0,kj) < roi_out->height)
        {
          // sliding window in j direction:
          int i = MAX(0, -ki);
          float *s = S + i;
          const float *inp  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+P+1);
          const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+P+1+kj) + ki);
          const float *inm  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j-P);
          const float *inms = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j-P+kj) + ki);
          const int last = roi_out->width + MIN(0, -ki);
          for(; ((unsigned long)s & 0xf) != 0 && i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0;k<3;k++)
              stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
                    -  (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
          /* Process most of the line 4 pixels at a time */
          for(; i<last-4; i+=4, inp+=16, inps+=16, inm+=16, inms+=16, s+=4)
          {
            __m128 sv = _mm_load_ps(s);
            const __m128 inp1 = _mm_load_ps(inp)    - _mm_load_ps(inps);
            const __m128 inp2 = _mm_load_ps(inp+4)  - _mm_load_ps(inps+4);
            const __m128 inp3 = _mm_load_ps(inp+8)  - _mm_load_ps(inps+8);
            const __m128 inp4 = _mm_load_ps(inp+12) - _mm_load_ps(inps+12);

            const __m128 inp12lo = _mm_unpacklo_ps(inp1,inp2);
            const __m128 inp34lo = _mm_unpacklo_ps(inp3,inp4);
            const __m128 inp12hi = _mm_unpackhi_ps(inp1,inp2);
            const __m128 inp34hi = _mm_unpackhi_ps(inp3,inp4);

            const __m128 inpv0 = _mm_movelh_ps(inp12lo,inp34lo);
            sv += inpv0*inpv0 * _mm_set1_ps(norm2[0]);

            const __m128 inpv1 = _mm_movehl_ps(inp34lo,inp12lo);
            sv += inpv1*inpv1 * _mm_set1_ps(norm2[1]);

            const __m128 inpv2 = _mm_movelh_ps(inp12hi,inp34hi);
            sv += inpv2*inpv2 * _mm_set1_ps(norm2[2]);

            const __m128 inm1 = _mm_load_ps(inm)    - _mm_load_ps(inms);
            const __m128 inm2 = _mm_load_ps(inm+4)  - _mm_load_ps(inms+4);
            const __m128 inm3 = _mm_load_ps(inm+8)  - _mm_load_ps(inms+8);
            const __m128 inm4 = _mm_load_ps(inm+12) - _mm_load_ps(inms+12);

            const __m128 inm12lo = _mm_unpacklo_ps(inm1,inm2);
            const __m128 inm34lo = _mm_unpacklo_ps(inm3,inm4);
            const __m128 inm12hi = _mm_unpackhi_ps(inm1,inm2);
            const __m128 inm34hi = _mm_unpackhi_ps(inm3,inm4);

            const __m128 inmv0 = _mm_movelh_ps(inm12lo,inm34lo);
            sv -= inmv0*inmv0 * _mm_set1_ps(norm2[0]);

            const __m128 inmv1 = _mm_movehl_ps(inm34lo,inm12lo);
            sv -= inmv1*inmv1 * _mm_set1_ps(norm2[1]);

            const __m128 inmv2 = _mm_movelh_ps(inm12hi,inm34hi);
            sv -= inmv2*inmv2 * _mm_set1_ps(norm2[2]);

            _mm_store_ps(s, sv);
          }
          for(; i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0;k<3;k++)
              stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
                    -  (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
        }
        else inited_slide = 0;
      }
    }
  }
  // normalize and apply chroma/luma blending
  // bias a bit towards higher values for low input values:
  const __m128 weight = _mm_set_ps(1.0f, powf(d->chroma, 0.6f), powf(d->chroma, 0.6f), powf(d->luma, 0.6f));
  const __m128 invert = _mm_sub_ps(_mm_set1_ps(1.0f), weight);
#ifdef _OPENMP
  #pragma omp parallel for default(none) schedule(static) shared(ovoid,ivoid,roi_out,d)
#endif
  for(int j=0; j<roi_out->height; j++)
  {
    float *out = ((float *)ovoid) + 4*roi_out->width*j;
    float *in  = ((float *)ivoid) + 4*roi_out->width*j;
    for(int i=0; i<roi_out->width; i++)
    {
      _mm_store_ps(out, _mm_add_ps(
          _mm_mul_ps(_mm_load_ps(in),  invert),
          _mm_mul_ps(_mm_load_ps(out), _mm_div_ps(weight, _mm_set1_ps(out[3])))));
      out += 4;
      in  += 4;
    }
  }
  // free shared tmp memory:
  free(Sa);
}
Example 26
void
transform8_otherrgb_avx(ThreadInfo* t)
{
	RS_IMAGE16 *input = t->input;
	GdkPixbuf *output = t->output;
	RS_MATRIX3 *matrix = t->matrix;
	gint x,y;
	gint width;

	float mat_ps[4*4*3] __attribute__ ((aligned (16)));
	for (x = 0; x < 4; x++ ) {
		mat_ps[x] = matrix->coeff[0][0];
		mat_ps[x+4] = matrix->coeff[0][1];
		mat_ps[x+8] = matrix->coeff[0][2];
		mat_ps[12+x] = matrix->coeff[1][0];
		mat_ps[12+x+4] = matrix->coeff[1][1];
		mat_ps[12+x+8] = matrix->coeff[1][2];
		mat_ps[24+x] = matrix->coeff[2][0];
		mat_ps[24+x+4] = matrix->coeff[2][1];
		mat_ps[24+x+8] = matrix->coeff[2][2];
	}
	
	int start_x = t->start_x;
	/* Always have aligned input and output addresses */
	if (start_x & 3)
		start_x = ((start_x) / 4) * 4;
	
	int complete_w = t->end_x - start_x;
	/* If width is not a multiple of 4, check if we can extend it a bit */
	if (complete_w & 3)
	{
		if ((t->end_x+4) < input->w)
			complete_w = ((complete_w+3) / 4 * 4);
	}
	__m128 gamma = _mm_set1_ps(t->output_gamma);

	for(y=t->start_y ; y<t->end_y ; y++)
	{
		gushort *i = GET_PIXEL(input, start_x, y);
		guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
		gboolean aligned_write = !((guintptr)(o)&0xf);

		width = complete_w >> 2;

		while(width--)
		{
			/* Load and convert to float */
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_load_si128((__m128i*)i); // Load two pixels
			__m128i in2 = _mm_load_si128((__m128i*)i+1); // Load two pixels
			_mm_prefetch(i + 64, _MM_HINT_NTA);
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128i p2 =_mm_unpackhi_epi16(in, zero);
			__m128i p3 =_mm_unpacklo_epi16(in2, zero);
			__m128i p4 =_mm_unpackhi_epi16(in2, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);
			__m128 p2f  = _mm_cvtepi32_ps(p2);
			__m128 p3f  = _mm_cvtepi32_ps(p3);
			__m128 p4f  = _mm_cvtepi32_ps(p4);
			
			/* Convert to planar */
			__m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
			__m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
			__m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
			__m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
			__m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
			__m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
			__m128 b = _mm_movelh_ps(b1b0, b3b2);
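			/* Lane diagram for the deinterleave above (low lane first; added note):
			   p1f = r0 g0 b0 a0   p2f = r1 g1 b1 a1   p3f = r2 g2 b2 a2   p4f = r3 g3 b3 a3
			   unpacklo(p1f,p2f) = r0 r1 g0 g1   unpackhi(p1f,p2f) = b0 b1 a0 a1
			   movelh/movehl then gather r = r0 r1 r2 r3, g = g0 g1 g2 g3, b = b0 b1 b2 b3 */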

			/* Apply matrix to convert to sRGB */
			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			/* Normalize to 0->1 and clamp */
			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
			g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2)));
			b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2)));

			/* Apply Gamma */
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
			b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));

			/* Convert to 8 bit unsigned  and interleave*/
			__m128i r_i = _mm_cvtps_epi32(r);
			__m128i g_i = _mm_cvtps_epi32(g);
			__m128i b_i = _mm_cvtps_epi32(b);
			
			r_i = _mm_packs_epi32(r_i, r_i);
			g_i = _mm_packs_epi32(g_i, g_i);
			b_i = _mm_packs_epi32(b_i, b_i);

			/* Set alpha value to 255 and store */
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			__m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
			__m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
			p1 = _mm_unpacklo_epi32(rg_i, bb_i);
			p2 = _mm_unpackhi_epi32(rg_i, bb_i);
	
			p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));

			if (aligned_write)
				_mm_store_si128((__m128i*)o, p1);
			else
				_mm_storeu_si128((__m128i*)o, p1);

			i += 16;
			o += 16;
		}
		/* Process remaining pixels */
		width = complete_w & 3;
		while(width--)
		{
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_loadl_epi64((__m128i*)i); // Load one pixel (4 x 16 bit)
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);

			/* Splat r,g,b */
			__m128 r =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(0,0,0,0));
			__m128 g =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(1,1,1,1));
			__m128 b =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(2,2,2,2));

			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			r = _mm_unpacklo_ps(r2, g2);	// GG RR GG RR
			r = _mm_movelh_ps(r, b2);		// BB BB GG RR

			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r)));
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			
			/* Convert to 8 bit unsigned */
			zero = _mm_setzero_si128();
			__m128i r_i = _mm_cvtps_epi32(r);
			/* To 16 bit signed */
			r_i = _mm_packs_epi32(r_i, zero);
			/* To 8 bit unsigned - set alpha channel*/
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero));
			*(int*)o = _mm_cvtsi128_si32(r_i);
			i+=4;
			o+=4;
		}
	}
}
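
/* sse_matrix3_mul is not shown in this listing. Given the mat_ps layout built
 * above (each coefficient replicated across four floats, 12 floats per output
 * channel), a minimal sketch consistent with the call sites could be the
 * following -- an assumption, not the original implementation: */
static inline __m128 sse_matrix3_mul(const float *m, __m128 r, __m128 g, __m128 b)
{
	__m128 acc = _mm_mul_ps(_mm_load_ps(m), r);               /* coeff[n][0] * r */
	acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(m + 4), g)); /* + coeff[n][1] * g */
	acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(m + 8), b)); /* + coeff[n][2] * b */
	return acc;
}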
Example No. 27
// =============================================================================
//
// sse2_vChirpData
// version by: Alex Kan - SSE2 mods (haddsum removal) BH
//   http://tbp.berkeley.edu/~alexkan/seti/
//
int sse2_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int  ul_NumDataPoints,
  double sample_rate
) {
  int i;

  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
    return 0;
  }

  int vEnd;  
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(srate);
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);
    __m128d x1, y1;

    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);
    d2 = _mm_load_ps(data+4);

    // calculate the input angle
    a1 = _mm_mul_pd(a1, a1);
    a2 = _mm_mul_pd(a2, a2);
    a1 = _mm_mul_pd(a1, rate);
    a2 = _mm_mul_pd(a2, rate);

    // reduce the angle to the range (-0.5, 0.5)
    x1 = _mm_add_pd(a1, roundVal);
    y1 = _mm_add_pd(a2, roundVal);
    x1 = _mm_sub_pd(x1, roundVal);
    y1 = _mm_sub_pd(y1, roundVal);
    a1 = _mm_sub_pd(a1, x1);
    a2 = _mm_sub_pd(a2, y1);

    // convert pair of packed double into packed single
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));
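    // (added note: _mm_cvtpd_ps leaves each converted pair in the low two lanes,
    //  and _mm_movelh_ps merges them into x = [a1_0, a1_1, a2_0, a2_1])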

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations
    s = _mm_mul_ps(y, SS4);
    c = _mm_mul_ps(y, CC3);            
    s = _mm_add_ps(s, SS3);
    c = _mm_add_ps(c, CC2);
    s = _mm_mul_ps(s, y);
    c = _mm_mul_ps(c, y);
    s = _mm_add_ps(s, SS2);
    c = _mm_add_ps(c, CC1);
    s = _mm_mul_ps(s, y);
    c = _mm_mul_ps(c, y);
    s = _mm_add_ps(s, SS1);
    s = _mm_mul_ps(s, x);
    c = _mm_add_ps(c, ONE);

    // perform first angle doubling
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    m = vec_recip2(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));
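    // (added note: after the second doubling below, c^2 + s^2 = (x^2 + y^2)^2,
    //  so scaling c and s by m ~ 1/(x^2 + y^2) puts the pair back on the unit
    //  circle, up to the precision of the reciprocal approximation)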

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    c = _mm_mul_ps(c, m);
    s = _mm_mul_ps(s, m);

/*  c1 c2 c3 c4
    s1 s2 s3 s4

    r1 i1 r2 i2    r3 i3 r4 i4

    r1 * c1  +  (i1 * s1 * -1)
    i1 * c1  +   r1 * s1
    r2 * c2  +  (i2 * s2 * -1)
    i2 * c2  +   r2 * s2
*/

    x = d1;
    y = d2;
    x = _mm_shuffle_ps(x, x, 0xB1);
    y = _mm_shuffle_ps(y, y, 0xB1);
    x = _mm_mul_ps(x, R_NEG);
    y = _mm_mul_ps(y, R_NEG);
    cd1 = _mm_shuffle_ps(c, c, 0x50);  // 01 01 00 00  c4c3c2c1 => c2c2c1c1
    cd2 = _mm_shuffle_ps(c, c, 0xfa);  // 11 11 10 10  c4c3c2c1 => c4c4c3c3
    td1 = _mm_shuffle_ps(s, s, 0x50);
    td2 = _mm_shuffle_ps(s, s, 0xfa);

    cd1 = _mm_mul_ps(cd1, d1);
    cd2 = _mm_mul_ps(cd2, d2);
    td1 = _mm_mul_ps(td1, x);
    td2 = _mm_mul_ps(td2, y);

    cd1 = _mm_add_ps(cd1, td1);
    cd2 = _mm_add_ps(cd2, td2);

    // store chirped values
    _mm_stream_ps(chirped+0, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  _mm_sfence();

  if( i < ul_NumDataPoints) {
    // use the original routine to finish up the tail (at most stride-1 elements)
    v_ChirpData(cx_DataArray+i, cx_ChirpDataArray+i
      , chirp_rate_ind, chirp_rate, ul_NumDataPoints-i, sample_rate);
  }
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

  return 0;
}
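
/* A scalar reference for the vector kernel above -- an illustrative sketch,
   not part of the original source (needs <math.h>). The phase is accumulated
   in turns, reduced, and the sample rotated by exp(2*pi*i*turns). */
static void chirp_one_scalar(const float in[2], float out[2], double srate, int i)
{
  double turns = srate * (double)i * (double)i;  // phase in turns
  turns -= floor(turns + 0.5);                   // reduce to [-0.5, 0.5)
  double c = cos(2.0 * M_PI * turns);
  double s = sin(2.0 * M_PI * turns);
  out[0] = (float)(in[0] * c - in[1] * s);       // re' = re*c - im*s
  out[1] = (float)(in[0] * s + in[1] * c);       // im' = re*s + im*c
}
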
void decomp_gamma3_plus( spinor_array src, halfspinor_array dst) 
{
  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  __m128 xmm6;
  __m128 xmm7;


  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // zero the registers
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);


  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);

  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);

  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);

#endif
 
  /* add upper and lower components */
  xmm0 = _mm_add_ps(xmm0, xmm3);
  xmm1 = _mm_add_ps(xmm1, xmm4);
  xmm2 = _mm_add_ps(xmm2, xmm5);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);


}
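/* Added note: in lattice-QCD terms this routine applies the spin projector
   (1 + gamma_3)/2 to a four-spinor; in this basis each upper-spin component
   is simply summed with its matching lower-spin component, which is why the
   body reduces to three vector adds once both halves are shuffled identically. */
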
// =============================================================================
//
// sse3_vChirpData
// version by: Alex Kan
//   http://tbp.berkeley.edu/~alexkan/seti/
//
int sse3_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int  ul_NumDataPoints,
  double sample_rate
) {
  int i;

  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
    return 0;
  }

  int vEnd;  
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(srate);
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);

    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);
    d2 = _mm_load_ps(data+4);

    // calculate the input angle
    a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate);
    a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate);

    // reduce the angle to the range (-0.5, 0.5)
    a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
    a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

    // convert pair of packed double into packed single
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations
    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4),
                                    SS3),
                                y),
                          SS2),
                    y),
              SS1),
          x);
    c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3),
                                CC2),
                          y),
                    CC1),
              y),
          ONE);

    // perform first angle doubling
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    s = _mm_mul_ps(s, m);
    c = _mm_mul_ps(c, m);

    // chirp the data
    cd1 = _mm_shuffle_ps(c, c, 0x50);
    cd2 = _mm_shuffle_ps(c, c, 0xfa);
    cd1 = _mm_mul_ps(cd1, d1);
    cd2 = _mm_mul_ps(cd2, d2);
    d1 = _mm_shuffle_ps(d1, d1, 0xb1);
    d2 = _mm_shuffle_ps(d2, d2, 0xb1);
    td1 = _mm_shuffle_ps(s, s, 0x50);
    td2 = _mm_shuffle_ps(s, s, 0xfa);
    td1 = _mm_mul_ps(td1, d1);
    td2 = _mm_mul_ps(td2, d2);
    cd1 = _mm_addsub_ps(cd1, td1);
    cd2 = _mm_addsub_ps(cd2, td2);

    // store chirped values
    _mm_stream_ps(chirped, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  _mm_sfence();

  // handle tail elements with scalar code
  for (   ; i < ul_NumDataPoints; ++i) {
    double angle = srate * i * i;                 // phase in turns, as in the vector loop
    angle = (angle - floor(angle)) * 2.0 * M_PI;  // convert to radians
    double s = sin(angle);
    double c = cos(angle);
    float re = cx_DataArray[i][0];
    float im = cx_DataArray[i][1];

    cx_ChirpDataArray[i][0] = re * c - im * s;
    cx_ChirpDataArray[i][1] = re * s + im * c;
  }
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

  return 0;
}
Example No. 30
//---------------------------------------------------------------------------
void tRisaPhaseVocoderDSP::ProcessCore_sse(int ch)
{
	unsigned int framesize_d2 = FrameSize / 2;
	float * analwork = AnalWork[ch];
	float * synthwork = SynthWork[ch];

	// Set the rounding mode
	SetRoundingModeToNearest_SSE();

	// Run the FFT
	rdft(FrameSize, 1, analwork, FFTWorkIp, FFTWorkW); // Real DFT
	analwork[1] = 0.0; // analwork[1] = Nyquist freq. power (unusable anyway, so zero it)

	__m128 exact_time_scale = _mm_load1_ps(&ExactTimeScale);
	__m128 over_sampling_radian_v = _mm_load1_ps(&OverSamplingRadian);

	if(FrequencyScale != 1.0)
	{
		// Here we process 4 complex numbers (8 floats) at a time.
		__m128 over_sampling_radian_recp = _mm_load1_ps(&OverSamplingRadianRecp);
		__m128 frequency_per_filter_band = _mm_load1_ps(&FrequencyPerFilterBand);
		__m128 frequency_per_filter_band_recp = _mm_load1_ps(&FrequencyPerFilterBandRecp);

		for(unsigned int i = 0; i < framesize_d2; i += 4)
		{
			// De-interleave + convert rectangular to polar coordinates
			__m128 aw3120 = *(__m128*)(analwork + i*2    );
			__m128 aw7654 = *(__m128*)(analwork + i*2 + 4);

			__m128 re3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(2,0,2,0));
			__m128 im3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(3,1,3,1));

			__m128 mag = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(re3210,re3210), _mm_mul_ps(im3210,im3210)));
			__m128 ang = VFast_arctan2_F4_SSE(im3210, re3210);

			// Take the difference from the previous phase
			__m128 lastp = *(__m128*)(LastAnalPhase[ch] + i);
			*(__m128*)(LastAnalPhase[ch] + i) = ang;
			ang = _mm_sub_ps(lastp, ang);

			// Account for the effect of oversampling
			__m128 i_3210;
			i_3210 = _mm_cvtsi32_ss(i_3210, i);
			i_3210 = _mm_shuffle_ps(i_3210, i_3210, _MM_SHUFFLE(0,0,0,0));
			i_3210 = _mm_add_ps( i_3210, PM128(PFV_INIT) );

			__m128 phase_shift = _mm_mul_ps(i_3210, over_sampling_radian_v);
			ang = _mm_sub_ps( ang, phase_shift );

			// Unwrap the phase
			ang = Wrap_Pi_F4_SSE(ang);

			// Convert -M_PI..+M_PI into a displacement of -1.0..+1.0
			ang = _mm_mul_ps( ang, over_sampling_radian_recp );

			// Convert that displacement into a frequency offset from the
			// filter band center, then add the band's center frequency
			__m128 freq = _mm_mul_ps( _mm_add_ps(ang, i_3210), frequency_per_filter_band );

			// Store the values back into analwork
			re3210 = mag;
			im3210 = freq;
			__m128 im10re10 = _mm_movelh_ps(re3210, im3210);
			__m128 im32re32 = _mm_movehl_ps(im3210, re3210);
			__m128 im1re1im0re0 = _mm_shuffle_ps(im10re10, im10re10, _MM_SHUFFLE(3,1,2,0));
			__m128 im3re3im2re2 = _mm_shuffle_ps(im32re32, im32re32, _MM_SHUFFLE(3,1,2,0));
			*(__m128*)(analwork + i*2    ) = im1re1im0re0;
			*(__m128*)(analwork + i*2 + 4) = im3re3im2re2;
		}


		//------------------------------------------------
		// Conversion
		//------------------------------------------------
		// Resample along the frequency axis
		float FrequencyScale_rcp = 1.0f / FrequencyScale;
		for(unsigned int i = 0; i < framesize_d2; i ++)
		{
			// Get the source index corresponding to i
			float fi = i * FrequencyScale_rcp;

			// Interpolate linearly between floor(fi) and floor(fi) + 1
			unsigned int index = static_cast<unsigned int>(fi); // floor
			float frac = fi - index;

			if(index + 1 < framesize_d2)
			{
				synthwork[i*2  ] =
					analwork[index*2  ] +
					frac * (analwork[index*2+2]-analwork[index*2  ]);
				synthwork[i*2+1] =
					FrequencyScale * (
					analwork[index*2+1] +
					frac * (analwork[index*2+3]-analwork[index*2+1]) );
			}
			else if(index < framesize_d2)
			{
				synthwork[i*2  ] = analwork[index*2  ];
				synthwork[i*2+1] = analwork[index*2+1] * FrequencyScale;
			}
			else
			{
				synthwork[i*2  ] = 0.0;
				synthwork[i*2+1] = 0.0;
			}
		}

		//------------------------------------------------
		// Synthesis
		//------------------------------------------------

		// Convert each filter band;
		// essentially the inverse of the analysis above
		for(unsigned int i = 0; i < framesize_d2; i += 4)
		{
			// De-interleave
			__m128 sw3120 = *(__m128*)(synthwork + i*2    );
			__m128 sw7654 = *(__m128*)(synthwork + i*2 + 4);

			__m128 mag  = _mm_shuffle_ps(sw3120, sw7654, _MM_SHUFFLE(2,0,2,0));
			__m128 freq = _mm_shuffle_ps(sw3120, sw7654, _MM_SHUFFLE(3,1,3,1));

			// Prepare i+3 i+2 i+1 i+0
			__m128 i_3210;
			i_3210 = _mm_cvtsi32_ss(i_3210, i);
			i_3210 = _mm_shuffle_ps(i_3210, i_3210, _MM_SHUFFLE(0,0,0,0));
			i_3210 = _mm_add_ps(i_3210, PM128(PFV_INIT));

			// Subtract each filter band's center frequency from the
			// frequency, converting it into a -1.0..+1.0 displacement
			// from the band center
			__m128 ang = _mm_sub_ps(_mm_mul_ps(freq, frequency_per_filter_band_recp), i_3210);

			// Convert the -1.0..+1.0 displacement into a phase of -M_PI..+M_PI
			ang = _mm_mul_ps( ang, over_sampling_radian_v );

			// Phase correction for the oversampling
			ang = _mm_add_ps( ang, _mm_mul_ps( i_3210, over_sampling_radian_v ) );

			// Phase correction for the time scale
			ang = _mm_mul_ps( ang, exact_time_scale );

			// Accumulate with the previous phase
			// (the sign of the imaginary part is inverted here too, hence the subtraction)
			ang = _mm_sub_ps( *(__m128*)(LastSynthPhase[ch] + i), ang );
			*(__m128*)(LastSynthPhase[ch] + i) = ang;

			// Convert polar to rectangular coordinates
			__m128 sin, cos;
			VFast_sincos_F4_SSE(ang, sin, cos);
			__m128 re3210 = _mm_mul_ps( mag, cos );
			__m128 im3210 = _mm_mul_ps( mag, sin );

			// Re-interleave
			__m128 im10re10 = _mm_movelh_ps(re3210, im3210);
			__m128 im32re32 = _mm_movehl_ps(im3210, re3210);
			__m128 im1re1im0re0 = _mm_shuffle_ps(im10re10, im10re10, _MM_SHUFFLE(3,1,2,0));
			__m128 im3re3im2re2 = _mm_shuffle_ps(im32re32, im32re32, _MM_SHUFFLE(3,1,2,0));
			*(__m128*)(synthwork + i*2    ) = im1re1im0re0;
			*(__m128*)(synthwork + i*2 + 4) = im3re3im2re2;
		}
	}
	else
	{
		// Case with no shift along the frequency axis
		// Here too, process 4 complex numbers (8 floats) at a time.
		for(unsigned int i = 0; i < framesize_d2; i += 4)
		{
			// De-interleave + convert rectangular to polar coordinates
			__m128 aw3120 = *(__m128*)(analwork + i*2    );
			__m128 aw7654 = *(__m128*)(analwork + i*2 + 4);

			__m128 re3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(2,0,2,0));
			__m128 im3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(3,1,3,1));

			__m128 mag = _mm_sqrt_ps( _mm_add_ps(_mm_mul_ps(re3210,re3210), _mm_mul_ps(im3210,im3210)) );
			__m128 ang = VFast_arctan2_F4_SSE(im3210, re3210);

			// Take the difference from the previous phase
			__m128 lastp = *(__m128*)(LastAnalPhase[ch] + i);
			*(__m128*)(LastAnalPhase[ch] + i) = ang;
			ang = _mm_sub_ps( lastp, ang );

			// Account for the effect of oversampling
			__m128 i_3210;
			i_3210 = _mm_cvtsi32_ss(i_3210, i);
			i_3210 = _mm_shuffle_ps(i_3210, i_3210, _MM_SHUFFLE(0,0,0,0));
			i_3210 = _mm_add_ps( i_3210, PM128(PFV_INIT) );

			__m128 phase_shift = _mm_mul_ps( i_3210, over_sampling_radian_v );
			ang = _mm_sub_ps( ang, phase_shift );

			// Unwrap the phase
			ang = Wrap_Pi_F4_SSE(ang);

			// Phase correction for the oversampling
			ang = _mm_add_ps( ang, phase_shift );

			// Phase correction for the time scale
			ang = _mm_mul_ps( ang, exact_time_scale );

			// Accumulate with the previous phase
			// (the sign of the imaginary part is inverted here too, hence the subtraction)
			ang = _mm_sub_ps( *(__m128*)(LastSynthPhase[ch] + i), ang );
			*(__m128*)(LastSynthPhase[ch] + i) = ang;

			// Convert polar to rectangular coordinates
			__m128 sin, cos;
			VFast_sincos_F4_SSE(ang, sin, cos);
			re3210 = _mm_mul_ps( mag, cos );
			im3210 = _mm_mul_ps( mag, sin );

			// Re-interleave
			__m128 im10re10 = _mm_movelh_ps(re3210, im3210);
			__m128 im32re32 = _mm_movehl_ps(im3210, re3210);
			__m128 im1re1im0re0 = _mm_shuffle_ps(im10re10, im10re10, _MM_SHUFFLE(3,1,2,0));
			__m128 im3re3im2re2 = _mm_shuffle_ps(im32re32, im32re32, _MM_SHUFFLE(3,1,2,0));
			*(__m128*)(synthwork + i*2    ) = im1re1im0re0;
			*(__m128*)(synthwork + i*2 + 4) = im3re3im2re2;
		}
	}

	// Run the inverse FFT
	synthwork[1] = 0.0; // synthwork[1] = Nyquist freq. power (unusable anyway, so zero it)
	rdft_sse(FrameSize, -1, synthwork, FFTWorkIp, FFTWorkW); // Inverse Real DFT
}
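
/* A scalar sketch of one analysis bin from the loops above, under the same
   member meanings (OverSamplingRadian, FrequencyPerFilterBand); added for
   clarity, with an illustrative helper name. Needs <math.h>. */
static float analyze_bin_scalar(float re, float im, float *last_phase, unsigned int i,
                                float over_sampling_radian, float freq_per_band)
{
	float ang = atan2f(im, re);
	float d = *last_phase - ang;           // phase delta (inverted sign, as above)
	*last_phase = ang;
	d -= (float)i * over_sampling_radian;  // remove the expected per-hop phase advance
	d -= floorf(d / (2.0f * (float)M_PI) + 0.5f) * 2.0f * (float)M_PI; // wrap to [-pi, pi)
	d /= over_sampling_radian;             // displacement within the band, -1..+1
	return ((float)i + d) * freq_per_band; // estimated true frequency of this bin
}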