Exemple #1
0
	// ----------------------------------------------------------
	//  Name:   matrix::MaxValue
	//  Desc:   Returns the largest absolute value (magnitude)
	//				found among the 4x4 elements of the matrix.
	//				Both build variants compare magnitudes, so
	//				they agree on matrices with negative entries.
	// ----------------------------------------------------------
	float matrix::MaxValue() {
#ifdef _M_IX86
		// Lane-wise maximum of the magnitudes held in the four
		// vector members. _mm_abs_ps is a project helper
		// (presumably clears the sign bit of each lane).
		F32vec4 max1 = _mm_max_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2));
		F32vec4 max2 = _mm_max_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4));
		F32vec4 max = _mm_max_ps(max1, max2);
		// Horizontal reduction: fold lanes 2,3 onto lanes 0,1 ...
		max = _mm_max_ps(max, _mm_movehl_ps(max,max));
		// ... then fold lane 1 onto lane 0.
		max = _mm_max_ss(max, _mm_shuffle_ps(max,max,0x01));
		return max[0];
#else
		// Portable path: compare magnitudes, not signed values, so
		// the result matches the SIMD path above. The magnitude is
		// computed inline to avoid depending on any extra header.
		const float first = this->operator()(0, 0);
		float max = (first < 0.0f) ? -first : first;
		for (int i = 0; i < 4; ++i) {
			for (int j = 0; j < 4; ++j) {
				const float value = this->operator()(i, j);
				const float magnitude = (value < 0.0f) ? -value : value;
				if (magnitude > max)
					max = magnitude;
			}
		}
		return max;
#endif // _M_IX86
	}
Exemple #2
0
	// ----------------------------------------------------------
	//  Name:   matrix::MinValue
	//  Desc:   Returns the smallest absolute value (magnitude)
	//				found among the 4x4 elements of the matrix.
	//				Both build variants compare magnitudes, so
	//				they agree on matrices with negative entries.
	// ----------------------------------------------------------
	float matrix::MinValue() {
#ifdef _M_IX86
		// Lane-wise minimum of the magnitudes held in the four
		// vector members. _mm_abs_ps is a project helper
		// (presumably clears the sign bit of each lane).
		F32vec4 min1 = _mm_min_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2));
		F32vec4 min2 = _mm_min_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4));
		F32vec4 min = _mm_min_ps(min1, min2);
		// Horizontal reduction: fold lanes 2,3 onto lanes 0,1 ...
		min = _mm_min_ps(min, _mm_movehl_ps(min,min));
		// ... then fold lane 1 onto lane 0.
		min = _mm_min_ss(min, _mm_shuffle_ps(min,min,0x01));
		return min[0];
#else
		// Portable path: compare magnitudes, not signed values, so
		// the result matches the SIMD path above. The magnitude is
		// computed inline to avoid depending on any extra header.
		const float first = this->operator()(0, 0);
		float min = (first < 0.0f) ? -first : first;
		for (int i = 0; i < 4; ++i) {
			for (int j = 0; j < 4; ++j) {
				const float value = this->operator()(i, j);
				const float magnitude = (value < 0.0f) ? -value : value;
				if (magnitude < min)
					min = magnitude;
			}
		}
		return min;
#endif // _M_IX86
	}
Exemple #3
0
/** Approximate sine of four packed floats in one call (SSE).
 *
 * Evaluates, per lane,
 *   y      = (4 / pi^2) * t * (pi - |t|)
 *   result = 0.225 * (y * |y| - y) + y
 * i.e. a parabola through 0 and +/-pi refined by a quadratic correction.
 * In exact arithmetic this yields 0 at t = 0, +/-pi and +/-1 at t = +/-pi/2.
 *
 * The formula is only meaningful for t in [-pi, pi]; no range reduction is
 * performed here, so callers must wrap the argument beforehand.
 *
 * Cost as written: 4 multiplies, 3 add/sub, 2 absolute values.
 * NOTE(review): the original author quoted a relative error of about 1%
 * from memory; this has not been re-verified.
 *
 * @param t Four angles in radians, expected within [-pi, pi]
 * @return Four approximate sine values, lane for lane
 */
static inline __m128
sinf_fast_sse(__m128 t)
{
    const __m128 parabola_scale = _mm_set1_ps(static_cast<float>(4.f/(M_PI*M_PI)));
    const __m128 blend          = _mm_set1_ps(0.225f);
    const __m128 pi_lanes       = _mm_set1_ps(static_cast<float>(M_PI));

    // First-stage parabola: y = scale * (t * (pi - |t|)).
    // _mm_abs_ps is a project helper defined outside this block.
    const __m128 y = _mm_mul_ps(
        parabola_scale,
        _mm_mul_ps(t, _mm_sub_ps(pi_lanes, _mm_abs_ps(t))));

    // Correction term: blend * (y*|y| - y); y*|y| keeps the sign of y.
    const __m128 correction = _mm_mul_ps(
        blend,
        _mm_sub_ps(_mm_mul_ps(y, _mm_abs_ps(y)), y));

    return _mm_add_ps(correction, y);
}
Exemple #4
0
/** Piecewise-cubic (bicubic) filter weight for four packed offsets (SSE).
 *
 * With x = |t|, each lane evaluates
 *   x <= 1 : 0.5 * (x * (3*x^2 - 5*x) + 2)
 *   x >  1 : 0.5 * (x * (-x^2 + 5*x - 8) + 4)
 * Both polynomials are computed for every lane and the right one is picked
 * with comparison masks, so the function is branch-free.
 *
 * NOTE(review): there is no cutoff for x >= 2; such lanes still receive the
 * second polynomial. Presumably callers only pass offsets inside the kernel
 * support - confirm. Lanes holding NaN fail both comparisons and yield 0.
 *
 * @param width Not used by this kernel (kept for a uniform filter signature)
 * @param t     Four sample offsets
 * @return Four filter weights, lane for lane
 */
static inline __m128
bicubic_sse(__m128 width, __m128 t)
{
    (void)width;

    const __m128 k_half  = _mm_set1_ps(.5f);
    const __m128 k_one   = _mm_set1_ps(1.f);
    const __m128 k_two   = _mm_set1_ps(2.f);
    const __m128 k_three = _mm_set1_ps(3.f);
    const __m128 k_four  = _mm_set1_ps(4.f);
    const __m128 k_five  = _mm_set1_ps(5.f);
    const __m128 k_eight = _mm_set1_ps(8.f);

    // _mm_abs_ps is a project helper defined outside this block.
    const __m128 x      = _mm_abs_ps(t);
    const __m128 x_sq   = _mm_mul_ps(x, x);
    const __m128 five_x = _mm_mul_ps(k_five, x);   // shared by both pieces

    // Inner piece (x <= 1): 0.5 * (x * (3*x^2 - 5*x) + 2)
    __m128 inner = _mm_sub_ps(_mm_mul_ps(k_three, x_sq), five_x);
    inner = _mm_mul_ps(x, inner);
    inner = _mm_add_ps(inner, k_two);
    inner = _mm_mul_ps(k_half, inner);

    // Outer piece (x > 1): 0.5 * (x * ((0 - x^2) + (5*x - 8)) + 4)
    __m128 outer = _mm_add_ps(_mm_sub_ps(_mm_setzero_ps(), x_sq),
                              _mm_sub_ps(five_x, k_eight));
    outer = _mm_mul_ps(x, outer);
    outer = _mm_add_ps(outer, k_four);
    outer = _mm_mul_ps(outer, k_half);

    // Two complementary masks (rather than one mask plus its inverse) so
    // that unordered lanes are dropped from both pieces.
    const __m128 take_inner = _mm_cmple_ps(x, k_one);
    const __m128 take_outer = _mm_cmpgt_ps(x, k_one);

    return _mm_or_ps(_mm_and_ps(take_inner, inner),
                     _mm_and_ps(take_outer, outer));
}
Exemple #5
0
/** Linear (tent) filter weight for four packed offsets (SSE).
 *
 * Each lane evaluates 1 - |t|. The result is not clamped, so lanes with
 * |t| > 1 come out negative.
 *
 * @param width Not used by this kernel (kept for a uniform filter signature)
 * @param t     Four sample offsets
 * @return Four filter weights, lane for lane
 */
static inline __m128
bilinear_sse(__m128 width, __m128 t)
{
    (void)width;

    const __m128 unit = _mm_set1_ps(1.f);
    // _mm_abs_ps is a project helper defined outside this block.
    const __m128 distance = _mm_abs_ps(t);
    return _mm_sub_ps(unit, distance);
}