// ---------------------------------------------------------- // Name: matrix::MaxValue // Desc: Returns the asbolute maximum element of the // matrix. // ---------------------------------------------------------- float matrix::MaxValue() { #ifdef _M_IX86 F32vec4 max1 = _mm_max_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2)); F32vec4 max2 = _mm_max_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4)); F32vec4 max = _mm_max_ps(max1, max2); max = _mm_max_ps(max, _mm_movehl_ps(max,max)); max = _mm_max_ss(max, _mm_shuffle_ps(max,max,0x01)); return max[0]; #else float max = this->operator()(0,0); for (int i = 0; i < 4; ++i) for (int j = 0; j < 4; ++j) if (this->operator()(i, j) > max) max = this->operator()(i, j); return max; #endif // _M_IX86 }
// ---------------------------------------------------------- // Name: matrix::MinValue // Desc: Returns the asbolute minimum element of the // matrix. // ---------------------------------------------------------- float matrix::MinValue() { #ifdef _M_IX86 F32vec4 min1 = _mm_min_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2)); F32vec4 min2 = _mm_min_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4)); F32vec4 min = _mm_min_ps(min1, min2); min = _mm_min_ps(min, _mm_movehl_ps(min,min)); min = _mm_min_ss(min, _mm_shuffle_ps(min,min,0x01)); return min[0]; #else float min = this->operator()(0, 0); for (int i = 0; i < 4; ++i) for (int j = 0; j < 4; ++j) if (this->operator()(i, j) < min) min = this->operator()(i, j); return min; #endif // _M_IX86 }
/** Compute an approximate sine (SSE version, four sines a call). * This function behaves correctly for the range [-pi pi] only. * It has the following properties: * <ul> * <li>It has exact values for 0, pi/2, pi, -pi/2, -pi</li> * <li>It has matching derivatives to sine for these same points</li> * <li>Its relative error margin is <= 1% iirc</li> * <li>It computational cost is 5 mults + 3 adds + 2 abs</li> * </ul> * @param t Radian parameter * @return guess what */ static inline __m128 sinf_fast_sse(__m128 t) { static const __m128 a = {4.f/(M_PI*M_PI), 4.f/(M_PI*M_PI), 4.f/(M_PI*M_PI), 4.f/(M_PI*M_PI)}; static const __m128 p = {0.225f, 0.225f, 0.225f, 0.225f}; static const __m128 pi = {M_PI, M_PI, M_PI, M_PI}; // m4 = a*t*(M_PI - fabsf(t)); __m128 m1 = _mm_abs_ps(t); __m128 m2 = _mm_sub_ps(pi, m1); __m128 m3 = _mm_mul_ps(t, m2); __m128 m4 = _mm_mul_ps(a, m3); // p*(m4*fabsf(m4) - m4) + m4; __m128 n1 = _mm_abs_ps(m4); __m128 n2 = _mm_mul_ps(m4, n1); __m128 n3 = _mm_sub_ps(n2, m4); __m128 n4 = _mm_mul_ps(p, n3); return _mm_add_ps(n4, m4); }
/**
 * Evaluate a piecewise cubic filter kernel on four packed floats (SSE).
 *
 * With x = |t|, each lane evaluates to:
 * <ul>
 * <li>x <= 1 : 0.5 * (x * (3*x^2 - 5*x) + 2)</li>
 * <li>x >  1 : 0.5 * (x * (-x^2 + 5*x - 8) + 4)</li>
 * </ul>
 * Both polynomials are computed for every lane and the right one is picked
 * with comparison masks. A lane for which neither comparison holds (NaN)
 * comes out as all-zero bits. No cutoff is applied for x >= 2; limiting the
 * input to the kernel support is left to the caller.
 *
 * @param width Not used by this kernel.
 * @param t     Four sample offsets; only their absolute value matters.
 * @return Four kernel weights, lane for lane.
 */
static inline __m128 bicubic_sse(__m128 width, __m128 t)
{
    const __m128 k_half  = _mm_set1_ps(.5f);
    const __m128 k_one   = _mm_set1_ps(1.f);
    const __m128 k_two   = _mm_set1_ps(2.f);
    const __m128 k_three = _mm_set1_ps(3.f);
    const __m128 k_four  = _mm_set1_ps(4.f);
    const __m128 k_five  = _mm_set1_ps(5.f);
    const __m128 k_eight = _mm_set1_ps(8.f);

    const __m128 x  = _mm_abs_ps(t);
    const __m128 x2 = _mm_mul_ps(x, x);
    const __m128 x5 = _mm_mul_ps(k_five, x);   // shared by both branches

    // Outer branch (x > 1): 0.5 * (x * (-x^2 + (5x - 8)) + 4)
    const __m128 neg_x2     = _mm_sub_ps(_mm_setzero_ps(), x2);
    const __m128 outer_poly = _mm_add_ps(neg_x2, _mm_sub_ps(x5, k_eight));
    const __m128 outer_sum  = _mm_add_ps(_mm_mul_ps(x, outer_poly), k_four);
    const __m128 outer      = _mm_mul_ps(outer_sum, k_half);

    // Inner branch (x <= 1): 0.5 * (x * (3x^2 - 5x) + 2)
    const __m128 inner_poly = _mm_sub_ps(_mm_mul_ps(k_three, x2), x5);
    const __m128 inner_sum  = _mm_add_ps(_mm_mul_ps(x, inner_poly), k_two);
    const __m128 inner      = _mm_mul_ps(k_half, inner_sum);

    // Two separate masks (rather than one mask and its complement) so that
    // NaN lanes match neither branch and are zeroed.
    const __m128 pick_inner = _mm_cmple_ps(x, k_one);
    const __m128 pick_outer = _mm_cmpgt_ps(x, k_one);

    return _mm_or_ps(_mm_and_ps(pick_inner, inner),
                     _mm_and_ps(pick_outer, outer));
}
/**
 * Evaluate the linear ("tent") filter kernel on four packed floats (SSE).
 *
 * Each lane evaluates to 1 - |t|. The result is not clamped, so lanes with
 * |t| > 1 come out negative.
 *
 * @param width Not used by this kernel.
 * @param t     Four sample offsets.
 * @return Four kernel weights, lane for lane.
 */
static inline __m128 bilinear_sse(__m128 width, __m128 t)
{
    const __m128 unit      = _mm_set1_ps(1.f);
    const __m128 magnitude = _mm_abs_ps(t);
    return _mm_sub_ps(unit, magnitude);
}