#include <immintrin.h>
#include <cstddef>

// Round n down to the nearest multiple of m (m must be a power of two).
#define ROUND_DOWN(n, m) ((n) & ~static_cast<std::size_t>((m) - 1))

void SubVectorsSIMD(float* c, const float* a, const float* b, std::size_t n)
{
    std::size_t i = 0;
    // Subtract four floats per iteration; unaligned loads/stores, so no
    // alignment requirement on the inputs.
    for (; i < ROUND_DOWN(n, 4); i += 4) {
        __m128 ma = _mm_loadu_ps(a + i);
        __m128 mb = _mm_loadu_ps(b + i);
        __m128 mc = _mm_sub_ps(ma, mb);
        _mm_storeu_ps(c + i, mc);
    }
    // Scalar tail for the remaining 0-3 elements.
    for (; i < n; i++)
        c[i] = a[i] - b[i];
}
static void blendMatrices(MMatrix4x4 *matrix, const MMatrix4x4 *skinMatrix, const float weight)
{
    __m128 w = _mm_set1_ps(weight);
    // matrix += skinMatrix * weight, four entries at a time.
    for (int i = 0; i < 16; i += 4) {
        __m128 a = _mm_loadu_ps(matrix->entries + i);
        __m128 b = _mm_loadu_ps(skinMatrix->entries + i);
        __m128 c = _mm_mul_ps(b, w);
        __m128 d = _mm_add_ps(a, c);
        _mm_storeu_ps(matrix->entries + i, d);
    }
}
static inline void
transpose_sse (gfloat *src, gfloat *dst, const int width, const int height)
{
  /* Transpose one 4x4 tile: src rows are 'height' floats apart,
   * dst rows are 'width' floats apart. */
  __m128 row1 = _mm_loadu_ps (src);
  __m128 row2 = _mm_loadu_ps (src + height);
  __m128 row3 = _mm_loadu_ps (src + 2 * height);
  __m128 row4 = _mm_loadu_ps (src + 3 * height);

  _MM_TRANSPOSE4_PS (row1, row2, row3, row4);

  _mm_storeu_ps (dst, row1);
  _mm_storeu_ps (dst + width, row2);
  _mm_storeu_ps (dst + 2 * width, row3);
  _mm_storeu_ps (dst + 3 * width, row4);
}
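/*
 * A minimal driver sketch (illustrative, not from the original source):
 * walks transpose_sse over a full matrix, assuming both dimensions are
 * multiples of 4. Here src has 'width' rows of 'height' floats, matching
 * the strides the kernel above uses.
 */
static void
transpose_full (gfloat *src, gfloat *dst, const int width, const int height)
{
  for (int r = 0; r < width; r += 4)
    for (int c = 0; c < height; c += 4)
      transpose_sse (src + r * height + c, dst + c * width + r, width, height);
}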
static void process_sinc(rarch_resampler_t *resamp, float *out_buffer)
{
   __m128 sum_l = _mm_setzero_ps();
   __m128 sum_r = _mm_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned phase = resamp->time >> PHASES_SHIFT;
   unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
   __m128 delta_f = _mm_set1_ps((float)delta);

   const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
   const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];

   for (unsigned i = 0; i < TAPS; i += 4)
   {
      __m128 buf_l  = _mm_loadu_ps(buffer_l + i);
      __m128 buf_r  = _mm_loadu_ps(buffer_r + i);
      __m128 phases = _mm_load_ps(phase_table + i);
      __m128 deltas = _mm_load_ps(delta_table + i);

      // Interpolate the sinc coefficient between the two phase tables.
      __m128 sinc = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));
      sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
      sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
   }

   // Them annoying shuffles :V
   // sum_l = { l3, l2, l1, l0 }
   // sum_r = { r3, r2, r1, r0 }
   __m128 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)),
                           _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
   // sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
   // sum = { R1, R0, L1, L0 }

   sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
   // sum = { R1, R1, L1, L1 } + { R1, R0, L1, L0 }
   // sum = { X,  R,  X,  L }

   // Store L.
   _mm_store_ss(out_buffer + 0, sum);
   // movehl { X, R, X, L } == { X, R, X, R }
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
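/*
 * For reference, a minimal scalar sketch of what process_sinc computes,
 * assuming the same TAPS constant and table layout as above; the function
 * name is illustrative, not part of the original resampler.
 */
static void process_sinc_scalar_ref(rarch_resampler_t *resamp, float *out_buffer)
{
   float sum_l = 0.0f, sum_r = 0.0f;
   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;
   unsigned phase = resamp->time >> PHASES_SHIFT;
   float delta = (float)((resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK);
   const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
   const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];

   for (unsigned i = 0; i < TAPS; i++)
   {
      float sinc = phase_table[i] + delta_table[i] * delta;
      sum_l += buffer_l[i] * sinc;
      sum_r += buffer_r[i] * sinc;
   }
   out_buffer[0] = sum_l; /* L */
   out_buffer[1] = sum_r; /* R */
}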
static void spline_n_4(int i, float t, float *knot, float *splineVal)
{
	knot += i + 1;

#ifdef _M_SSE
	const __m128 knot012 = _mm_loadu_ps(&knot[0]);
	const __m128 knot345 = _mm_loadu_ps(&knot[3]);
	const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
	const __m128 f30_41_52 = _mm_div_ps(t012, _mm_sub_ps(knot345, knot012));

	const __m128 knot343 = _mm_shuffle_ps(knot345, knot345, _MM_SHUFFLE(3, 0, 1, 0));
	const __m128 knot122 = _mm_shuffle_ps(knot012, knot012, _MM_SHUFFLE(3, 2, 2, 1));
	const __m128 t122 = _mm_shuffle_ps(t012, t012, _MM_SHUFFLE(3, 2, 2, 1));
	const __m128 f31_42_32 = _mm_div_ps(t122, _mm_sub_ps(knot343, knot122));

	// It's still faster to use SSE, even with this.
	float MEMORY_ALIGNED16(ff30_41_52[4]);
	float MEMORY_ALIGNED16(ff31_42_32[4]);
	_mm_store_ps(ff30_41_52, f30_41_52);
	_mm_store_ps(ff31_42_32, f31_42_32);

	const float &f30 = ff30_41_52[0];
	const float &f41 = ff30_41_52[1];
	const float &f52 = ff30_41_52[2];
	const float &f31 = ff31_42_32[0];
	const float &f42 = ff31_42_32[1];
	const float &f32 = ff31_42_32[2];
#else
	// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
	float t0 = (t - knot[0]);
	float t1 = (t - knot[1]);
	float t2 = (t - knot[2]);
	// TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
	float f30 = t0 / (knot[3] - knot[0]);
	float f41 = t1 / (knot[4] - knot[1]);
	float f52 = t2 / (knot[5] - knot[2]);
	float f31 = t1 / (knot[3] - knot[1]);
	float f42 = t2 / (knot[4] - knot[2]);
	float f32 = t2 / (knot[3] - knot[2]);
#endif

	float a = (1 - f30) * (1 - f31);
	float b = (f31 * f41);
	float c = (1 - f41) * (1 - f42);
	float d = (f42 * f52);

	splineVal[0] = a - (a * f32);
	splineVal[1] = 1 - a - b + ((a + b + c - 1) * f32);
	splineVal[2] = b + ((1 - b - c - d) * f32);
	splineVal[3] = d * f32;
}
void audio_mix_volume_SSE2(float *out, const float *in, float vol, size_t samples)
{
   size_t i;
   __m128 volume = _mm_set1_ps(vol);

   // Mix 16 samples (four vectors) per iteration: out += in * vol.
   for (i = 0; i + 16 <= samples; i += 16, out += 16, in += 16)
   {
      unsigned j;
      __m128 input[4];
      __m128 additive[4];

      input[0] = _mm_loadu_ps(out +  0);
      input[1] = _mm_loadu_ps(out +  4);
      input[2] = _mm_loadu_ps(out +  8);
      input[3] = _mm_loadu_ps(out + 12);

      additive[0] = _mm_mul_ps(volume, _mm_loadu_ps(in +  0));
      additive[1] = _mm_mul_ps(volume, _mm_loadu_ps(in +  4));
      additive[2] = _mm_mul_ps(volume, _mm_loadu_ps(in +  8));
      additive[3] = _mm_mul_ps(volume, _mm_loadu_ps(in + 12));

      for (j = 0; j < 4; j++)
         _mm_storeu_ps(out + 4 * j, _mm_add_ps(input[j], additive[j]));
   }

   // out and in were advanced inside the loop, so the C fallback only
   // handles the remaining samples.
   audio_mix_volume_C(out, in, vol, samples - i);
}
void sse_rgb2gray(float* ra, float* ga, float* ba, float* gray)
{
    // Luma weights: gray = 0.3*R + 0.59*G + 0.11*B.
    __m128 c1 = _mm_set1_ps(0.3f);
    __m128 c2 = _mm_set1_ps(0.59f);
    __m128 c3 = _mm_set1_ps(0.11f);

    // N is assumed to be a multiple of 4.
    for (int i = 0; i < N; i += 4) {
        __m128 a = _mm_loadu_ps(ra + i);
        __m128 b = _mm_loadu_ps(ga + i);
        __m128 c = _mm_loadu_ps(ba + i);
        __m128 ab = _mm_add_ps(_mm_mul_ps(c1, a), _mm_mul_ps(c2, b));
        __m128 out = _mm_add_ps(ab, _mm_mul_ps(c3, c));
        _mm_storeu_ps(gray + i, out);
    }
}
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 12;
	__m128 sum0, sum1, sum2;

	(void) lag;
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();

	for (i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2;
		d0 = _mm_loadu_ps(data + i);
		d1 = _mm_loadu_ps(data + i + 4);
		d2 = _mm_loadu_ps(data + i + 8);
		d = d0;
		d = _mm_shuffle_ps(d, d, 0); /* broadcast data[i] */
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
	}

	{	/* Tail: feed the last samples in, shifting the lag windows. */
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		limit++;
		if (limit < 0)
			limit = 0;
		for (i = data_len - 1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data + i);
			d = _mm_shuffle_ps(d, d, 0);
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2, 1, 0, 3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2, 1, 0, 3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2, 1, 0, 3));
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
	_mm_storeu_ps(autoc + 4, sum1);
	_mm_storeu_ps(autoc + 8, sum2);
}
// Inline matrix multiplication is much faster than a function call;
// we should only move it out of line if we need to save code size.
eMatrix4x4 & operator *= (const eMatrix4x4 &mtx)
{
#ifdef eUSE_SSE
    const __m128 in10 = _mm_loadu_ps(&mtx.m11);
    const __m128 in11 = _mm_loadu_ps(&mtx.m21);
    const __m128 in12 = _mm_loadu_ps(&mtx.m31);
    const __m128 in13 = _mm_loadu_ps(&mtx.m41);

    for (eU32 i = 0; i < 16; i += 4)
    {
        // Broadcast each element of this row, then accumulate the
        // element-times-row products.
        const __m128 in2 = _mm_loadu_ps(&m[i]);
        const __m128 e0 = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 e1 = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
        const __m128 e2 = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
        const __m128 e3 = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));

        const __m128 m0 = _mm_mul_ps(in10, e0);
        const __m128 m1 = _mm_mul_ps(in11, e1);
        const __m128 m2 = _mm_mul_ps(in12, e2);
        const __m128 m3 = _mm_mul_ps(in13, e3);

        const __m128 a0 = _mm_add_ps(m0, m1);
        const __m128 a1 = _mm_add_ps(m2, m3);
        const __m128 a2 = _mm_add_ps(a0, a1);

        _mm_storeu_ps(&this->m[i], a2);
    }
#else
    *this = eMatrix4x4(
        m11*mtx.m11+m12*mtx.m21+m13*mtx.m31+m14*mtx.m41, m11*mtx.m12+m12*mtx.m22+m13*mtx.m32+m14*mtx.m42,
        m11*mtx.m13+m12*mtx.m23+m13*mtx.m33+m14*mtx.m43, m11*mtx.m14+m12*mtx.m24+m13*mtx.m34+m14*mtx.m44,
        m21*mtx.m11+m22*mtx.m21+m23*mtx.m31+m24*mtx.m41, m21*mtx.m12+m22*mtx.m22+m23*mtx.m32+m24*mtx.m42,
        m21*mtx.m13+m22*mtx.m23+m23*mtx.m33+m24*mtx.m43, m21*mtx.m14+m22*mtx.m24+m23*mtx.m34+m24*mtx.m44,
        m31*mtx.m11+m32*mtx.m21+m33*mtx.m31+m34*mtx.m41, m31*mtx.m12+m32*mtx.m22+m33*mtx.m32+m34*mtx.m42,
        m31*mtx.m13+m32*mtx.m23+m33*mtx.m33+m34*mtx.m43, m31*mtx.m14+m32*mtx.m24+m33*mtx.m34+m34*mtx.m44,
        m41*mtx.m11+m42*mtx.m21+m43*mtx.m31+m44*mtx.m41, m41*mtx.m12+m42*mtx.m22+m43*mtx.m32+m44*mtx.m42,
        m41*mtx.m13+m42*mtx.m23+m43*mtx.m33+m44*mtx.m43, m41*mtx.m14+m42*mtx.m24+m43*mtx.m34+m44*mtx.m44);
#endif
    return *this;
}
// Requires FMA (compile with -mfma or equivalent).
__attribute__((noinline))
float dot128fma(float *x1, float *x2, size_t len)
{
    assert(len % 4 == 0);
    __m128 sum = _mm_setzero_ps();
    if (len > 3) {
        size_t limit = len - 3;
        for (size_t i = 0; i < limit; i += 4) {
            __m128 v1 = _mm_loadu_ps(x1 + i);
            __m128 v2 = _mm_loadu_ps(x2 + i);
            // sum += v1 * v2, fused multiply-add.
            sum = _mm_fmadd_ps(v1, v2, sum);
        }
    }
    // Horizontal sum of the four lanes.
    float buffer[4];
    _mm_storeu_ps(buffer, sum);
    return buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
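/*
 * Hypothetical usage sketch (not from the original source); lengths must
 * be multiples of four per the assert in dot128fma.
 */
#include <stdio.h>

int main(void)
{
    float a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
    printf("%f\n", dot128fma(a, b, 8)); /* 1*8 + 2*7 + ... + 8*1 = 120 */
    return 0;
}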
/* Requires F16C for _mm_cvtps_ph. */
static inline long
conv_yF_yHalf (const float *src, uint16_t *dst, long samples)
{
  const __v4sf *s_vec;
  uint64_t     *d_vec;
  long          n = samples;

  s_vec = (const __v4sf *) src;
  d_vec = (uint64_t *) dst;

  /* Convert four floats to four half floats per iteration. */
  while (n >= 4)
    {
      __m128  in_val  = _mm_loadu_ps ((float *) s_vec++);
      __m128i out_val = _mm_cvtps_ph (in_val,
                                      _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      _mm_storel_epi64 ((__m128i *) d_vec++, out_val);
      n -= 4;
    }

  src = (const float *) s_vec;
  dst = (uint16_t *) d_vec;

  /* Scalar tail. */
  while (n)
    {
      __m128  in_val  = _mm_load_ss (src++);
      __m128i out_val = _mm_cvtps_ph (in_val,
                                      _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      *dst++ = _mm_extract_epi16 (out_val, 0);
      n -= 1;
    }
  return samples;
}
SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(y);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_sub_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] - y;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] - y;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] - y;
    }
}
SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n)
{
    int i;
    /* 0x80000000 is the IEEE-754 sign bit; XORing with it negates a float. */
    static const uint32_t mask = 0x80000000;
    static const float *fmask = (float *) &mask;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(*fmask);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_xor_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = -x[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = -x[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = -x[n - 1];
    }
}
void AngleQuaternion(vec_t *angles, vec_t *quaternion)
{
    static const ALIGN16_BEG int ps_signmask[4] ALIGN16_END =
        { 0x80000000, 0, 0x80000000, 0 };

    __m128 a = _mm_loadu_ps(angles);
    a = _mm_mul_ps(a, _mm_load_ps(_ps_0p5)); // a *= 0.5

    __m128 s, c;
    sincos_ps(a, &s, &c);

    __m128 im1 = _mm_shuffle_ps(s, c, _MM_SHUFFLE(1, 0, 1, 0)); // im1 = { sin[0], sin[1], cos[0], cos[1] }
    __m128 im2 = _mm_shuffle_ps(c, s, _MM_SHUFFLE(2, 2, 2, 2)); // im2 = { cos[2], cos[2], sin[2], sin[2] }

    __m128 part1 = _mm_mul_ps(
        _mm_shuffle_ps(im1, im1, _MM_SHUFFLE(1, 2, 2, 0)),
        _mm_shuffle_ps(im1, im1, _MM_SHUFFLE(0, 3, 1, 3)));
    part1 = _mm_mul_ps(part1, im2);

    __m128 part2 = _mm_mul_ps(
        _mm_shuffle_ps(im1, im1, _MM_SHUFFLE(2, 1, 0, 2)),
        _mm_shuffle_ps(im1, im1, _MM_SHUFFLE(3, 0, 3, 1)));
    part2 = _mm_mul_ps(part2, _mm_shuffle_ps(im2, im2, _MM_SHUFFLE(0, 0, 2, 2)));

    // Flip the sign of lanes 0 and 2 before combining.
    __m128 signmask = _mm_load_ps((float *)ps_signmask);
    part2 = _mm_xor_ps(part2, signmask);

    __m128 res = _mm_add_ps(part1, part2);
    _mm_storeu_ps(quaternion, res);
}
static void
TEST (void)
{
  union128d s1;
  union128 u, s2;
  double source1[2] = {123.345, 67.3321};
  float e[4] = {5633.098, 93.21, 3.34, 4555.2};

  s1.x = _mm_loadu_pd (source1);
  s2.x = _mm_loadu_ps (e);
  __asm("" : "+v"(s1.x), "+v"(s2.x));
  u.x = test (s2.x, s1.x);

  e[0] = (float) source1[0];

  if (check_union128 (u, e))
#if DEBUG
    {
      printf ("sse2_test_cvtsd2ss_1; check_union128 failed\n");
      printf ("\t [%f,%f,%f,%f],[%f,%f]\n", s2.a[0], s2.a[1], s2.a[2],
	      s2.a[3], s1.a[0], s1.a[1]);
      printf ("\t -> \t[%f,%f,%f,%f]\n", u.a[0], u.a[1], u.a[2], u.a[3]);
      printf ("\texpect\t[%f,%f,%f,%f]\n", e[0], e[1], e[2], e[3]);
    }
#else
    abort ();
#endif
}
void FastResampler_FirFilter2_C2_SSE2(unsigned int channels, unsigned int filter_length,
                                      float* coef1, float* coef2, float frac,
                                      float* input, float* output)
{
	Q_UNUSED(channels);
	__m128 sum = _mm_setzero_ps();
	__m128 v_frac = _mm_set1_ps(frac);
	for(unsigned int i = 0; i < filter_length / 4; ++i) {
		// Linearly interpolate between the two coefficient tables.
		__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
		coef1 += 4; coef2 += 4;
		__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
		// Two interleaved stereo frames per vector.
		__m128 v_input1 = _mm_loadu_ps(input), v_input2 = _mm_loadu_ps(input + 4);
		input += 8;
		sum = _mm_add_ps(sum, _mm_mul_ps(v_input1, _mm_unpacklo_ps(filter_value, filter_value)));
		sum = _mm_add_ps(sum, _mm_mul_ps(v_input2, _mm_unpackhi_ps(filter_value, filter_value)));
	}
	// Fold the upper stereo pair onto the lower one and store L and R together.
	__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0xee));
	_mm_store_sd((double*) output, _mm_castps_pd(sum2));
}
static void
clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
{
  __m128 xmm1;
  float max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long) dest & 15) && (n > 0); n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set_ps1 (max);
  for (; n >= 4; n -= 4) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps (src1);
    xmm0 = _mm_min_ps (xmm0, xmm1);
    _mm_store_ps (dest, xmm0); /* dest is 16-byte aligned by the prologue */
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
void conv_Float1ToFloat2(void* dst, const void* s, s32 numSamples)
{
    LSfloat* d = reinterpret_cast<LSfloat*>(dst);
    const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

    s32 num = numSamples >> 2; // process four floats at a time
    s32 offset = num << 2;
    s32 rem = numSamples - offset;

    const LSfloat* p = src;
    LSfloat* q = d;
    for (s32 i = 0; i < num; ++i) {
        // Duplicate each mono sample into both stereo channels.
        __m128 f32_0 = _mm_loadu_ps(p);
        __m128 f32_1 = _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(1, 1, 0, 0));
        __m128 f32_2 = _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(3, 3, 2, 2));
        _mm_storeu_ps((q + 0), f32_1);
        _mm_storeu_ps((q + 4), f32_2);
        p += 4;
        q += 8;
    }
    for (s32 i = 0; i < rem; ++i) {
        s32 j = i << 1;
        q[j + 0] = q[j + 1] = p[i];
    }
}
static void GF_FUNC_ALIGN VS_CC
proc_horizontal(float *srcp, int radius, int length, int width, float *kernel,
                float *dstp)
{
    // Mirror the edges so the kernel can read past both ends of the row.
    for (int i = 1; i <= radius; i++) {
        srcp[-i] = srcp[i];
        srcp[width - 1 + i] = srcp[width - 1 - i];
    }

    // Broadcast each kernel tap across a four-float lane.
    GF_ALIGN float ar_kernel[17][4];
    for (int i = 0; i < length; i++) {
        for (int j = 0; j < 4; j++) {
            ar_kernel[i][j] = kernel[i];
        }
    }

    for (int x = 0; x < width; x += 4) {
        __m128 sum = _mm_setzero_ps();
        for (int i = -radius; i <= radius; i++) {
            __m128 k = _mm_load_ps(ar_kernel[i + radius]);
            __m128 xmm0 = _mm_loadu_ps(srcp + x + i);
            sum = _mm_add_ps(sum, _mm_mul_ps(xmm0, k));
        }
        _mm_store_ps(dstp + x, sum);
    }
}
void conv_Float2ToFloat1(void* dst, const void* s, s32 numSamples)
{
    LSfloat* d = reinterpret_cast<LSfloat*>(dst);
    const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

    s32 num = numSamples >> 2; // process four floats at a time
    s32 offset = num << 2;
    s32 rem = numSamples - offset;

    __m128 coff = _mm_set1_ps(0.5f);
    const LSfloat* p = src;
    LSfloat* q = d;
    for (s32 i = 0; i < num; ++i) {
        // Average each L/R pair down to one mono sample.
        // f32_0 = {L0, R0, L1, R1}; swap within each pair so the add
        // yields the pair sums in every lane. (The original mask
        // _MM_SHUFFLE(2, 3, 1, 0) left lanes 0 and 1 unswapped.)
        __m128 f32_0 = _mm_loadu_ps(p);
        __m128 f32_1 = _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(2, 3, 0, 1));
        __m128 f32_2 = _mm_mul_ps(_mm_add_ps(f32_0, f32_1), coff);
        // Pick the two distinct averages and store them as a pair.
        __m128 f32_3 = _mm_shuffle_ps(f32_2, f32_2, _MM_SHUFFLE(2, 0, 2, 0));
        _mm_storel_pi((__m64*)q, f32_3);
        p += 4;
        q += 2;
    }
    for (s32 i = 0; i < rem; ++i) {
        s32 j = i << 1;
        q[i] = 0.5f * (p[j + 0] + p[j + 1]);
    }
}
void conv_Float1ToShort1(void* dst, const void* s, s32 numSamples)
{
    LSshort* d = reinterpret_cast<LSshort*>(dst);
    const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

    s32 num = numSamples >> 2; // process four floats at a time
    // The 16-byte store below writes eight shorts, so drop the last
    // block to avoid writing past the end of the destination.
    if (0 < num) {
        --num;
    }
    s32 offset = num << 2;
    s32 rem = numSamples - offset;

    const __m128 fcoff = _mm_set1_ps(32768.0f);
    const LSfloat* p = src;
    LSshort* q = d;
    for (s32 i = 0; i < num; ++i) {
        __m128 f32_0 = _mm_mul_ps(_mm_loadu_ps(p), fcoff);
        __m128i s32_0 = _mm_cvtps_epi32(f32_0);
        __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0); // saturating pack
        _mm_storeu_si128((__m128i*)q, s16_0);
        p += 4;
        q += 4;
    }
    for (s32 i = 0; i < rem; ++i) {
        q[i] = toShort(p[i]);
    }
}
static inline void AccumulateWeighted(Vec3f &out, const Vec3Packedf &in, const Vec4f &w)
{
#ifdef _M_SSE
	out.vec = _mm_add_ps(out.vec, _mm_mul_ps(_mm_loadu_ps(in.AsArray()), w.vec));
#else
	out += in * w.x;
#endif
}
float sumSSE(float* floats, int size)
{
    int i, q, r;
    float f[4] = {0};

    // Small arrays: plain scalar sum. (The original had this guard
    // commented out, an uninitialized accumulator, and read floats[size].)
    if (size < 10) {
        float ax = 0;
        while (size) {
            --size;
            ax += floats[size];
        }
        return ax;
    }

    q = 4 * (size / 4); // whole blocks
    r = size - q;       // remainder

    // Load and sum the first 8. Unaligned loads: the alignment of
    // 'floats' is not guaranteed.
    __m128 x = _mm_loadu_ps(floats);
    __m128 y = _mm_loadu_ps(floats + 4);
    x = _mm_add_ps(x, y);

    // Sum remaining whole blocks one at a time.
    for (i = 8; i < q; i += 4) {
        y = _mm_loadu_ps(floats + i);
        x = _mm_add_ps(x, y);
    }

    // If we have a remainder, add it to our sum (f[0] stays zero).
    for (; r; --r)
        f[r] = floats[size - r];
    y = _mm_loadu_ps(f);
    x = _mm_add_ps(x, y);

    // Move back into a float array and return the horizontal sum.
    _mm_storeu_ps(f, x);
    return f[0] + f[1] + f[2] + f[3];
}
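/*
 * A quick sanity check for sumSSE (hypothetical driver, not from the
 * original source): 12 elements exercises the SIMD path (size >= 10).
 */
#include <stdio.h>

int main(void)
{
    float v[12];
    float expect = 0;
    for (int i = 0; i < 12; i++) {
        v[i] = (float)(i + 1);
        expect += v[i];
    }
    printf("sumSSE=%f expect=%f\n", sumSSE(v, 12), expect); /* both 78 */
    return 0;
}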
void *calculate_row(void *thread_id)
{
    long id = (long)thread_id;
    int i, j, ii, jj, ii_limit, jj_limit, l, start_i, end_i, jj_limit_minus_3;
    __m128 a_line, b_line, r_line;
    const float *A = matrix_a, *B = matrix_b;
    const int n = matrix_n, m = matrix_m, k = matrix_k;
    const float *row_in_B; /* const: B is only read */
    float *row_in_C, ii_l_in_A;

    start_i = row_per_thread * id;
    end_i = row_per_thread * (id + 1);
    if (end_i >= m)
        end_i = m;

    for (i = start_i; i < end_i; i += TILE_SIZE) /* i: row block index in C */
    {
        ii_limit = min(end_i, i + TILE_SIZE);
        for (j = 0; j < n; j += TILE_SIZE2) /* j: col block index in C */
        {
            jj_limit = min(n, j + TILE_SIZE2);
            jj_limit_minus_3 = jj_limit - 3;
            for (l = 0; l < k; ++l)
            {
                row_in_B = B + l * n;
                for (ii = i; ii < ii_limit; ++ii)
                {
                    ii_l_in_A = A[ii * k + l];
                    a_line = _mm_set1_ps(ii_l_in_A); /* broadcast A[ii][l] */
                    row_in_C = matrix_c + ii * n;
                    /* C[ii][jj..jj+3] += A[ii][l] * B[l][jj..jj+3] */
                    for (jj = j; jj < jj_limit_minus_3; jj += 4)
                    {
                        b_line = _mm_loadu_ps(row_in_B + jj);
                        r_line = _mm_loadu_ps(row_in_C + jj);
                        _mm_storeu_ps(row_in_C + jj,
                                      _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line));
                    }
                    for (; jj < jj_limit; ++jj)
                        *(row_in_C + jj) += ii_l_in_A * *(row_in_B + jj);
                }
            }
        }
    }
    pthread_exit(NULL);
}
/* the fast arctan function adapted from OpenCV */
static void _ccv_atan2(float* x, float* y, float* angle, float* mag, int len)
{
	int i = 0;
	float scale = (float)(180.0 / CCV_PI);
#ifdef HAVE_SSE2
#ifndef _WIN32
	union { int i; float fl; } iabsmask;
	iabsmask.i = 0x7fffffff;
	__m128 eps = _mm_set1_ps((float)1e-6), absmask = _mm_set1_ps(iabsmask.fl);
	__m128 _90 = _mm_set1_ps((float)(3.141592654 * 0.5)), _180 = _mm_set1_ps((float)3.141592654), _360 = _mm_set1_ps((float)(3.141592654 * 2));
	__m128 zero = _mm_setzero_ps(), _0_28 = _mm_set1_ps(0.28f), scale4 = _mm_set1_ps(scale);
	for (; i <= len - 4; i += 4)
	{
		__m128 x4 = _mm_loadu_ps(x + i), y4 = _mm_loadu_ps(y + i);
		__m128 xq4 = _mm_mul_ps(x4, x4), yq4 = _mm_mul_ps(y4, y4);
		__m128 xly = _mm_cmplt_ps(xq4, yq4);
		__m128 z4 = _mm_div_ps(_mm_mul_ps(x4, y4), _mm_add_ps(_mm_add_ps(_mm_max_ps(xq4, yq4), _mm_mul_ps(_mm_min_ps(xq4, yq4), _0_28)), eps));
		// a4 <- x < y ? 90 : 0;
		__m128 a4 = _mm_and_ps(xly, _90);
		// a4 <- (y < 0 ? 360 - a4 : a4) == ((x < y ? y < 0 ? 270 : 90) : (y < 0 ? 360 : 0))
		__m128 mask = _mm_cmplt_ps(y4, zero);
		a4 = _mm_or_ps(_mm_and_ps(_mm_sub_ps(_360, a4), mask), _mm_andnot_ps(mask, a4));
		// a4 <- (x < 0 && !(x < y) ? 180 : a4)
		mask = _mm_andnot_ps(xly, _mm_cmplt_ps(x4, zero));
		a4 = _mm_or_ps(_mm_and_ps(_180, mask), _mm_andnot_ps(mask, a4));
		// a4 <- (x < y ? a4 - z4 : a4 + z4)
		a4 = _mm_mul_ps(_mm_add_ps(_mm_xor_ps(z4, _mm_andnot_ps(absmask, xly)), a4), scale4);
		__m128 m4 = _mm_sqrt_ps(_mm_add_ps(xq4, yq4));
		_mm_storeu_ps(angle + i, a4);
		_mm_storeu_ps(mag + i, m4);
	}
#endif
#endif
	// Scalar tail (and non-SSE fallback).
	for (; i < len; i++)
	{
		float xf = x[i], yf = y[i];
		float a, x2 = xf * xf, y2 = yf * yf;
		if (y2 <= x2)
			a = xf * yf / (x2 + 0.28f * y2 + (float)1e-6) + (float)(xf < 0 ? CCV_PI : yf >= 0 ? 0 : CCV_PI * 2);
		else
			a = (float)(yf >= 0 ? CCV_PI * 0.5 : CCV_PI * 1.5) - xf * yf / (y2 + 0.28f * x2 + (float)1e-6);
		angle[i] = a * scale;
		mag[i] = sqrtf(x2 + y2);
	}
}
void matrix_mult_simd(float A[SIZE][SIZE], float B[SIZE][SIZE], float ans[SIZE][SIZE])
{
    float temp[4] = {0};
    __m128 acc, a, b;

    // Each output is the dot product of row i of A with row j of B,
    // i.e. ans = A * B^T. SIZE is assumed to be a multiple of 4.
    for (int i = 0; i < SIZE; i++) {
        for (int j = 0; j < SIZE; j++) {
            acc = _mm_setzero_ps();
            for (int k = 0; k < (SIZE - 3); k += 4) {
                a = _mm_loadu_ps(&A[i][k]); // was &A[j][k], which made the result independent of i
                b = _mm_loadu_ps(&B[j][k]);
                acc = _mm_add_ps(acc, _mm_mul_ps(a, b));
            }
            // Horizontal sum of the accumulator.
            _mm_storeu_ps(temp, acc);
            ans[i][j] = temp[0] + temp[1] + temp[2] + temp[3];
        }
    }
}
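/*
 * If a conventional product ans = A * B is wanted instead, one option is
 * to transpose B once and reuse the kernel above. A minimal sketch (the
 * wrapper name is illustrative, not from the original source):
 */
void matrix_mult_simd_standard(float A[SIZE][SIZE], float B[SIZE][SIZE], float ans[SIZE][SIZE])
{
    static float BT[SIZE][SIZE]; /* B transposed; static keeps it off the stack */
    for (int r = 0; r < SIZE; r++)
        for (int c = 0; c < SIZE; c++)
            BT[c][r] = B[r][c];
    /* dot(row i of A, row j of BT) == sum over k of A[i][k] * B[k][j] */
    matrix_mult_simd(A, BT, ans);
}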
static void
sse3_test_movsldup_reg (float *i1, float *r)
{
  __m128 t1 = _mm_loadu_ps (i1);
  __m128 t2 = _mm_moveldup_ps (t1); /* duplicate the even-index elements */
  _mm_storeu_ps (r, t2);
}
void LoadRenderParams(float inScaleVal,
                      float outScaleVal,
                      const RenderParams & renderParams,
                      __m128 & inScale,
                      __m128 & outScale,
                      __m128 & slope,
                      __m128 & offset,
                      __m128 & power,
                      __m128 & saturation)
{
    inScale = _mm_set1_ps(inScaleVal);
    outScale = _mm_set1_ps(outScaleVal);
    slope = _mm_loadu_ps(renderParams.getSlope());
    offset = _mm_loadu_ps(renderParams.getOffset());
    power = _mm_loadu_ps(renderParams.getPower());
    saturation = _mm_set1_ps(renderParams.getSaturation());
}
void _Run(OutputPixelType aaOutput[ciHeight][ciWidth],
          InputPixelType_1 aaInput1[ciHeight][ciWidth],
          InputPixelType_2 aaInput2[ciHeight][ciWidth])
{
    for (int iY = 0; iY < ciHeight; ++iY)
    {
        OutputPixelType *pOutput = aaOutput[iY];
        InputPixelType_1 *pInput1 = aaInput1[iY];
        InputPixelType_2 *pInput2 = aaInput2[iY];

        // Add the two input rows element-wise, VectorWidth floats at a time.
        for (int iX = 0; iX < ciWidth; iX += VectorWidth)
        {
            __m128 mmIn1 = _mm_loadu_ps(pInput1 + iX);
            __m128 mmIn2 = _mm_loadu_ps(pInput2 + iX);
            _mm_storeu_ps(pOutput + iX, _mm_add_ps(mmIn1, mmIn2));
        }
    }
}
inline __m128 sse_load(float *i)
{
#ifdef CODE_ALIGNED_SIMD_INSTRUCTIONS
    return _mm_load_ps(i);  // requires 16-byte aligned input
#else
    return _mm_loadu_ps(i); // no alignment requirement
#endif
}