template <>
vector3d<float> vector3d<float>::transform(const matrix3d<float> &M) const
{
    float _x, _y, _z;
    __m128 v1, v2, rx, ry, rz;

    /* Row dot products via SSE4.1 _mm_dp_ps: mask 0xF1 multiplies all four
       lanes and writes the result to lane 0. coefficients and M.elements
       must be 16-byte aligned for _mm_load_ps. */
    v1 = _mm_load_ps(&coefficients[0]);
    v2 = _mm_load_ps(&M.elements[0]);
    rx = _mm_dp_ps(v1, v2, 0xF1);

    v1 = _mm_load_ps(&coefficients[0]);
    v2 = _mm_load_ps(&M.elements[4]);
    ry = _mm_dp_ps(v1, v2, 0xF1);

    v1 = _mm_load_ps(&coefficients[0]);
    v2 = _mm_load_ps(&M.elements[8]);
    rz = _mm_dp_ps(v1, v2, 0xF1);

    _mm_store_ss(&_x, rx);
    _mm_store_ss(&_y, ry);
    _mm_store_ss(&_z, rz);

    vector3d<float> q;
    q.set(_x, _y, _z);
    return q;
}
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
   /* Scalar tail: accumulate the remaining 0-3 elements on top of the
      partial sums stored above. */
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned i;
   __m128 sum_l = _mm_setzero_ps();
   __m128 sum_r = _mm_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned taps  = resamp->taps;
   unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   __m128 delta = _mm_set1_ps((float)(resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (i = 0; i < taps; i += 4)
   {
      __m128 buf_l = _mm_loadu_ps(buffer_l + i);
      __m128 buf_r = _mm_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
      __m128 deltas = _mm_load_ps(delta_table + i);
      __m128 sinc   = _mm_add_ps(_mm_load_ps(phase_table + i),
                                 _mm_mul_ps(deltas, delta));
#else
      __m128 sinc = _mm_load_ps(phase_table + i);
#endif
      sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
      sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
   }

   // Them annoying shuffles :V
   // sum_l = { l3, l2, l1, l0 }
   // sum_r = { r3, r2, r1, r0 }

   __m128 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)),
                           _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));

   // sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
   // sum = { R1, R0, L1, L0 }

   sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);

   // sum = { R1, R1, L1, L1 } + { R1, R0, L1, L0 }
   // sum = { X,  R,  X,  L }

   // Store L
   _mm_store_ss(out_buffer + 0, sum);

   // movehl { X, R, X, L } == { X, R, X, R }
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
static void process_sinc(rarch_resampler_t *resamp, float *out_buffer)
{
   __m128 sum_l = _mm_setzero_ps();
   __m128 sum_r = _mm_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned phase = resamp->time >> PHASES_SHIFT;
   unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
   __m128 delta_f = _mm_set1_ps(delta);

   const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
   const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];

   for (unsigned i = 0; i < TAPS; i += 4)
   {
      __m128 buf_l  = _mm_loadu_ps(buffer_l + i);
      __m128 buf_r  = _mm_loadu_ps(buffer_r + i);

      __m128 phases = _mm_load_ps(phase_table + i);
      __m128 deltas = _mm_load_ps(delta_table + i);
      __m128 sinc   = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));

      sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
      sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
   }

   // Them annoying shuffles :V
   // sum_l = { l3, l2, l1, l0 }
   // sum_r = { r3, r2, r1, r0 }

   __m128 sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)),
                           _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));

   // sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
   // sum = { R1, R0, L1, L0 }

   sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);

   // sum = { R1, R1, L1, L1 } + { R1, R0, L1, L0 }
   // sum = { X,  R,  X,  L }

   // Store L
   _mm_store_ss(out_buffer + 0, sum);

   // movehl { X, R, X, L } == { X, R, X, R }
   _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
   unsigned i;
   __m256 sum_l = _mm256_setzero_ps();
   __m256 sum_r = _mm256_setzero_ps();

   const float *buffer_l = resamp->buffer_l + resamp->ptr;
   const float *buffer_r = resamp->buffer_r + resamp->ptr;

   unsigned taps  = resamp->taps;
   unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
   const float *phase_table = resamp->phase_table + phase * taps * 2;
   const float *delta_table = phase_table + taps;
   __m256 delta = _mm256_set1_ps((float)(resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
   const float *phase_table = resamp->phase_table + phase * taps;
#endif

   for (i = 0; i < taps; i += 8)
   {
      __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
      __m256 buf_r = _mm256_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
      __m256 deltas = _mm256_load_ps(delta_table + i);
      __m256 sinc   = _mm256_add_ps(_mm256_load_ps(phase_table + i),
                                    _mm256_mul_ps(deltas, delta));
#else
      __m256 sinc = _mm256_load_ps(phase_table + i);
#endif
      sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
      sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
   }

   /* hadd on AVX is weird, and acts on low-lanes
    * and high-lanes separately. */
   __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
   __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
   res_l = _mm256_hadd_ps(res_l, res_l);
   res_r = _mm256_hadd_ps(res_r, res_r);
   res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
   res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

   /* This is optimized to mov %xmmN, [mem].
    * There doesn't seem to be any _mm256_store_ss intrinsic. */
   _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
   _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
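/* Hedged aside (not from the source above): a minimal stand-alone sketch of
 * the lane-wise hadd behavior the comment refers to, assuming <immintrin.h>.
 * _mm256_hadd_ps adds pairs within the low and high 128-bit lanes
 * independently, so two hadds leave one partial total per lane; the
 * cross-lane permute folds the two halves together. */
static inline float avx_hsum_ps(__m256 v)
{
   v = _mm256_hadd_ps(v, v); /* pairwise sums within each 128-bit lane */
   v = _mm256_hadd_ps(v, v); /* per-lane totals, broadcast within lane */
   v = _mm256_add_ps(v, _mm256_permute2f128_ps(v, v, 1)); /* fold lanes */
   float r;
   _mm_store_ss(&r, _mm256_castps256_ps128(v));
   return r;
}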
F32 round(F32 val)
{
#ifdef USE_SSE4
    __m128 t = _mm_set_ss(val);
    t = _mm_round_ss(t, t, _MM_FROUND_TO_NEAREST_INT);
    _mm_store_ss(&val, t);
#elif defined(USE_SSE2)
    /* Note: _mm_cvtss_si32 uses the current rounding mode (round-to-
       nearest-even by default), so halfway cases differ from the
       floor(val + 0.5f) fallback below, e.g. 0.5f converts to 0. */
    __m128 t = _mm_set_ss(val);
    U32 i = (U32)_mm_cvtss_si32(t);
    t = _mm_cvtsi32_ss(t, (int32)i);
    _mm_store_ss(&val, t);
#else
    val = (F32)core_floor(val + 0.5f);
#endif
    return val;
}
int intr_frustum_box(const float* frustum, const float* box_min, const float* box_max)
{
    /* box_min/box_max are 4-float vectors; the 4th component is presumably
       1.0f so that the plane's d term (plane[3]) enters the dot product. */
    const __m128 min = _mm_load_ps(box_min);
    const __m128 max = _mm_load_ps(box_max);

    for (int i = 0; i < 6; i++)
    {
        const __m128 plane = _mm_load_ps(frustum + 4 * i);

        /* Per component, pick max where the plane coefficient is negative,
           min otherwise: the box corner minimizing the signed distance. */
        const __m128 mask = _mm_cmplt_ps(plane, _mm_setzero_ps());
        const __m128 n    = _mm_or_ps(_mm_and_ps(mask, max),
                                      _mm_andnot_ps(mask, min));

        /* Horizontal sum of n * plane into lane 0. */
        const __m128 d  = _mm_mul_ps(n, plane);
        const __m128 d0 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(2, 1, 0, 3));
        const __m128 d1 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(1, 0, 3, 2));
        const __m128 d2 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(0, 3, 2, 1));
        const __m128 dot = _mm_add_ss(_mm_add_ss(d0, d), _mm_add_ss(d1, d2));

        /* If even the closest corner is on the positive side of this
           plane, the box is entirely outside the frustum. */
        const __m128 ret = _mm_cmpgt_ss(dot, _mm_setzero_ps());
        float reti;
        _mm_store_ss(&reti, ret);
        if (reti != 0)
            return 0;
    }
    return 1;
}
static inline long
conv_yHalf_yF (const uint16_t *src, float *dst, long samples)
{
  const uint64_t *s_vec;
  __v4sf         *d_vec;

  long n = samples;

  s_vec = (const uint64_t *)src;
  d_vec = (__v4sf *)dst;

  /* Convert four half-float values per iteration: load 4x16 bits into the
     low qword of an XMM register and widen with F16C's _mm_cvtph_ps. */
  while (n >= 4)
    {
      __m128i in_val = _mm_insert_epi64((__m128i)_mm_setzero_ps(), *s_vec++, 0);
      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
      _mm_storeu_ps((float *)d_vec++, out_val);
      n -= 4;
    }

  src = (const uint16_t *)s_vec;
  dst = (float *)d_vec;

  /* Scalar tail: one half-float at a time. */
  while (n)
    {
      __m128i in_val = _mm_insert_epi16((__m128i)_mm_setzero_ps(), *src++, 0);
      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
      _mm_store_ss(dst++, out_val);
      n -= 1;
    }

  return samples;
}
void VertexFuncSmoothSphereMapAOS(const VERTEXINPUT &in, VERTEXOUTPUT &out)
{
    const FLOAT *pWorldMat = (FLOAT *)in.pConstants;
    const FLOAT *pLightPos = (FLOAT *)in.pConstants + CONST_OFFSET_LIGHT_POSITION;

    Mat4Vec3Multiply(out.pVertices, pWorldMat, (FLOAT *)in.pAttribs);

    // Transform normals in attrib buffer by world matrix.
    OSALIGNLINE(FLOAT) normals[4];
    Mat4Vec3Multiply(&normals[0], pWorldMat, &in.pAttribs[SHADER_INPUT_SLOT_NORMAL]);

    // Normalize normals
    __m128 normal   = Vec3Normalize(&normals[0]);
    __m128 lightPos = _mm_load_ps(pLightPos);
    __m128 lightDir = _mm_load_ps(in.pAttribs);

    // Compute a light direction vector for each vertex.
    // lightDir = Normalize(lightPos - vertexPos)
    lightDir = _mm_sub_ps(lightPos, lightDir);
    lightDir = Vec3Normalize(lightDir);

    __m128 ndotl0 = Vec3Diffuse(normal, lightDir);

    _mm_store_ps(&out.pAttribs[0], _mm_load_ps(&in.pAttribs[0]));
    _mm_store_ss(&out.pAttribs[0x2], ndotl0);
}
/* Combines unpack and accumulate */
void vector_accumulate_8bit(float *out, const char *in, int n)
{
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    float ftmp;
    int ii;
    /* Main loop, unrolled 4x (16 elements per iteration): each
       _mm_cvtpi8_ps converts the low four signed bytes of an __m64
       to packed floats. */
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in + 64, 0, 0);
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
    }
    for (; ii < n ; ii++) {
        // Cast these without intrinsics
        ftmp = (float)(*in);
        out_ = _mm_load_ss(out);
        in_ = _mm_load_ss(&ftmp);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in += 1;
        out += 1;
    }
    _mm_empty();  /* clear MMX state after using __m64 */
#else
    int i;
    for (i=0; i<n; i++) {
        out[i] += (float)in[i];
    }
#endif
}
int drid_moments(float* coords, int32_t index, int32_t* partners,
                 int32_t n_partners, double* moments)
{
    int32_t i;
    float d;
    moments_t onlinemoments;
    __m128 x, y, r, r2, s;

    moments_clear(&onlinemoments);
    x = load_float3(&coords[3 * index]);

    for (i = 0; i < n_partners; i++) {
        y  = load_float3(&coords[3 * partners[i]]);
        r  = _mm_sub_ps(x, y);   /* x - y       */
        r2 = _mm_mul_ps(r, r);   /* (x - y)**2  */

        /* horizontal add the components of d2 with
           two instructions. note: it's critical
           here that the last entry of x1 and x2 was 0
           so that d2.w = 0 */
        s = _mm_add_ps(r2, _mm_movehl_ps(r2, r2));
        s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 1));

        /* store into a regular float. I tried using _mm_rsqrt_ps, but
           it's not accurate enough to pass the tests */
        _mm_store_ss(&d, s);
        moments_push(&onlinemoments, 1.0 / sqrt((double) d));
    }

    moments[0] = moments_mean(&onlinemoments);
    moments[1] = sqrt(moments_second(&onlinemoments));
    moments[2] = cbrt(moments_third(&onlinemoments));
    return 1;
}
__inline static void _mm_add_ps_4x1(__m128 sum, float *dst)
{
    // A+B C+D
    sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)));
    // A+B+C+D A+B+C+D
    sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
    _mm_store_ss(dst, sum);
}
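/* Hedged usage sketch (hypothetical caller, not from the original source):
 * the helper above is the horizontal-sum tail of a typical dot product,
 * assuming n is a multiple of 4. */
static void dot4n(const float *a, const float *b, int n, float *dst)
{
    __m128 acc = _mm_setzero_ps();
    for (int i = 0; i < n; i += 4)
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(a + i),
                                         _mm_loadu_ps(b + i)));
    _mm_add_ps_4x1(acc, dst); /* horizontal sum, then _mm_store_ss */
}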
void ColorModelView::paintEvent(QPaintEvent *)
{
    QPainter p(this);
    auto mainBounds = mainAreaBounds();
    auto sideBounds = sideAreaBounds();

    if (mainImage_.isNull()) {
        // FIXME: support other color model?
        QImage img(256, 256, QImage::Format_RGB32);
        auto *pixels = reinterpret_cast<quint32 *>(img.bits());
        auto basecolor = QColor::fromHsv(value_.hsvHue(), 255, 255);
        auto basecolorMM = _mm_setr_epi32(basecolor.blue(), basecolor.green(), basecolor.red(), 0);
        basecolorMM = _mm_add_epi32(basecolorMM, _mm_srli_epi32(basecolorMM, 7)); // map [0, 255] to [0, 256]
        auto white = _mm_set1_epi32(256 * 255);
        auto dX = _mm_sub_epi32(basecolorMM, _mm_set1_epi32(256));
        for (int y = 0; y < 256; ++y) {
            auto brightness = _mm_set1_epi32(256 - y - (y >> 7));
            auto col = white; // [0, 256 * 255]
            for (int x = 0; x < 256; ++x) {
                auto c = _mm_mullo_epi16(_mm_srli_epi32(col, 8), brightness);
                c = _mm_srli_epi16(c, 8); // [0, 255]
                c = _mm_packs_epi32(c, c);
                c = _mm_packus_epi16(c, c);
                // Write the packed pixel as a single 32-bit store.
                _mm_store_ss(reinterpret_cast<float *>(&pixels[x + y * 256]),
                             _mm_castsi128_ps(c));
                col = _mm_add_epi32(col, dX);
            }
        }
        mainImage_ = QPixmap::fromImage(img);
    }
    // ... (remainder of paintEvent omitted in this excerpt)
}
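/* Hedged equivalent (illustration, not from the source above): the
 * _mm_store_ss through a float pointer is just a 4-byte store of the
 * packed BGRA pixel; a scalar extract expresses the same thing. */
static inline void store_pixel_u32(quint32 *dst, __m128i c)
{
    *dst = (quint32)_mm_cvtsi128_si32(c); /* low 32 bits = one pixel */
}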
F32 ceil(F32 val)
{
#ifdef USE_SSE4
    __m128 t = _mm_set_ss(val);
    t = _mm_ceil_ss(t, t);
    _mm_store_ss(&val, t);
#elif defined(USE_SSE2)
    /* Approximates ceil by rounding val + 0.5 to nearest. Beware: under
       the default round-to-nearest-even mode, exact integers round up
       (e.g. ceil(1.0f) yields 2.0f), unlike the SSE4 and fallback paths. */
    val += 0.5f;
    __m128 t = _mm_set_ss(val);
    U32 i = (U32)_mm_cvtss_si32(t);
    t = _mm_cvtsi32_ss(t, (int32)i);
    _mm_store_ss(&val, t);
#else
    val = (F32)core_ceil(val);
#endif
    return val;
}
void SubpixelMaximizer::fitUsingSSE3(float coef[FitMatrix::ROWS],
                                     const signed short data[3][3][3]) const
{
  assert(FitMatrix::PADDEDCOLS == 32);
  __m128 localFitMatrixScale = _mm_set_ss(fitMatrix.scale);
  const short* localFitMatrix = fitMatrix();

  // Load data into four SSE registers
  __m128i x[4];
  signed short* dataFlat = (signed short*) data;  // flat array of 27 signed shorts
  x[0] = _mm_loadu_si128((__m128i*)(dataFlat + 0));
  x[1] = _mm_loadu_si128((__m128i*)(dataFlat + 8));
  x[2] = _mm_loadu_si128((__m128i*)(dataFlat + 16));
  x[3] = _mm_loadu_si128((__m128i*)(dataFlat + 24));
  x[3] = _mm_srli_si128(_mm_slli_si128(x[3], 10), 10); // Clear dataFlat[27..31]

  for(int i = 0; i < FitMatrix::ROWS; i++)
  {
    // Compute scalar product between ((signed short*)x)[0..31] and localFitMatrix
    __m128i sum = _mm_madd_epi16(x[0], *(__m128i*)(localFitMatrix + 0));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[1], *(__m128i*)(localFitMatrix + 8)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[2], *(__m128i*)(localFitMatrix + 16)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[3], *(__m128i*)(localFitMatrix + 24)));
    sum = _mm_hadd_epi32(sum, sum); // note: _mm_hadd_epi32 requires SSSE3
    sum = _mm_hadd_epi32(sum, sum);
    _mm_store_ss(coef + i, _mm_mul_ss(_mm_cvtepi32_ps(sum), localFitMatrixScale));
    localFitMatrix += 32;
  }
}
/* use compiler intrinsics for 4x parallel processing */
static inline float chi2_intrinsic_aligned_float(int n, const float* x, const float* y)
{
    float result = 0;
    const __m128 eps  = _mm_set1_ps(FLT_MIN);
    const __m128 zero = _mm_setzero_ps();
    __m128 chi2 = _mm_setzero_ps();

    for (; n > 3; n -= 4) {
        const __m128 a = _mm_loadu_ps(x);
        const __m128 b = _mm_loadu_ps(y);
        const __m128 a_plus_eps        = _mm_add_ps(a, eps);
        const __m128 a_plus_b_plus_eps = _mm_add_ps(a_plus_eps, b);
        const __m128 a_minus_b         = _mm_sub_ps(a, b);
        const __m128 a_minus_b_sq      = _mm_mul_ps(a_minus_b, a_minus_b);
        const __m128 prod              = _mm_div_ps(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_ps(chi2, prod);
        x += 4;
        y += 4;
    }

    const __m128 shuffle1 = _mm_shuffle_ps(chi2, chi2, _MM_SHUFFLE(1, 0, 3, 2));
    const __m128 sum1     = _mm_add_ps(chi2, shuffle1);
    const __m128 shuffle2 = _mm_shuffle_ps(sum1, sum1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 sum2     = _mm_add_ps(sum1, shuffle2);
    // with SSE3, we could use hadd_ps, but the difference is negligible

    _mm_store_ss(&result, sum2);
    _mm_empty();

    if (n)
        result += chi2_baseline_float(n, x, y); // remaining 1-3 entries
    return result;
}
void vector_accumulate(float *out, const float *in, int n)
{
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    int ii;
    /* Main loop, unrolled 4x (16 floats per iteration). */
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in + 64, 0, 0);
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
    }
    for (; ii < n ; ii++) {
        in_ = _mm_load_ss(in);
        out_ = _mm_load_ss(out);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in += 1;
        out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) {
        out[i] += in[i];
    }
#endif
}
float SSEVector3::operator *( const SSEVector3& v ) const
{
    float result[ 4 ];

    // Store in lowest float, do not multiply fourth value: 0111 0001
    const int mask = 0x71;
    _mm_store_ss( result, _mm_dp_ps( vec, v.vec, mask ) );

    return result[ 0 ];
}
float SSEVector3::Length() const
{
    float result[ 4 ];
    float lengthSquared = LengthSquared();

    // Store in all floats, do not multiply fourth value: 0111 1111
    const int mask = 0x7F;
    _mm_store_ss( result, _mm_sqrt_ss( _mm_dp_ps( vec, vec, mask ) ) );

    return result[ 0 ];
}
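/* Hedged note on the _mm_dp_ps immediate (illustration, not source code):
 * the high nibble selects which input lanes enter the multiply, the low
 * nibble selects which output lanes receive the dot product. 0x71 and 0x7F
 * both multiply x,y,z only; they differ only in how many lanes the result
 * is broadcast to, which _mm_store_ss does not care about. */
static inline float dot3_dp(__m128 a, __m128 b)
{
    float r;
    _mm_store_ss(&r, _mm_dp_ps(a, b, 0x71)); /* xyz dot -> lane 0 */
    return r;
}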
HW_FORCE_INLINE float dot(const Vec<N>& a, const Vec<N>& b)
{
    __m128 x = _mm_mul_ps(a.xmm, b.xmm);
    x = _mm_hadd_ps(x, x);
    x = _mm_hadd_ps(x, x);
    float tmp;
    _mm_store_ss(&tmp, x);
    return tmp;
}
F32 root(F32 val)
{
#ifdef USE_SSE2
    __m128 i = _mm_set_ss(val);
    i = _mm_sqrt_ss(i);
    _mm_store_ss(&val, i);
    return val;
#else
    return (F32)core_sqrt((F64)val);
#endif
}
// use MMX/SSE extensions
void dotprod_rrrf_execute_mmx(dotprod_rrrf _q, float * _x, float * _y)
{
    // first cut: ...
    __m128 v;   // input vector
    __m128 h;   // coefficients vector
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register

    // t = 4*(floor(_n/4))
    unsigned int t = (_q->n >> 2) << 2;

    //
    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        v = _mm_loadu_ps(&_x[i]);

        // load coefficients into register (aligned)
        h = _mm_load_ps(&_q->h[i]);

        // compute multiplication
        s = _mm_mul_ps(v, h);

        // parallel addition
        sum = _mm_add_ps( sum, s );
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

#if HAVE_PMMINTRIN_H
    // fold down into single value
    __m128 z = _mm_setzero_ps();
    sum = _mm_hadd_ps(sum, z);
    sum = _mm_hadd_ps(sum, z);

    // unload single (lower value)
    _mm_store_ss(w, sum);
    float total = w[0];
#else
    // unload packed array
    _mm_store_ps(w, sum);
    float total = w[0] + w[1] + w[2] + w[3];
#endif

    // cleanup
    for (; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
// Update location by velocity, one time-step
void update_coords(uint32_t i, float* x, float* y, float* z,
                   float* vx, float* vy, float* vz)
{
    __m128 vec, flo, out;

    vec = _mm_set_ss(vx[i]);
    flo = _mm_set_ss(x[i]);
    out = _mm_add_ss(vec, flo);
    _mm_store_ss(&x[i], out);

    vec = _mm_set_ss(vy[i]);
    flo = _mm_set_ss(y[i]);
    out = _mm_add_ss(vec, flo);
    _mm_store_ss(&y[i], out);

    vec = _mm_set_ss(vz[i]);
    flo = _mm_set_ss(z[i]);
    out = _mm_add_ss(vec, flo);
    _mm_store_ss(&z[i], out);
}
HW_FORCE_INLINE float invLength(const Vec<N>& a)
{
    __m128 x = _mm_mul_ps(a.xmm, a.xmm);
    x = _mm_hadd_ps(x, x);
    x = _mm_hadd_ps(x, x);
    // _mm_rsqrt_ss is a fast approximation (roughly 12 bits of precision).
    x = _mm_rsqrt_ss(x);
    float tmp;
    _mm_store_ss(&tmp, x);
    return tmp;
}
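/* Hedged refinement sketch (not part of the original source): _mm_rsqrt_ss
 * gives only ~12 bits of precision. If that is too loose for the caller,
 * one Newton-Raphson step, y' = y * (1.5 - 0.5 * x * y * y), recovers
 * close to full single precision. */
static inline float rsqrt_nr(float x)
{
    __m128 xs = _mm_set_ss(x);
    __m128 y  = _mm_rsqrt_ss(xs);
    __m128 hx = _mm_mul_ss(_mm_set_ss(0.5f), xs);
    y = _mm_mul_ss(y, _mm_sub_ss(_mm_set_ss(1.5f),
                                 _mm_mul_ss(hx, _mm_mul_ss(y, y))));
    float r;
    _mm_store_ss(&r, y);
    return r;
}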
void R_LocalPointToGlobal( const float modelMatrix[16], const idVec3 &in, idVec3 &out ) {
#if defined(MACOS_X) && defined(__i386__)
	__m128 m0, m1, m2, m3;
	__m128 in0, in1, in2;
	float i0, i1, i2;
	i0 = in[0];
	i1 = in[1];
	i2 = in[2];

	m0 = _mm_loadu_ps(&modelMatrix[0]);
	m1 = _mm_loadu_ps(&modelMatrix[4]);
	m2 = _mm_loadu_ps(&modelMatrix[8]);
	m3 = _mm_loadu_ps(&modelMatrix[12]);

	in0 = _mm_load1_ps(&i0);
	in1 = _mm_load1_ps(&i1);
	in2 = _mm_load1_ps(&i2);

	m0 = _mm_mul_ps(m0, in0);
	m1 = _mm_mul_ps(m1, in1);
	m2 = _mm_mul_ps(m2, in2);

	m0 = _mm_add_ps(m0, m1);
	m0 = _mm_add_ps(m0, m2);
	m0 = _mm_add_ps(m0, m3);

	_mm_store_ss(&out[0], m0);
	m1 = (__m128) _mm_shuffle_epi32((__m128i)m0, 0x55);
	_mm_store_ss(&out[1], m1);
	m2 = _mm_movehl_ps(m2, m0);
	_mm_store_ss(&out[2], m2);
#else
	out[0] = in[0] * modelMatrix[0] + in[1] * modelMatrix[4] + in[2] * modelMatrix[8] + modelMatrix[12];
	out[1] = in[0] * modelMatrix[1] + in[1] * modelMatrix[5] + in[2] * modelMatrix[9] + modelMatrix[13];
	out[2] = in[0] * modelMatrix[2] + in[1] * modelMatrix[6] + in[2] * modelMatrix[10] + modelMatrix[14];
#endif
}
/*
====================
idMD5Mesh::CalculateBounds
====================
*/
void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const {
	__m128 minX = vector_float_posInfinity;
	__m128 minY = vector_float_posInfinity;
	__m128 minZ = vector_float_posInfinity;
	__m128 maxX = vector_float_negInfinity;
	__m128 maxY = vector_float_negInfinity;
	__m128 maxZ = vector_float_negInfinity;
	for ( int i = 0; i < numMeshJoints; i++ ) {
		const idJointMat & joint = entJoints[meshJoints[i]];
		__m128 x = _mm_load_ps( joint.ToFloatPtr() + 0 * 4 );
		__m128 y = _mm_load_ps( joint.ToFloatPtr() + 1 * 4 );
		__m128 z = _mm_load_ps( joint.ToFloatPtr() + 2 * 4 );
		minX = _mm_min_ps( minX, x );
		minY = _mm_min_ps( minY, y );
		minZ = _mm_min_ps( minZ, z );
		maxX = _mm_max_ps( maxX, x );
		maxY = _mm_max_ps( maxY, y );
		maxZ = _mm_max_ps( maxZ, z );
	}
	__m128 expand = _mm_splat_ps( _mm_load_ss( & maxJointVertDist ), 0 );
	minX = _mm_sub_ps( minX, expand );
	minY = _mm_sub_ps( minY, expand );
	minZ = _mm_sub_ps( minZ, expand );
	maxX = _mm_add_ps( maxX, expand );
	maxY = _mm_add_ps( maxY, expand );
	maxZ = _mm_add_ps( maxZ, expand );
	_mm_store_ss( bounds.ToFloatPtr() + 0, _mm_splat_ps( minX, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 1, _mm_splat_ps( minY, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 2, _mm_splat_ps( minZ, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 3, _mm_splat_ps( maxX, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );
}
static void sse2_test (void)
{
  float a = 1.0f;
  float b = 2.0f;
  float c = 3.0f;
  float r;

  /* v = { 0, c, b, a } (lane 0 first); shifting the whole register right
     by 4 bytes moves c into lane 0. */
  __m128 v = _mm_set_ps(a, b, c, 0);
  v = (__m128)_mm_srli_si128((__m128i)v, 4);
  _mm_store_ss(&r, v);
  if (r != 3.0f)
    abort ();
}
// ~~~~~~~~~~~~~~~ Task2
void mulVectorSse(MATRIX_TYPE** matrix, MATRIX_TYPE* vector, MATRIX_TYPE* result, size_t size)
{
    // Assumes size is a multiple of 4 and each row/vector is 16-byte
    // aligned (required by _mm_load_ps).
    for (size_t i = 0; i < size; i++)
    {
        __m128 localSum = _mm_setzero_ps();
        for (size_t j = 0; j < size; j += 4)
        {
            __m128 tempMatrix = _mm_load_ps(&matrix[i][j]);
            __m128 tempVector = _mm_load_ps(&vector[j]);
            localSum = _mm_add_ps(localSum, _mm_mul_ps(tempMatrix, tempVector));
        }
        // Horizontal sum of the four partial products (SSE3).
        localSum = _mm_hadd_ps(localSum, localSum);
        localSum = _mm_hadd_ps(localSum, localSum);
        _mm_store_ss(&result[i], localSum);
    }
}
void FastResampler_FirFilter2_C1_SSE2(unsigned int channels, unsigned int filter_length,
                                      float* coef1, float* coef2, float frac,
                                      float* input, float* output)
{
	Q_UNUSED(channels);
	__m128 sum = _mm_setzero_ps();
	__m128 v_frac = _mm_set1_ps(frac);
	for(unsigned int i = 0; i < filter_length / 4; ++i) {
		__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
		coef1 += 4; coef2 += 4;
		// Interpolate between the two coefficient tables by frac.
		__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
		__m128 v_input = _mm_loadu_ps(input); input += 4;
		sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
	}
	// Horizontal sum: fold the high pair onto the low pair, then lane 1 onto lane 0.
	__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
	__m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
	_mm_store_ss(output, sum3);
}
float reduction_sum_sse(float *v, int n)
{
    int i;
    float sum;
    __m128 *v4 = (__m128 *)v;
    __m128 vsum = _mm_set1_ps(0.0f);

    /* Assumes v is 16-byte aligned; any tail elements (n % 4) are ignored. */
    for (i = 0; i < n / 4; i++)
        vsum = _mm_add_ps(vsum, v4[i]);

    /* Horizontal sum (SSE3). */
    vsum = _mm_hadd_ps(vsum, vsum);
    vsum = _mm_hadd_ps(vsum, vsum);
    _mm_store_ss(&sum, vsum);
    return sum;
}
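/* Hedged scalar reference (assumption about the intended semantics): the
 * same reduction without the SSE constraints, including the tail elements
 * the vector loop above drops when n is not a multiple of 4. */
float reduction_sum_ref(const float *v, int n)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += v[i];
    return sum;
}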