template <class Tout>
inline void GDALCopyWordSSE(const float fValueIn, Tout &tValueOut)
{
    float fMaxVal, fMinVal;
    GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
    __m128 xmm = _mm_set_ss(fValueIn);
    __m128 xmm_min = _mm_set_ss(fMinVal);
    __m128 xmm_max = _mm_set_ss(fMaxVal);
    xmm = _mm_min_ss(_mm_max_ss(xmm, xmm_min), xmm_max);
#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    __m128 p0d5 = _mm_set_ss(0.5f);
    if (std::numeric_limits<Tout>::is_signed)
    {
        __m128 mask = _mm_cmpge_ss(xmm, _mm_set_ss(0.f));
        __m128 m0d5 = _mm_set_ss(-0.5f);
        xmm = _mm_add_ss(xmm,
                         _mm_or_ps(_mm_and_ps(mask, p0d5), _mm_andnot_ps(mask, m0d5)));
    }
    else
    {
        xmm = _mm_add_ss(xmm, p0d5);
    }
#endif
#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    tValueOut = (Tout)_mm_cvttss_si32(xmm);
#else
    tValueOut = (Tout)_mm_cvtss_si32(xmm);
#endif
}
int intr_frustum_box(const float* frustum, const float* box_min, const float* box_max)
{
    const __m128 min = _mm_load_ps(box_min);
    const __m128 max = _mm_load_ps(box_max);
    for (int i = 0; i < 6; i++) {
        const __m128 plane = _mm_load_ps(frustum + 4 * i);
        const __m128 mask = _mm_cmplt_ps(plane, _mm_setzero_ps());
        const __m128 n = _mm_or_ps(_mm_and_ps(mask, max), _mm_andnot_ps(mask, min));
        const __m128 d = _mm_mul_ps(n, plane);
        const __m128 d0 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(2, 1, 0, 3));
        const __m128 d1 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(1, 0, 3, 2));
        const __m128 d2 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(0, 3, 2, 1));
        const __m128 dot = _mm_add_ss(_mm_add_ss(d0, d), _mm_add_ss(d1, d2));
        const __m128 ret = _mm_cmpgt_ss(dot, _mm_setzero_ps());
        float reti;
        _mm_store_ss(&reti, ret);
        if (reti != 0)
            return 0;
    }
    return 1;
}
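/*
 * A minimal usage sketch for intr_frustum_box above, not taken from the original
 * source. The aligned loads imply 16-byte-aligned inputs, and the fourth lane of
 * the per-plane dot product multiplies plane[3] by the corner's w component, so
 * the box endpoints appear to be expected as (x, y, z, 1) with each plane stored
 * as (a, b, c, d). The identifiers below (planes, bmin, bmax) are illustrative only.
 */
static int box_visible_example(void)
{
    /* Axis-aligned unit box, endpoints given as (x, y, z, 1). */
    static const float bmin[4] __attribute__((aligned(16))) = { -1.0f, -1.0f, -1.0f, 1.0f };
    static const float bmax[4] __attribute__((aligned(16))) = {  1.0f,  1.0f,  1.0f, 1.0f };
    /* Six planes as (a, b, c, d); filled elsewhere, e.g. from a view-projection matrix. */
    static float planes[6][4] __attribute__((aligned(16)));
    return intr_frustum_box(&planes[0][0], bmin, bmax);
}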
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
                         int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    __m128 xsum1, xsum2;
    xsum1 = _mm_setzero_ps();
    xsum2 = _mm_setzero_ps();
    for (i=0;i<N-3;i+=4)
    {
        __m128 xi = _mm_loadu_ps(x+i);
        __m128 y1i = _mm_loadu_ps(y01+i);
        __m128 y2i = _mm_loadu_ps(y02+i);
        xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
        xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
    }
    /* Horizontal sum */
    xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
    xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
    _mm_store_ss(xy1, xsum1);
    xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
    xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
    _mm_store_ss(xy2, xsum2);
    for (;i<N;i++)
    {
        *xy1 = MAC16_16(*xy1, x[i], y01[i]);
        *xy2 = MAC16_16(*xy2, x[i], y02[i]);
    }
}
inline __m128 SSENormalizeMultiplierSSE2(__m128 v)
{
    const __m128 sq = _mm_mul_ps(v, v);
    const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
    const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
    const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));
    const __m128 rt = _mm_rsqrt_ss(res);
    return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
}
/* Combines unpack and accumulate */
void vector_accumulate_8bit(float *out, const char *in, int n)
{
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    float ftmp;
    int ii;
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in + 64, 0, 0);
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
    }
    for (; ii < n ; ii++) {
        // Cast these without intrinsics
        ftmp = (float)(*in);
        out_ = _mm_load_ss(out);
        in_ = _mm_load_ss(&ftmp);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in += 1; out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) { out[i] += (float)in[i]; }
#endif
}
int drid_moments(float* coords, int32_t index, int32_t* partners, int32_t n_partners, double* moments)
{
    int32_t i;
    float d;
    moments_t onlinemoments;
    __m128 x, y, r, r2, s;

    moments_clear(&onlinemoments);
    x = load_float3(&coords[3 * index]);
    for (i = 0; i < n_partners; i++) {
        y = load_float3(&coords[3 * partners[i]]);
        r = _mm_sub_ps(x, y);    /* x - y */
        r2 = _mm_mul_ps(r, r);   /* (x - y)**2 */
        /* horizontal add the components of r2 with two instructions.
           note: it's critical here that the last entry of x and y was 0,
           so that r2.w = 0 */
        s = _mm_add_ps(r2, _mm_movehl_ps(r2, r2));
        s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 1));
        /* store into a regular float. I tried using _mm_rsqrt_ps, but it's
           not accurate enough to pass the tests */
        _mm_store_ss(&d, s);
        moments_push(&onlinemoments, 1.0 / sqrt((double) d));
    }
    moments[0] = moments_mean(&onlinemoments);
    moments[1] = sqrt(moments_second(&onlinemoments));
    moments[2] = cbrt(moments_third(&onlinemoments));
    return 1;
}
_inline float process_folded_fir_sse2(const float *fir_kernel, const float *queue_head,
                                      const float *queue_tail, int len)
{
    __m128 acc = _mm_set_ps(0, 0, 0, 0);
    queue_tail -= 3;
    len >>= 2;
    while (len > 0) {
        __m128 head = _mm_loadu_ps(queue_head);
        __m128 tail = _mm_loadu_ps(queue_tail);
        __m128 kern = _mm_load_ps(fir_kernel);
        tail = _mm_shuffle_ps(tail, tail, 0x1b); // swap the order
        __m128 t1 = _mm_add_ps(tail, head);      // add the head
        t1 = _mm_mul_ps(t1, kern);               // mul
        acc = _mm_add_ps(acc, t1);               // add
        queue_head += 4;
        queue_tail -= 4;
        fir_kernel += 4;
        len--;
    }
    // horizontal sum
    const __m128 t = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
    const __m128 sum = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
    return _mm_cvtss_f32(sum);
}
__m128 COMPUTE()
{
    __m128 v0 = _mm_setzero_ps();
    __m128 v1 = _mm_set1_ps(1.0f);
    __m128 v2 = _mm_set1_ps(1.0f);
    __m128 v3 = _mm_sub_ss(v1, v2);
    return _mm_add_ss(v3, v0);
}
void vector_accumulate(float *out, const float *in, int n)
{
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    int ii;
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in + 64, 0, 0);
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
    }
    for (; ii < n ; ii++) {
        in_ = _mm_load_ss(in);
        out_ = _mm_load_ss(out);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in += 1; out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) { out[i] += in[i]; }
#endif
}
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{
#ifdef AVX
    //Black magic
    //But it does produce the same results as the not AVX code
    __m256 accumulator;
    __m256 x_s = _mm256_load_ps(x->Features);
    __m256 y_s = _mm256_load_ps(y->Features);
    __m256 result = _mm256_sub_ps(x_s, y_s);
    accumulator = _mm256_mul_ps(result, result);

    x_s = _mm256_load_ps(&x->Features[8]);
    y_s = _mm256_load_ps(&y->Features[8]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[16]);
    y_s = _mm256_load_ps(&y->Features[16]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[24]);
    y_s = _mm256_load_ps(&y->Features[24]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    //We now have a vector of 8 floats
    __m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
    __m256 t2 = _mm256_hadd_ps(t1, t1);
    __m128 t3 = _mm256_extractf128_ps(t2, 1);
    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
    //And now we don't
    return std::sqrtf(_mm_cvtss_f32(t4));
#endif
#ifndef AVX
    //Can be autovectorized
    float accumulator[32];
    float distance = 0;
    for (int i = 0; i < 30; i++) {
        accumulator[i] = x->Features[i] - y->Features[i];
    }
    //If done properly this should be 4(8) instructions
    for (int i = 0; i < 30; i++) {
        distance += accumulator[i] * accumulator[i];
    }
    return std::sqrtf(distance);
#endif
}
static inline __m128 horizontal_add(const __m128 a)
{
#if 0 //!! needs SSE3
    const __m128 ftemp = _mm_hadd_ps(a, a);
    return _mm_hadd_ps(ftemp, ftemp);
#else
    const __m128 ftemp = _mm_add_ps(a, _mm_movehl_ps(a, a)); //a0+a2,a1+a3
    return _mm_add_ss(ftemp, _mm_shuffle_ps(ftemp, ftemp, _MM_SHUFFLE(1, 1, 1, 1))); //(a0+a2)+(a1+a3)
#endif
}
//Thanks stack overflow.
static inline float _mm256_reduce_add_ps(__m256 x)
{
    /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
    const int imm = 1;
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x, imm), _mm256_castps256_ps128(x));
    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    /* Conversion to float is a no-op on x86-64 */
    return _mm_cvtss_f32(x32);
}
// Update location by velocity, one time-step
void update_coords(uint32_t i, float* x, float* y, float* z, float* vx, float* vy, float* vz)
{
    __m128 vec, flo, out;

    vec = _mm_set_ss(vx[i]);
    flo = _mm_set_ss(x[i]);
    out = _mm_add_ss(vec, flo);
    _mm_store_ss(&x[i], out);

    vec = _mm_set_ss(vy[i]);
    flo = _mm_set_ss(y[i]);
    out = _mm_add_ss(vec, flo);
    _mm_store_ss(&y[i], out);

    vec = _mm_set_ss(vz[i]);
    flo = _mm_set_ss(z[i]);
    out = _mm_add_ss(vec, flo);
    _mm_store_ss(&z[i], out);
}
inline float hadd(const vector4f& rhs)
{
#if SSE_INSTR_SET >= 3 // SSE3
    __m128 tmp0 = _mm_hadd_ps(rhs, rhs);
    __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
#else
    __m128 tmp0 = _mm_add_ps(rhs, _mm_movehl_ps(rhs, rhs));
    __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
#endif
    return _mm_cvtss_f32(tmp1);
}
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
static inline float horizontal_sum_avx2(__m256 x)
{
    const __m128 hi_quad = _mm256_extractf128_ps(x, 1);
    const __m128 lo_quad = _mm256_castps256_ps128(x);
    const __m128 sum_quad = _mm_add_ps(lo_quad, hi_quad);
    const __m128 lo_dual = sum_quad;
    const __m128 hi_dual = _mm_movehl_ps(sum_quad, sum_quad);
    const __m128 sum_dual = _mm_add_ps(lo_dual, hi_dual);
    const __m128 lo = sum_dual;
    const __m128 hi = _mm_shuffle_ps(sum_dual, sum_dual, 0x1);
    const __m128 sum = _mm_add_ss(lo, hi);
    return _mm_cvtss_f32(sum);
}
void FastResampler_FirFilter2_C1_SSE2(unsigned int channels, unsigned int filter_length,
                                      float* coef1, float* coef2, float frac,
                                      float* input, float* output)
{
    Q_UNUSED(channels);
    __m128 sum = _mm_setzero_ps();
    __m128 v_frac = _mm_set1_ps(frac);
    for(unsigned int i = 0; i < filter_length / 4; ++i) {
        __m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
        coef1 += 4; coef2 += 4;
        __m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
        __m128 v_input = _mm_loadu_ps(input);
        input += 4;
        sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
    }
    __m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
    __m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
    _mm_store_ss(output, sum3);
}
// Sums an array of floats; needed in replacement of Python sum()
float sum(float* a, uint_fast32_t num_elements)
{
    float total = 0.0f;
    __m128 sumflo = _mm_set_ss(total);
    for (uint_fast32_t i = 0; i < num_elements; i++) {
        __m128 avec = _mm_set_ss(a[i]);
        sumflo = _mm_add_ss(avec, sumflo);  // keep the running total in the register
    }
    _mm_store_ss(&total, sumflo);
    return total;
}
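/*
 * A hedged sketch (not from the original source) of a 4-wide version of sum()
 * above: accumulate four partial sums per iteration, then reduce them with the
 * same movehl/shuffle horizontal-sum pattern used elsewhere in this collection.
 * Unaligned loads are used, so no alignment requirement is placed on `a`.
 */
float sum_sse_4wide(const float* a, uint_fast32_t num_elements)
{
    __m128 acc = _mm_setzero_ps();
    uint_fast32_t i = 0;
    for (; i + 4 <= num_elements; i += 4)
        acc = _mm_add_ps(acc, _mm_loadu_ps(a + i));
    acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));         /* a0+a2, a1+a3 */
    acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));  /* (a0+a2)+(a1+a3) */
    float total = _mm_cvtss_f32(acc);
    for (; i < num_elements; i++)  /* scalar tail */
        total += a[i];
    return total;
}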
float dot_product(const int N, const float *X, const int incX, const float *Y, const int incY)
{
    __m256 accum = _mm256_setzero_ps();
    for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
        __m256 xval = _mm256_load_ps(X);
        __m256 yval = _mm256_load_ps(Y);
        __m256 val = _mm256_mul_ps(xval, yval);
        accum = _mm256_add_ps(val, accum);
    }
    // Reduce the values in accum into the smallest 32-bit subsection
    // a0 a1 a2 a3 a4 a5 a6 a7 -> b0 b1 b2 b3
    __m128 accum2 = _mm_add_ps(_mm256_castps256_ps128(accum), _mm256_extractf128_ps(accum, 1));
    // b0 b1 b2 b3 -> c0 c1 b2 b3
    accum2 = _mm_add_ps(accum2, _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(accum2), 8)));
    // Add the high and low halves
    __m128 final_val = _mm_add_ss(_mm_insert_ps(accum2, accum2, 0x4e), accum2);
    return _mm_cvtss_f32(final_val);
}
static int _ccv_nnc_gemm_forw_sse2(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
{
    const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
    const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
    const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
    const int* bdim = (b_nd == 1) ? b->info.dim : b->info.dim + 1;
    assert(bdim[0] == bias->info.dim[0]);
    assert(bdim[0] == w->info.dim[0]);
    assert(adim[0] == w->info.dim[1]);
    const int* ainc = CCV_IS_TENSOR_VIEW(a) ? (a_nd == 1 ? a->inc : a->inc + 1) : adim;
    const int* binc = CCV_IS_TENSOR_VIEW(b) ? (b_nd == 1 ? b->inc : b->inc + 1) : bdim;
    const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
    const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
    int i;
    for (i = 0; i < batch_size; i++)
    {
        const float* const ap = a->data.f32 + i * ainc[0];
        float* const bp = b->data.f32 + i * binc[0];
        parallel_for(j, bdim[0]) {
            const float* const wp = w->data.f32 + j * winc[1];
            int k;
            __m128 v40 = _mm_set_ss(bias->data.f32[j]);
            __m128 v41 = _mm_setzero_ps();
            for (k = 0; k < adim[0] - 7; k += 8)
            {
                __m128 ap40 = _mm_load_ps(ap + k);
                __m128 ap41 = _mm_load_ps(ap + k + 4);
                __m128 w40 = _mm_load_ps(wp + k);
                __m128 w41 = _mm_load_ps(wp + k + 4);
                v40 = _mm_add_ps(_mm_mul_ps(w40, ap40), v40);
                v41 = _mm_add_ps(_mm_mul_ps(w41, ap41), v41);
            }
            v40 = _mm_add_ps(v40, v41);
            v41 = _mm_add_ps(v40, _mm_movehl_ps(v40, v40));
            v40 = _mm_add_ss(v41, _mm_shuffle_ps(v41, v41, 1));
            _mm_store_ss(bp + j, v40);
        } parallel_endfor
    }
    return CCV_NNC_EXEC_SUCCESS;
}
static REAL dotp_sse(REAL a[], REAL b[])
{
#ifdef __SSE__
    /* This is taken from speex's inner product implementation */
    int j;
    REAL sum;
    __m128 acc = _mm_setzero_ps();
    for (j=0;j<NLMS_LEN;j+=8)
    {
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j), _mm_loadu_ps(b+j)));
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j+4), _mm_loadu_ps(b+j+4)));
    }
    acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
    acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));
    _mm_store_ss(&sum, acc);
    return sum;
#else
    return dotp(a, b);
#endif
}
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
inline float sum8(__m256 x)
{
    // hiQuad = ( x7, x6, x5, x4 )
    const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
    // loQuad = ( x3, x2, x1, x0 )
    const __m128 loQuad = _mm256_castps256_ps128(x);
    // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
    const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
    // loDual = ( -, -, x1 + x5, x0 + x4 )
    const __m128 loDual = sumQuad;
    // hiDual = ( -, -, x3 + x7, x2 + x6 )
    const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
    // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
    const __m128 sumDual = _mm_add_ps(loDual, hiDual);
    // lo = ( -, -, -, x0 + x2 + x4 + x6 )
    const __m128 lo = sumDual;
    // hi = ( -, -, -, x1 + x3 + x5 + x7 )
    const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
    // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
    const __m128 sum = _mm_add_ss(lo, hi);
    return _mm_cvtss_f32(sum);
}
double CMercerKernel<float>::Evaluate(float* x, float* y)
{
#ifndef __SSE4_1__
    float result = 0;
    for(size_t i=0; i<m_n; i++)
        result += x[i]*y[i];
    return static_cast<double>(result);
#else
    __m128* px = reinterpret_cast<__m128*>(x);
    __m128* py = reinterpret_cast<__m128*>(y);
    float zero = 0.0;
    __m128 sum = _mm_load1_ps(&zero);
    const int mask = 241; // 4 MSB mask input, 4 LSB mask output
    for(size_t i=0; i<m_offset/4; i++) {
        __m128 temp = _mm_dp_ps(px[i], py[i], mask);
        sum = _mm_add_ss(sum, temp); // accumulate result in first register
    }
    float result[4] = {0.0, 0.0, 0.0, 0.0};
    _mm_storeu_ps(result, sum);
    // add offset
    for(size_t i=m_offset; i<m_n; i++)
        result[0] += x[i]*y[i];
    return static_cast<double>(result[0]);
#endif
}
SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n)
{
    int i;
    float z;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    z = 0.0f;
    if ((i = n & ~3))
    {
        n4 = _mm_setzero_ps();  //sets sum to zero
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n3 = _mm_mul_ps(n1, n2);
            n4 = _mm_add_ps(n4, n3);
        }
        n4 = _mm_add_ps(_mm_movehl_ps(n4, n4), n4);
        n4 = _mm_add_ss(_mm_shuffle_ps(n4, n4, 1), n4);
        _mm_store_ss(&z, n4);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z += x[n - 3]*y[n - 3];
    case 2:
        z += x[n - 2]*y[n - 2];
    case 1:
        z += x[n - 1]*y[n - 1];
    }
    return z;
}
opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    float xy;
    __m128 sum;
    sum = _mm_setzero_ps();
    /* FIXME: We should probably go 8-way and use 2 sums. */
    for (i=0;i<N-3;i+=4)
    {
        __m128 xi = _mm_loadu_ps(x+i);
        __m128 yi = _mm_loadu_ps(y+i);
        sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
    }
    /* Horizontal sum */
    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
    _mm_store_ss(&xy, sum);
    for (;i<N;i++)
    {
        xy = MAC16_16(xy, x[i], y[i]);
    }
    return xy;
}
__m128 __attribute__((__target__("sse"))) mm_add_ss_wrap(__m128 a, __m128 b) { return _mm_add_ss(a, b); }
ibMtx4& ibMtx4::Invert()
{
    f32* src = &data.a[0][0];
    __m128 minor0, minor1, minor2, minor3;
    __m128 row0, row1, row2, row3;
    __m128 det, tmp1;

#if !defined NDEBUG || defined STATIC
    // Suppress RTC error for uninit vars
    f32 init = 0.f;
    row3 = row1 = tmp1 = _mm_load_ps1( &init );
#endif // NDEBUG

    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
    row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
    row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
    row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
    row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
    row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
    row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_mul_ps(row1, tmp1);
    minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    minor3 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
    // -----------------------------------------------
    det = _mm_mul_ps(row0, minor0);
    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
    tmp1 = _mm_rcp_ss(det);
    det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
    det = _mm_shuffle_ps(det, det, 0x00);
    minor0 = _mm_mul_ps(det, minor0);
    _mm_storel_pi((__m64*)(src), minor0);
    _mm_storeh_pi((__m64*)(src+2), minor0);
    minor1 = _mm_mul_ps(det, minor1);
    _mm_storel_pi((__m64*)(src+4), minor1);
    _mm_storeh_pi((__m64*)(src+6), minor1);
    minor2 = _mm_mul_ps(det, minor2);
    _mm_storel_pi((__m64*)(src+ 8), minor2);
    _mm_storeh_pi((__m64*)(src+10), minor2);
    minor3 = _mm_mul_ps(det, minor3);
    _mm_storel_pi((__m64*)(src+12), minor3);
    _mm_storeh_pi((__m64*)(src+14), minor3);
    return *this;
}
phash phash_for_pixmap(const QPixmap& pixmap)
{
    static bool cos_table_initialized = false;
    ALIGN(16, static float cos_table[8][8][32][32]);
    ALIGN(16, float intensity[32][32]);

    if(!cos_table_initialized) {
        cos_table_initialized = true;
        // 32x32 DCT, though we are only interested in the top left 8x8,
        // representing lowest frequencies in the image
        for(int u = 0; u < 8; u++) {
            for(int v = 0; v < 8; v++) {
                for(int y = 0; y < 32; y++) {
                    for(int x = 0; x < 32; x++) {
                        cos_table[v][u][y][x] = cosf(M_PI / 32.0f * (x + 0.5f) * u)
                                              * cosf(M_PI / 32.0f * (y + 0.5f) * v);
                    }
                }
            }
        }
    }

    // Scale down to 32x32
    QImage image = pixmap.scaled(32, 32, Qt::IgnoreAspectRatio, Qt::SmoothTransformation).toImage();

    float dct[64];
    int counter = 0;

    // Convert to grayscale
    const __m128 luminance = _mm_set_ps(.0f, 0.2126f, 0.7152f, 0.0722f);
    for(int y = 0; y < 32; y++) {
        for(int x = 0; x < 32; x++) {
            QRgb pixel = image.pixel(x, y);
            __m128 p = _mm_set_ps(0, qRed(pixel), qGreen(pixel), qBlue(pixel));
            __m128 v = _mm_mul_ps(luminance, p);
            __m128 t = _mm_add_ps(v, _mm_movehl_ps(v, v));
            __m128 sum = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
            _mm_store_ss(&intensity[y][x], sum);
        }
    }

    // DCT
    for(int u = 0; u < 8; u++) {
        for(int v = 0; v < 8; v++) {
            __m128 acc = _mm_setzero_ps();
            for(int y = 0; y < 32; y++) {
                for(int x = 0; x < 32; x+=4) {
                    __m128 in = _mm_load_ps(&intensity[y][x]);
                    __m128 cos = _mm_load_ps(&cos_table[v][u][y][x]);
                    __m128 out = _mm_mul_ps(in, cos);
                    acc = _mm_add_ps(out, acc);
                }
            }
            __m128 t = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
            __m128 sum = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
            _mm_store_ss(&dct[counter++], sum);
        }
    }

    // Mean, skip first one
    float mean = 0.0;
    for(int i = 1; i < 64; i++) {
        mean += dct[i];
    }
    mean /= 63;

    // Calculate the final hash
    phash hash = 0;
    for(int i = 0; i < 64; i++) {
        phash val = dct[i] > mean;
        hash |= val << i;
    }
    return hash;
}
/*!
 * \brief Perform a horizontal sum of the given vector.
 * \param in The input vector type
 * \return the horizontal sum of the vector
 */
ETL_STATIC_INLINE(float) hadd(avx_simd_float in) {
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value));
    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    return _mm_cvtss_f32(x32);
}
void kernel_sgemv_t_1_lib4(int kmax, int kna, float *A, int sda, float *x, float *y, int alg)
{
    if(kmax<=0)
        return;

    const int lda = 4;

    int k;
    int ka = kmax - kna; // number from aligned position

    __m128 a_00_10_20_30, x_0_1_2_3, y_0, y_1;

    y_0 = _mm_setzero_ps();

    k = 0;
    if(kna>0)
    {
        for(; k<kna; k++)
        {
            x_0_1_2_3 = _mm_load_ss( &x[0] );
            a_00_10_20_30 = _mm_load_ss( &A[0+lda*0] );
            /* y_0 += a_00_10_20_30 * x_0_1_2_3; */
            a_00_10_20_30 = _mm_mul_ss( a_00_10_20_30, x_0_1_2_3 );
            y_0 = _mm_add_ss( y_0, a_00_10_20_30 );
            x += 1;
            A += 1;
        }
        A += (sda-1)*lda;
    }
    k = 0;
    for(; k<ka-3; k+=4)
    {
        x_0_1_2_3 = _mm_loadu_ps( &x[0] );
        a_00_10_20_30 = _mm_load_ps( &A[0+lda*0] );
        /* y_0 += a_00_10_20_30 * x_0_1_2_3; */
        a_00_10_20_30 = _mm_mul_ps( a_00_10_20_30, x_0_1_2_3 );
        y_0 = _mm_add_ps( y_0, a_00_10_20_30 );
        x += 4;
        A += 4;
        A += (sda-1)*lda;
    }
    for(; k<ka; k++)
    {
        x_0_1_2_3 = _mm_load_ss( &x[0] );
        a_00_10_20_30 = _mm_load_ss( &A[0+lda*0] );
        /* y_0 += a_00_10_20_30 * x_0_1_2_3; */
        a_00_10_20_30 = _mm_mul_ss( a_00_10_20_30, x_0_1_2_3 );
        y_0 = _mm_add_ss( y_0, a_00_10_20_30 );
        x += 1;
        A += 1;
    }

    __m128 y_0_1_2_3;

    y_1 = _mm_setzero_ps();
    y_0 = _mm_hadd_ps(y_0, y_1);
    y_0 = _mm_hadd_ps(y_0, y_1);

    if(alg==0)
    {
        _mm_store_ss(&y[0], y_0);
    }
    else if(alg==1)
    {
        y_0_1_2_3 = _mm_load_ss( &y[0] );
        y_0_1_2_3 = _mm_add_ss(y_0_1_2_3, y_0);
        _mm_store_ss(&y[0], y_0_1_2_3);
    }
    else // alg==-1
    {
        y_0_1_2_3 = _mm_load_ss( &y[0] );
        y_0_1_2_3 = _mm_sub_ss(y_0_1_2_3, y_0);
        _mm_store_ss(&y[0], y_0_1_2_3);
    }
}
// Does inverse according to Cramer's Rule
// See ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
void Mat44::Cramers_Inverse_SSE(const Mat44 *out, f32 &detv) const
{
    f32 *src = (f32*)&mat;

    __m128 minor0=_mm_setzero_ps(), minor1=_mm_setzero_ps(), minor2=_mm_setzero_ps(), minor3=_mm_setzero_ps();
    __m128 row0=_mm_setzero_ps(), row1=_mm_setzero_ps(), row2=_mm_setzero_ps(), row3=_mm_setzero_ps();
    __m128 det=_mm_setzero_ps(), tmp1=_mm_setzero_ps();

    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
    row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
    row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
    row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
    row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
    row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
    row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);

    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_mul_ps(row1, tmp1);
    minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);

    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    minor3 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);

    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);

    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));

    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));

    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);

    det = _mm_mul_ps(row0, minor0);
    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
    tmp1 = _mm_rcp_ss(det);
    det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
    det = _mm_shuffle_ps(det, det, 0x00);
    _mm_store_ss(&detv, det);

    Mat44 t;
    if(out)
    {
        src = (f32*)out->mat;
    }
    else
    {
        src = t.mat;
    }

    minor0 = _mm_mul_ps(det, minor0);
    _mm_storel_pi((__m64*)(src), minor0);
    _mm_storeh_pi((__m64*)(src+2), minor0);
    minor1 = _mm_mul_ps(det, minor1);
    _mm_storel_pi((__m64*)(src+4), minor1);
    _mm_storeh_pi((__m64*)(src+6), minor1);
    minor2 = _mm_mul_ps(det, minor2);
    _mm_storel_pi((__m64*)(src+ 8), minor2);
    _mm_storeh_pi((__m64*)(src+10), minor2);
    minor3 = _mm_mul_ps(det, minor3);
    _mm_storel_pi((__m64*)(src+12), minor3);
    _mm_storeh_pi((__m64*)(src+14), minor3);
}