#include <stdio.h>
#include <emmintrin.h>

int main( int argc, char **argv )
{
    /* set A = |1 3|,  B = |3 0|,  C = |0 0|   (column-major storage)
               |2 4|       |0 2|       |0 0| */
    double A[4] = {1,2,3,4}, B[4] = {3,0,0,2}, C[4] = {0,0,0,0};

    /* We are computing C = C + A x B, which means:
       C[0] += A[0]*B[0] + A[2]*B[1]
       C[1] += A[1]*B[0] + A[3]*B[1]
       C[2] += A[0]*B[2] + A[2]*B[3]
       C[3] += A[1]*B[2] + A[3]*B[3] */

    /* load entire matrix C into SIMD variables */
    __m128d c1 = _mm_loadu_pd( C+0 );   /* c1 = (C[0],C[1]) */
    __m128d c2 = _mm_loadu_pd( C+2 );   /* c2 = (C[2],C[3]) */

    for( int i = 0; i < 2; i++ )
    {
        __m128d a  = _mm_loadu_pd( A+i*2 );   /* load next column of A */
        __m128d b1 = _mm_load1_pd( B+0+i );
        __m128d b2 = _mm_load1_pd( B+2+i );   /* load next row of B */

        c1 = _mm_add_pd( c1, _mm_mul_pd( a, b1 ) );   /* multiply and add */
        c2 = _mm_add_pd( c2, _mm_mul_pd( a, b2 ) );
    }

    /* store the result back to the C array */
    _mm_storeu_pd( C+0, c1 );   /* (C[0],C[1]) = c1 */
    _mm_storeu_pd( C+2, c2 );   /* (C[2],C[3]) = c2 */

    /* output whatever we've got */
    printf( "|%g %g| * |%g %g| = |%g %g|\n", A[0], A[2], B[0], B[2], C[0], C[2] );
    printf( "|%g %g|   |%g %g|   |%g %g|\n", A[1], A[3], B[1], B[3], C[1], C[3] );

    return 0;
}
static void
scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }

  xmm1 = _mm_load_pd1(val);
  for (; n >= 4; n -= 4) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1 + 2);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest + 2, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
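/* A minimal driver for scalarmultiply_f64_ns_sse2_unroll2 above -- not part of
 * the original source, added only as an illustration of how the routine might
 * be called; it assumes <stdio.h> and <emmintrin.h> and checks the SIMD result
 * against a plain scalar multiply. */
#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
    double src[7] = {1, 2, 3, 4, 5, 6, 7};
    double dst[7];
    double val = 2.5;

    scalarmultiply_f64_ns_sse2_unroll2(dst, src, &val, 7);

    for (int i = 0; i < 7; i++) {
        if (dst[i] != src[i] * val) {
            printf("mismatch at %d: %g != %g\n", i, dst[i], src[i] * val);
            return 1;
        }
    }
    printf("ok\n");
    return 0;
}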
// multiply each of the n values at *p by v
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)
    const __m256d v4 = _mm256_set1_pd(v);

    // align p to a 32-byte boundary; the cases fall through intentionally
    switch ((size_t)p & 0x1F)
    {
    case 0x08:
        if (n > 0) { (*p++) *= v; n--; }
        /* fall through */
    case 0x10:
        if (n > 0) { (*p++) *= v; n--; }
        /* fall through */
    case 0x18:
        if (n > 0) { (*p++) *= v; n--; }
        /* fall through */
    case 0x00:
        for (; n >= 4; n-=4)
        {
            _mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));
            p += 4;
        }
        if (n >= 2)
        {
            _mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p),
                _mm256_castpd256_pd128(v4)));
            p += 2; n -= 2;
        }
        break;
    default:
        for (; n >= 4; n-=4)
        {
            _mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4));
            p += 4;
        }
        if (n >= 2)
        {
            _mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p),
                _mm256_castpd256_pd128(v4)));
            p += 2; n -= 2;
        }
    }

#elif defined(COREARRAY_SIMD_SSE2)
    const __m128d v2 = _mm_set1_pd(v);

    // align p to a 16-byte boundary; the cases fall through intentionally
    switch ((size_t)p & 0x0F)
    {
    case 0x08:
        if (n > 0) { (*p++) *= v; n--; }
        /* fall through */
    case 0x00:
        for (; n >= 2; n-=2, p+=2)
            _mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2));
        break;
    default:
        for (; n >= 2; n-=2, p+=2)
            _mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2));
    }
#endif

    for (; n > 0; n--) (*p++) *= v;
}
// *p += (*s) * v
COREARRAY_DLL_DEFAULT double *vec_f64_addmul(double *p, const double *s,
    size_t n, double v)
{
#if defined(COREARRAY_SIMD_SSE2)
    const __m128d v2 = _mm_set1_pd(v);

    switch ((size_t)p & 0x0F)
    {
    case 0x08:
        if (n > 0) { (*p++) += (*s++) * v; n--; }
        /* fall through */
    case 0x00:
        for (; n >= 2; n -= 2)
        {
            _mm_store_pd(p, _mm_add_pd(_mm_load_pd(p),
                _mm_mul_pd(_mm_loadu_pd(s), v2)));
            p += 2; s += 2;
        }
        break;
    default:
        for (; n >= 2; n-=2)
        {
            _mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p),
                _mm_mul_pd(_mm_loadu_pd(s), v2)));
            p += 2; s += 2;
        }
    }
#endif

    for (; n > 0; n--) (*p++) += (*s++) * v;
    return p;
}
/* Computes C = A * A^T for an m x n column-major matrix A (C is m x m).
 * B holds the transpose of A so that both operands of every dot product are
 * read contiguously: B[k + i*n] == A[i + k*m]. The data is double precision
 * throughout, to match the _pd intrinsics used below. */
void sgemm( int m, int n, double *A, double *C )
{
    int i, j, k, itn, jtn, ceiling;
    double buf[2];
    __m128d sum, ab, cd, ef, AB, CD, EF;
    double B[n * m];

    transpose(m, n, A, B);

    for (i = 0; i < m; i += 1) {
        itn = i * n;
        for (j = 0; j < m; j += 1) {
            jtn = j * n;
            sum = _mm_setzero_pd();
            /* unroll by 6 over k: three 2-wide multiply-accumulates */
            for (k = 0, ceiling = n - 5; k < ceiling; k += 6) {
                ab = _mm_loadu_pd(B + k     + itn);
                cd = _mm_loadu_pd(B + k + 2 + itn);
                ef = _mm_loadu_pd(B + k + 4 + itn);
                AB = _mm_loadu_pd(B + k     + jtn);
                CD = _mm_loadu_pd(B + k + 2 + jtn);
                EF = _mm_loadu_pd(B + k + 4 + jtn);
                sum = _mm_add_pd(sum, _mm_mul_pd(ab, AB));
                sum = _mm_add_pd(sum, _mm_mul_pd(cd, CD));
                sum = _mm_add_pd(sum, _mm_mul_pd(ef, EF));
            }
            _mm_storeu_pd(buf, sum);
            C[i + j * m] = buf[0] + buf[1];
            /* handle the remaining n % 6 elements */
            for ( ; k < n; k += 1) {
                C[i + j * m] += A[i + k * m] * B[k + jtn];
            }
        }
    }
}
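/* transpose() is not included in this excerpt. sgemm above only relies on the
 * contract B[k + i*n] == A[i + k*m], so an assumed helper along these lines
 * (a sketch, not the original implementation) is enough to make it run: */
void transpose(int m, int n, double *A, double *B)
{
    for (int i = 0; i < m; i++)
        for (int k = 0; k < n; k++)
            B[k + i * n] = A[i + k * m];
}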
static void
filterButter(const Float_t* input, Float_t* output, size_t nSamples, const Float_t* kernel)
{
#ifdef HAVE_SSE2
    __m128d __kernel, __result, __temp;
    __declspec(align(16)) Float_t __temp2[2];

    while (nSamples--) {
        __kernel = _mm_loadr_pd(&kernel[0]);
        __temp   = _mm_loadu_pd(&input[-1]);
        __result = _mm_mul_pd(__temp, __kernel);

        __kernel = _mm_loadr_pd(&kernel[4]);
        __temp   = _mm_loadu_pd(&output[-2]);
        __temp   = _mm_mul_pd(__kernel, __temp);

        __result = _mm_sub_pd(__result, __temp);
        _mm_store_pd(__temp2, __result);

        *output = __temp2[0] + __temp2[1] + input[-2] * kernel[2];
        ++output;
        ++input;
    }
#else
    while (nSamples--) {
        *output =   input [0] * kernel[0]
                  - output[-1] * kernel[1]
                  + input [-1] * kernel[2]
                  - output[-2] * kernel[3]
                  + input [-2] * kernel[4];
        ++output;
        ++input;
    }
#endif
}
/* use compiler intrinsics for 2x parallel processing */
static inline double chi2_intrinsic_double(int n, const double* x, const double* y) {
    double result = 0;
    const __m128d eps  = _mm_set1_pd(DBL_MIN);
    const __m128d zero = _mm_setzero_pd();
    __m128d chi2 = _mm_setzero_pd();

    for ( ; n > 1; n -= 2) {
        const __m128d a = _mm_loadu_pd(x);
        const __m128d b = _mm_loadu_pd(y);
        x += 2;
        y += 2;
        const __m128d a_plus_b          = _mm_add_pd(a, b);
        const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b, eps);
        const __m128d a_minus_b         = _mm_sub_pd(a, b);
        const __m128d a_minus_b_sq      = _mm_mul_pd(a_minus_b, a_minus_b);
        const __m128d quotient          = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_pd(chi2, quotient);
    }
    const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0, 1));
    const __m128d sum = _mm_add_pd(chi2, shuffle);
    // with SSE3, we could use hadd_pd, but the difference is negligible
    _mm_store_sd(&result, sum);
    _mm_empty();
    if (n)
        result += chi2_baseline_double(n, x, y);  // remaining entries
    return result;
}
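/* chi2_baseline_double is referenced above but not defined in this excerpt.
 * The sketch below is an assumed scalar equivalent, applying the same formula
 * as the SIMD loop (sum of (x-y)^2 / (x+y+DBL_MIN)) to the leftover entries. */
#include <float.h>

static inline double chi2_baseline_double(int n, const double* x, const double* y)
{
    double result = 0.0;
    for (int i = 0; i < n; i++) {
        const double diff = x[i] - y[i];
        result += diff * diff / (x[i] + y[i] + DBL_MIN);
    }
    return result;
}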
static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum[2], t;
  const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_pd ();

  for (; i < len; i += 4) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
    t = _mm_loadu_pd (a + i + 2);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
  }
  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
double vector_ps_double (const double* pa, const double* pb, size_t n) {
    size_t k;
    /* process the elements 4 at a time */
    size_t q = n / 4;
    size_t r = n % 4;
    double w;
    _mm_prefetch (pa, _MM_HINT_NTA);
    _mm_prefetch (pb, _MM_HINT_NTA);
    if (q > 0) {
        __m128d acc1 = _mm_setzero_pd();
        __m128d acc2 = _mm_setzero_pd();
        if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
            for (k = 0; k < q; k++) {
                /* load 2 doubles from each array */
                __m128d i1 = _mm_load_pd(pa);
                __m128d j1 = _mm_load_pd(pb);
                __m128d i2 = _mm_load_pd(pa+2);
                __m128d j2 = _mm_load_pd(pb+2);
                /* advance by 4 doubles in total (2 for i and 2 for j) */
                /* multiply */
                __m128d s1 = _mm_mul_pd(i1, j1);
                __m128d s2 = _mm_mul_pd(i2, j2);
                pa += 4;
                pb += 4;
                /* accumulate */
                acc1 = _mm_add_pd(acc1, s1);
                acc2 = _mm_add_pd(acc2, s2);
            }
        }
        else {
            for (k = 0; k < q; k++) {
                /* load 2 doubles from each array (unaligned) */
                __m128d i1 = _mm_loadu_pd(pa);
                __m128d j1 = _mm_loadu_pd(pb);
                __m128d i2 = _mm_loadu_pd(pa+2);
                __m128d j2 = _mm_loadu_pd(pb+2);
                /* multiply */
                __m128d s1 = _mm_mul_pd(i1, j1);
                __m128d s2 = _mm_mul_pd(i2, j2);
                pa += 4;
                pb += 4;
                /* accumulate */
                acc1 = _mm_add_pd(acc1, s1);
                acc2 = _mm_add_pd(acc2, s2);
            }
        }
        /* final horizontal sum */
        acc1 = _mm_add_pd(acc1, acc2);
        acc1 = _mm_hadd_pd(acc1, acc1);
        _mm_store_sd(&w, acc1);
    }
    else {
        w = 0;
    }
    for (k = 0; k < r; k++)
        w += (*pa++) * (*pb++);
    return w;
}
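/* ALGEBRA_IS_ALIGNED is not defined in this excerpt; vector_ps_double above
 * only uses it to decide between _mm_load_pd and _mm_loadu_pd, i.e. to test
 * 16-byte alignment. A definition along these lines would be sufficient (an
 * assumption for illustration, not the library's actual macro): */
#include <stdint.h>
#define ALGEBRA_IS_ALIGNED(p) ((((uintptr_t)(p)) & 15) == 0)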
value complex_add(value vx, value vy)
{
  CAMLparam2(vx, vy);
  CAMLlocal1(vz);
  vz = caml_alloc(2 * Double_wosize, Double_array_tag);
  _mm_storeu_pd((double*) vz,
                _mm_loadu_pd((double const*) vx) + _mm_loadu_pd((double const*) vy));
  CAMLreturn(vz);
}
void
mlib_FIR_tap2f_d64s(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
    mlib_s32 j;
    mlib_d64 src1_1, src2_1;
    mlib_d64 src1_2, src2_2;
    mlib_d64 dflt1 = pflt[0], dflt2 = pflt[1];
    __m128d sdflt1, sdflt2, ssrc1, ssrc2, smul1, smul2;

    sdflt2 = _mm_set1_pd(dflt2);
    sdflt1 = _mm_set1_pd(dflt1);

    if ((mlib_addr)psrc & 15) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
        for (j = 0; j < n; j++) {
            ssrc1 = _mm_loadu_pd(psrc);
            ssrc2 = _mm_loadu_pd(psrc + 2);
            smul1 = _mm_mul_pd(sdflt2, ssrc1);
            smul2 = _mm_mul_pd(sdflt1, ssrc2);
            smul1 = _mm_add_pd(smul1, smul2);
            _mm_storeu_pd(pdst, smul1);
            psrc += 2;
            pdst += 2;
        }
    } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
        for (j = 0; j < n; j++) {
            ssrc1 = _mm_load_pd(psrc);
            ssrc2 = _mm_load_pd(psrc + 2);
            smul1 = _mm_mul_pd(sdflt2, ssrc1);
            smul2 = _mm_mul_pd(sdflt1, ssrc2);
            smul1 = _mm_add_pd(smul1, smul2);
            _mm_storeu_pd(pdst, smul1);
            psrc += 2;
            pdst += 2;
        }
    }
}
static void
sse3_test_haddpd (double *i1, double *i2, double *r)
{
  __m128d t1 = _mm_loadu_pd (i1);
  __m128d t2 = _mm_loadu_pd (i2);

  t1 = _mm_hadd_pd (t1, t2);

  _mm_storeu_pd (r, t1);
}
/* Use SSE2 intrinsics to implement the matrix multiply C += A * B
 * (all matrices are lda x lda, stored column-major). */
void square_dgemm(int lda, double* A, double* B, double* C)
{
    register __m128d cTmp, aTmp, bTmp;

    for (int j = 0; j < lda; j++) {
        for (int k = 0; k < lda; k++) {
            // broadcast B[k + j*lda] into both lanes of bTmp
            bTmp = _mm_load1_pd(B + k + j*lda);
            double* adda_mid = A + k*lda;
            double* addc_mid = C + j*lda;

            // main loop: 8 rows of C per iteration, unrolled into 4 SSE updates
            for (int i = 0; i < lda/8*8; i += 8) {
                double* adda = adda_mid + i;
                double* addc = addc_mid + i;

                aTmp = _mm_loadu_pd(adda);
                cTmp = _mm_loadu_pd(addc);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd(addc, cTmp);

                aTmp = _mm_loadu_pd(adda + 2);
                cTmp = _mm_loadu_pd(addc + 2);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd(addc + 2, cTmp);

                aTmp = _mm_loadu_pd(adda + 4);
                cTmp = _mm_loadu_pd(addc + 4);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd(addc + 4, cTmp);

                aTmp = _mm_loadu_pd(adda + 6);
                cTmp = _mm_loadu_pd(addc + 6);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd(addc + 6, cTmp);
            }
            // handle remaining pairs of rows
            for (int i = lda/8*8; i < lda/2*2; i += 2) {
                double* adda = adda_mid + i;
                double* addc = addc_mid + i;
                aTmp = _mm_loadu_pd(adda);
                cTmp = _mm_loadu_pd(addc);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd(addc, cTmp);
            }
            // last row if lda is odd
            for (int i = lda/2*2; i < lda; i++) {
                C[i + j*lda] += A[i + k*lda] * B[k + j*lda];
            }
        }
    }
}
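/* A hypothetical check for square_dgemm above, not taken from the original
 * source: it compares the SSE version against a naive triple loop on a small
 * matrix whose size is deliberately not a multiple of 8, so the unrolled loop,
 * the pair loop and the scalar tail are all exercised. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const int n = 11;
    double *A   = malloc(n * n * sizeof *A);
    double *B   = malloc(n * n * sizeof *B);
    double *C   = calloc(n * n, sizeof *C);
    double *ref = calloc(n * n, sizeof *ref);

    for (int i = 0; i < n * n; i++) {
        A[i] = (double)(i % 7) - 3.0;
        B[i] = (double)(i % 5) * 0.5;
    }

    square_dgemm(n, A, B, C);

    /* reference: column-major C += A * B */
    for (int j = 0; j < n; j++)
        for (int k = 0; k < n; k++)
            for (int i = 0; i < n; i++)
                ref[i + j * n] += A[i + k * n] * B[k + j * n];

    for (int i = 0; i < n * n; i++) {
        if (fabs(C[i] - ref[i]) > 1e-12) {
            printf("mismatch at %d: %g vs %g\n", i, C[i], ref[i]);
            return 1;
        }
    }
    printf("ok\n");
    free(A); free(B); free(C); free(ref);
    return 0;
}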
static void
sse3_test_movddup_reg (double *i1, double *r)
{
  __m128d t1 = _mm_loadu_pd (i1);
  __m128d t2 = _mm_loadu_pd (&cnst1[0]);

  t1 = _mm_mul_pd (t1, t2);
  t2 = _mm_movedup_pd (t1);

  _mm_storeu_pd (r, t2);
}
value complex_mul(value vab, value vcd)
{
  CAMLparam2(vab, vcd);
  CAMLlocal1(vz);
  vz = caml_alloc(2 * Double_wosize, Double_array_tag);

  __m128d ab, cd, dc, a_a, b_b, ac_ad, bd_bc;
  ab  = _mm_loadu_pd((double const*) vab);   /* (a, b) */
  cd  = _mm_loadu_pd((double const*) vcd);   /* (c, d) */
  a_a = _mm_unpacklo_pd(ab, ab);             /* (a, a) */
  b_b = _mm_unpackhi_pd(ab, ab);             /* (b, b) */
  dc  = _mm_shuffle_pd(cd, cd, 1);           /* (d, c) */
  ac_ad = _mm_mul_pd(a_a, cd);               /* (a*c, a*d) */
  bd_bc = _mm_mul_pd(b_b, dc);               /* (b*d, b*c) */
  /* (a*c - b*d, a*d + b*c) */
  _mm_storeu_pd((double*) vz, _mm_addsub_pd(ac_ad, bd_bc));
  CAMLreturn(vz);
}
void transpose_misaligned(double *a, double *b, int N1, int N2, double factor)
{
    int i, j, k, k1, it, jt, itt, jtt, it_bound, jt_bound, itt_bound, jtt_bound;
    int conflict, tmp, tmpN, offset, line_offset, setnum, set[8192/(4*sizeof(double))];
    double *pA, *pB;
    register __m128d x, y, z, w, t, t1, fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector, fac_vector);

    itt_bound = (N1/tilesize)*tilesize;
    for (itt = 0; itt < itt_bound; itt = itt+5*tilesize) {
        jtt_bound = (N2/tilesize)*tilesize;
        for (jtt = 0; jtt < jtt_bound; jtt = jtt+5*tilesize) {
            it_bound = (itt+5*tilesize > itt_bound) ? itt_bound : itt+5*tilesize;
            for (it = itt; it < it_bound; it = it+tilesize) {
                jt_bound = (jtt+5*tilesize > jtt_bound) ? jtt_bound : jtt+5*tilesize;
                for (jt = jtt; jt < jt_bound; jt = jt+tilesize) {
                    k = 0;
                    for (j = jt; j < jt+tilesize; j = j+2) {
                        for (i = it; i < it+tilesize; i = i+2) {
                            pA = a + i*N2 + j;
                            pB = b + j*N1 + i;
                            x = _mm_loadu_pd(pA);
                            x = _mm_mul_pd(x, fac_vector);
                            y = _mm_loadu_pd(pA + N2);
                            y = _mm_mul_pd(y, fac_vector);
                            z = _mm_shuffle_pd(x, y, 0);
                            w = _mm_shuffle_pd(x, y, 3);
                            _mm_storeu_pd(pB, z);
                            _mm_storeu_pd(pB + N1, w);
                        }
                    }
                }
            }
        }
        for (i = itt; i < itt+5*tilesize && i < itt_bound; i++) {
            for (j = jtt_bound; j < N2; j++) {
                b[j*N1+i] = factor * a[i*N2+j];
            }
        }
    }
    for (i = itt_bound; i < N1; i++) {
        for (j = 0; j < N2; j++) {
            b[j*N1+i] = factor * a[i*N2+j];
        }
    }
}
static double* copy_block(int lda, int M, int N, double* A, double* new_A)
{
    int M_even = turn_even(M);
    int N_even = turn_even(N);
    int i_step;
    __m128d a;

    for (int j = 0; j < N; j++) {
        for (int i = 0; i < M; i += I_STRIDE) {
            i_step = min(I_STRIDE, M-i);
            if (i_step == 1) {
                new_A[i+j*M_even] = A[i+j*lda];
            } else {
                a = _mm_loadu_pd(A+i+j*lda);
                _mm_store_pd(new_A+i+j*M_even, a);
            }
        }
    }
    if (N % 2) {
        for (int i = 0; i < M_even; i++) {
            new_A[i+(N_even-1)*M_even] = 0.0;
        }
    }
    return new_A;
}
static void
TEST (void)
{
  union128d s1;
  union128 u, s2;
  double source1[2] = {123.345, 67.3321};
  float e[4] = {5633.098, 93.21, 3.34, 4555.2};

  s1.x = _mm_loadu_pd (source1);
  s2.x = _mm_loadu_ps (e);
  __asm("" : "+v"(s1.x), "+v"(s2.x));
  u.x = test(s2.x, s1.x);

  e[0] = (float)source1[0];

  if (check_union128(u, e))
#if DEBUG
    {
      printf ("sse2_test_cvtsd2ss_1; check_union128 failed\n");
      printf ("\t [%f,%f,%f,%f],[%f,%f]\n", s2.a[0], s2.a[1], s2.a[2], s2.a[3],
              s1.a[0], s1.a[1]);
      printf ("\t -> \t[%f,%f,%f,%f]\n", u.a[0], u.a[1], u.a[2], u.a[3]);
      printf ("\texpect\t[%f,%f,%f,%f]\n", e[0], e[1], e[2], e[3]);
    }
#else
    abort ();
#endif
}
static void
clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  __m128d xmm1;
  double max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  xmm1 = _mm_set1_pd(max);
  for (; n >= 2; n -= 2) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_min_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
  }
  for (; n > 0; n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
void trigo_vsin_vml_sse2(double* dst, const double* src, size_t length)
{
  size_t i = length;
  while (i) {
    // scalar step: dst is not yet 16-byte aligned, or a single element remains
    if (!SimdUtils::isAligned(dst, 16) || i == 1) {
      __m128d d = _mm_load_sd(src);
      _mm_store_sd(dst, sin_vml_pd(d));

      dst++;
      src++;

      if (--i == 0)
        break;
    }

    // vector loop: dst is now aligned, process 2 doubles per iteration
    while (i >= 2) {
      __m128d d = _mm_loadu_pd(src);
      _mm_store_pd(dst, sin_vml_pd(d));

      dst += 2;
      src += 2;
      i -= 2;
    }
  }
}
__m128d test_mm_loadu_pd(double const* A) {
  // DAG-LABEL: test_mm_loadu_pd
  // DAG: load <2 x double>, <2 x double>* %{{.*}}, align 1
  //
  // ASM-LABEL: test_mm_loadu_pd
  // ASM: movupd
  return _mm_loadu_pd(A);
}
static void
sse3_test_movddup_reg_subsume_unaligned (double *i1, double *r)
{
  __m128d t1 = _mm_loadu_pd (i1);
  __m128d t2 = _mm_movedup_pd (t1);
  _mm_storeu_pd (r, t2);
}
/* This function mallocs new aligned memory space for a matrix and
 * then copies the original values into it. The new matrix's size is
 * a multiple of 8, which makes it easier to handle the boundary.
 * The new matrix's layout is like this:
 * [[C O],
 *  [O O]]
 */
double* matrix_padding(double* old_matrix, int old_size, int new_size)
{
    double* new_matrix;

    /* Allocate aligned space according to the new size */
    posix_memalign((void**)&new_matrix, 16, sizeof(double)*new_size*new_size);

    /* Copy data.
     * Handle odd/even old size separately to avoid if-branches in
     * any loops. */
    if (old_size % 2 == 1) {
        for (int i = 0; i < old_size; i++) {
            for (int j = 0; j < old_size - 1; j += 2) {
                __m128d v1 = _mm_loadu_pd(old_matrix + i*old_size + j);
                _mm_store_pd(new_matrix + i*new_size + j, v1);
            }
            new_matrix[i*new_size+old_size-1] = old_matrix[(i+1)*old_size-1];
            for (int j = old_size; j < new_size; j++) {
                new_matrix[i*new_size + j] = 0;
            }
        }
    } else {
        for (int i = 0; i < old_size; i++) {
            for (int j = 0; j < old_size; j += 2) {
                __m128d v1 = _mm_loadu_pd(old_matrix + i*old_size + j);
                _mm_store_pd(new_matrix + i*new_size + j, v1);
            }
            for (int j = old_size; j < new_size; j++) {
                new_matrix[i*new_size + j] = 0;
            }
        }
    }

    /* Set the extra rows to ZERO; new_size is a multiple of 8, so each row
     * can be cleared 8 doubles (4 stores) at a time without overrunning it. */
    __m128d v_zero = _mm_setzero_pd();
    for (int i = old_size; i < new_size; i++) {
        double* addr = new_matrix + i * new_size;
        for (int j = 0; j < new_size; j += 8) {
            _mm_store_pd(addr+j,   v_zero);
            _mm_store_pd(addr+j+2, v_zero);
            _mm_store_pd(addr+j+4, v_zero);
            _mm_store_pd(addr+j+6, v_zero);
        }
    }
    return new_matrix;
}
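/* Illustrative usage of matrix_padding above (an assumed example, not from the
 * original source): pad a 3x3 matrix into an aligned 8x8 buffer, print it, and
 * release it with free(), which matches the posix_memalign allocation. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    double m[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};   /* 3x3 input matrix */
    double *padded = matrix_padding(m, 3, 8);

    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            printf("%4g ", padded[i * 8 + j]);
        printf("\n");
    }
    free(padded);
    return 0;
}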
ALGEBRA_INLINE void vector_addm_double_aligned_32 (double* v1, double lambda, const double* v2, size_t n)
{
    size_t k;

    __m128d l1 = _mm_load1_pd(&lambda);

    size_t q = n / 2;
    size_t r = n % 2;
    if (q > 0) {
        if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
            for (k = 0; k < q; k++) {
                /* load 2 values from each array */
                __m128d i1 = _mm_load_pd(v1);
                __m128d j1 = _mm_load_pd(v2);
                /* multiply */
                j1 = _mm_mul_pd(j1, l1);
                /* add */
                i1 = _mm_add_pd(i1, j1);
                /* store */
                _mm_store_pd(v1, i1);
                v1 += 2;
                v2 += 2;
            }
        }
        else {
            for (k = 0; k < q; k++) {
                /* load 2 values from each array */
                __m128d i1 = _mm_loadu_pd(v1);
                __m128d j1 = _mm_loadu_pd(v2);
                /* multiply */
                j1 = _mm_mul_pd(j1, l1);
                /* add */
                i1 = _mm_add_pd(i1, j1);
                /* store */
                _mm_storeu_pd(v1, i1);
                v1 += 2;
                v2 += 2;
            }
        }
    }
    for (k = 0; k < r; k++)
        v1[k] += lambda * v2[k];
}
SSE_FUNCTION static void
add_f64_sse2 (double *dest, double *src1, double *src2, int n)
{
  __m128d xmm0, xmm1;
  while (((long)dest & 15) && (0 < n)) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
  while (1 < n) {
    xmm0 = _mm_loadu_pd(src1);
    xmm1 = _mm_loadu_pd(src2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
    src2 += 2;
    n -= 2;
  }
  while (0 < n) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
}
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i;
  __m128d f[2], sum[4], t;
  const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride),
    (gdouble *) ((gint8 *) b + 2 * bstride),
    (gdouble *) ((gint8 *) b + 3 * bstride)
  };

  f[0] = _mm_loadu_pd (icoeff + 0);
  f[1] = _mm_loadu_pd (icoeff + 2);
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();

  for (i = 0; i < len; i += 2) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
  }
  sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
  sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
  sum[2] = _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
  sum[3] = _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[2] = _mm_add_pd (sum[2], sum[3]);
  sum[0] = _mm_add_pd (sum[0], sum[2]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum = _mm_setzero_pd ();

  for (; i < len; i += 8) {
    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
            _mm_load_pd (b + i + 0)));
    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
            _mm_load_pd (b + i + 2)));
    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
            _mm_load_pd (b + i + 4)));
    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
            _mm_load_pd (b + i + 6)));
  }
  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
  _mm_store_sd (o, sum);
}
void computeDensitySSE(const double * const currentCell, double *density)
{
    __m128d vsum = _mm_set1_pd(0.0);
    int i;

    for (i = 0; i < PARAMQ - 1; i += 2)
    {
        __m128d v = _mm_loadu_pd(&currentCell[i]);
        vsum = _mm_add_pd(vsum, v);
    }
    vsum = _mm_hadd_pd(vsum, vsum);
    _mm_storeh_pd(density, vsum);

    if (i < PARAMQ)
        *density += currentCell[i];
}
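/* PARAMQ is defined elsewhere in the original project (19 would be a typical
 * value for a D3Q19 lattice). The scalar routine below is an assumed reference
 * implementation, useful for checking computeDensitySSE above. */
void computeDensity(const double * const currentCell, double *density)
{
    *density = 0.0;
    for (int i = 0; i < PARAMQ; i++)
        *density += currentCell[i];
}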
static void
TEST (void)
{
  union128d s1;
  union128 u, s2;
  double source1[2] = {123.345, 67.3321};
  float e[4] = {5633.098, 93.21, 3.34, 4555.2};

  s1.x = _mm_loadu_pd (source1);
  s2.x = _mm_loadu_ps (e);
  u.x = test(s2.x, s1.x);

  e[0] = (float)source1[0];

  if (check_union128(u, e))
    abort ();
}
void computeVelocitySSE(const double * const currentCell, const double * const density, double *velocity)
{
    __m128d v0, v1, v2;
    int i;
    v0 = v1 = v2 = _mm_setzero_pd();

    for (i = 0; i < PARAMQ - 1; i += 2)
    {
        __m128d vc, vl0, vl1, vl2;
        __m128i vtemp;
        vc = _mm_loadu_pd(&currentCell[i]);
        vtemp = _mm_loadu_si128((__m128i *)&LATTICEVELOCITIES2[0][i]);
        vl0 = _mm_cvtepi32_pd(vtemp);
        vtemp = _mm_loadu_si128((__m128i *)&LATTICEVELOCITIES2[1][i]);
        vl1 = _mm_cvtepi32_pd(vtemp);
        vtemp = _mm_loadu_si128((__m128i *)&LATTICEVELOCITIES2[2][i]);
        vl2 = _mm_cvtepi32_pd(vtemp);
        v0 = _mm_add_pd(v0, _mm_mul_pd(vc, vl0));
        v1 = _mm_add_pd(v1, _mm_mul_pd(vc, vl1));
        v2 = _mm_add_pd(v2, _mm_mul_pd(vc, vl2));
    }
    v0 = _mm_hadd_pd(v0, v0);
    v1 = _mm_hadd_pd(v1, v1);
    v2 = _mm_hadd_pd(v2, v2);
    _mm_store_sd(&velocity[0], v0);
    _mm_store_sd(&velocity[1], v1);
    _mm_store_sd(&velocity[2], v2);

    if (i < PARAMQ)
    {
        velocity[0] += currentCell[i] * LATTICEVELOCITIES2[0][i];
        velocity[1] += currentCell[i] * LATTICEVELOCITIES2[1][i];
        velocity[2] += currentCell[i] * LATTICEVELOCITIES2[2][i];
    }

    velocity[0] = velocity[0] / (*density);
    velocity[1] = velocity[1] / (*density);
    velocity[2] = velocity[2] / (*density);
}
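/* Assumed scalar counterpart of computeVelocitySSE above, provided only for
 * reference and testing; PARAMQ and LATTICEVELOCITIES2 are defined elsewhere
 * in the original project. */
void computeVelocity(const double * const currentCell, const double * const density,
                     double *velocity)
{
    velocity[0] = velocity[1] = velocity[2] = 0.0;
    for (int i = 0; i < PARAMQ; i++) {
        velocity[0] += currentCell[i] * LATTICEVELOCITIES2[0][i];
        velocity[1] += currentCell[i] * LATTICEVELOCITIES2[1][i];
        velocity[2] += currentCell[i] * LATTICEVELOCITIES2[2][i];
    }
    velocity[0] /= *density;
    velocity[1] /= *density;
    velocity[2] /= *density;
}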