static void
scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_pd1(val);
  for (; n >= 4; n -= 4) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1 + 2);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest + 2, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
void CAllPassFilterPair::processBlock(double* data, int numSamples)
{
    jassert((((size_t) data) & 0xF) == 0);
    jassert((_mm_getcsr() & 0x8040) == 0x8040);

    __m128d coeff = _mm_load_pd(md.getPtr(0));
    __m128d x1 = _mm_load_pd(md.getPtr(1));
    __m128d x2 = _mm_load_pd(md.getPtr(2));
    __m128d y1 = _mm_load_pd(md.getPtr(3));
    __m128d y2 = _mm_load_pd(md.getPtr(4));

    for (int i=0; i<numSamples; ++i)
    {
        __m128d x0 = _mm_load_pd(&(data[i+i]));
        __m128d tmp = _mm_sub_pd(x0, y2);
        tmp = _mm_mul_pd(tmp, coeff);
        __m128d y0 = _mm_add_pd(x2, tmp);
        _mm_store_pd(&(data[i+i]), y0);

        x2=x1; x1=x0;
        y2=y1; y1=y0;
    }

    _mm_store_pd(md.getPtr(1), x1);
    _mm_store_pd(md.getPtr(2), x2);
    _mm_store_pd(md.getPtr(3), y1);
    _mm_store_pd(md.getPtr(4), y2);
}
// multiply *p by v, applied to all n elements
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)
	const __m256d v4 = _mm256_set1_pd(v);

	switch ((size_t)p & 0x1F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x10:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x18:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p),
				_mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
		break;
	default:
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p),
				_mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)
	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 2; n-=2, p+=2)
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2));
		break;
	default:
		for (; n >= 2; n-=2, p+=2)
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2));
	}

#endif
	for (; n > 0; n--) (*p++) *= v;
}
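/*
 * Usage sketch (not part of the original library): calls vec_f64_mul from an
 * aligned and then a deliberately misaligned starting address and checks the
 * result against scalar arithmetic.  The fall-through switch above exists so
 * that both entry points end up on an aligned-store fast path.  Assumes
 * vec_f64_mul is declared as above and posix_memalign is available.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	double *buf;
	size_t i;

	/* 32-byte alignment so the AVX branch, if compiled in, is exercised too */
	if (posix_memalign((void**)&buf, 32, 37 * sizeof(double)) != 0) return 1;
	for (i = 0; i < 37; i++) buf[i] = (double)i;

	vec_f64_mul(buf, 37, 2.5);          /* aligned start    */
	vec_f64_mul(buf + 1, 36, 4.0);      /* misaligned start */

	for (i = 0; i < 37; i++)
	{
		double expect = (double)i * 2.5 * (i >= 1 ? 4.0 : 1.0);
		if (buf[i] != expect) { printf("mismatch at %u\n", (unsigned)i); return 1; }
	}
	printf("ok\n");
	free(buf);
	return 0;
}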
static void *
add_sse2_unroll(void *ptr)
{
    size_t  n;
    char   *p1, *p2, *p3;
    struct narray3 *a = (struct narray3*)ptr;
    dtype   x, y;
    size_t  i, n4;
    ssize_t s1, s2, s3;
    __m128d d1, d2, d3;
    __m128d e1, e2, e3;
    char   *q1, *q2, *q3;

    p1 = a->x->ptr;
    p2 = a->y->ptr;
    p3 = a->z->ptr;
    n  = a->x->size;
    s1 = s2 = s3 = sizeof(dtype);

    /* second (q) stream starts two elements in */
    q1 = p1 + s1*2;
    q2 = p2 + s2*2;
    q3 = p3 + s3*2;

    /* each stream advances four elements per iteration */
    s1 = s2 = s3 = sizeof(dtype)*4;
    n4 = 3;
    n4 = (n & ~n4) - 2;

    /* software-pipelined: the q stream is loaded one iteration ahead */
    e1 = _mm_load_pd((dtype*)q1); q1+=s1;
    e2 = _mm_load_pd((dtype*)q2); q2+=s2;
    e3 = _mm_add_pd(e1,e2);
    for (i=2; i<n4; i+=4) {
        d1 = _mm_load_pd((dtype*)p1); p1+=s1;
        d2 = _mm_load_pd((dtype*)p2); p2+=s2;
        d3 = _mm_add_pd(d1,d2);
        _mm_store_pd((dtype*)q3,e3); q3+=s3;
        e1 = _mm_load_pd((dtype*)q1); q1+=s1;
        e2 = _mm_load_pd((dtype*)q2); q2+=s2;
        e3 = _mm_add_pd(e1,e2);
        _mm_store_pd((dtype*)p3,d3); p3+=s3;
    }
    _mm_store_pd((dtype*)q3,e3);

    /* scalar remainder: step one element at a time */
    s1 = s2 = s3 = sizeof(dtype);
    for (; i<n; i++) {
        x = *(dtype*)p1; p1+=s1;
        y = *(dtype*)p2; p2+=s2;
        x = x+y;
        *(dtype*)p3 = x; p3+=s3;
    }
    return 0;
}
int fft4a_(double *a, double *b, double *w, int *l)
{
    int j, j0, j1, j2, j3, j4, j5, j6, j7;
    /* double x0, y0, x1, y1, x2, y2, x3, y3, wi1, wi2, wi3, wr1, wr2, wr3; */
    __m128d t0, t1, t2, t3, t4, w1, w2, w3;

    for (j = 0; j < *l; j++) {
        j0 = j << 1;
        j1 = j0 + (*l << 1);
        j2 = j1 + (*l << 1);
        j3 = j2 + (*l << 1);
        j4 = j << 3;
        j5 = j4 + 2;
        j6 = j5 + 2;
        j7 = j6 + 2;
        /* wr1 = w[j0];
           wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1;
           wi2 = wr1 * wi1 + wr1 * wi1;
           wr3 = wr1 * wr2 - wi1 * wi2;
           wi3 = wr1 * wi2 + wi1 * wr2; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        /* x0 = a[j0] + a[j2];
           y0 = a[j0 + 1] + a[j2 + 1];
           x1 = a[j0] - a[j2];
           y1 = a[j0 + 1] - a[j2 + 1];
           x2 = a[j1] + a[j3];
           y2 = a[j1 + 1] + a[j3 + 1];
           x3 = a[j1 + 1] - a[j3 + 1];
           y3 = a[j3] - a[j1]; */
        t0 = _mm_load_pd(&a[j0]);
        t2 = _mm_load_pd(&a[j2]);
        t1 = _mm_sub_pd(t0, t2);
        t0 = _mm_add_pd(t0, t2);
        t3 = _mm_load_pd(&a[j1]);
        t4 = _mm_load_pd(&a[j3]);
        t2 = _mm_add_pd(t3, t4);
        t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
        t3 = _mm_shuffle_pd(t3, t3, 1);
        /* b[j4] = x0 + x2;
           b[j4 + 1] = y0 + y2;
           b[j6] = wr2 * (x0 - x2) - wi2 * (y0 - y2);
           b[j6 + 1] = wr2 * (y0 - y2) + wi2 * (x0 - x2);
           b[j5] = wr1 * (x1 + x3) - wi1 * (y1 + y3);
           b[j5 + 1] = wr1 * (y1 + y3) + wi1 * (x1 + x3);
           b[j7] = wr3 * (x1 - x3) - wi3 * (y1 - y3);
           b[j7 + 1] = wr3 * (y1 - y3) + wi3 * (x1 - x3); */
        _mm_store_pd(&b[j4], _mm_add_pd(t0, t2));
        _mm_store_pd(&b[j6], ZMUL(w2, _mm_sub_pd(t0, t2)));
        _mm_store_pd(&b[j5], ZMUL(w1, _mm_add_pd(t1, t3)));
        _mm_store_pd(&b[j7], ZMUL(w3, _mm_sub_pd(t1, t3)));
    }
    return 0;
}
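/*
 * ZMUL in the FFT kernels above is an external macro that is not shown in
 * these snippets.  The following is only a plausible SSE2 sketch of such a
 * complex multiply (low lane = real part, high lane = imaginary part, the
 * layout implied by the scalar comments); it is an assumption, not the
 * original definition.
 */
#include <emmintrin.h>

static inline __m128d zmul_sse2(__m128d x, __m128d y)
{
    __m128d xr    = _mm_unpacklo_pd(x, x);         /* (xr, xr)        */
    __m128d xi    = _mm_unpackhi_pd(x, x);         /* (xi, xi)        */
    __m128d yswap = _mm_shuffle_pd(y, y, 1);       /* (yi, yr)        */
    __m128d t1    = _mm_mul_pd(xr, y);             /* (xr*yr, xr*yi)  */
    __m128d t2    = _mm_mul_pd(xi, yswap);         /* (xi*yi, xi*yr)  */
    t2 = _mm_xor_pd(t2, _mm_set_sd(-0.0));         /* (-xi*yi, xi*yr) */
    return _mm_add_pd(t1, t2);                     /* (xr*yr - xi*yi, xr*yi + xi*yr) */
}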
/* This routine performs a dgemm operation
 *  C := C + A * B
 * where A, B, and C are lda-by-lda matrices stored in row-major order
 * On exit, A and B maintain their input values. */
void square_dgemm (int lda, double* A, double* B, double* C, int block_size)
{
  /* Do matrix padding first. */
  int step_size = UNROLLING_SIZE * 2;
  int new_size = lda + step_size - lda % step_size;
  double* old_C = C;
  int old_size = lda;
  A = matrix_padding(A, lda, new_size);
  B = matrix_padding(B, lda, new_size);

  // We don't need to copy data from old C to new C,
  // so we handle it separately here.
  posix_memalign((void**)&C, 16, sizeof(double)*new_size*new_size);
  // The zero-fill below writes 10 doubles per iteration; it assumes
  // new_size*new_size is a multiple of 10.
  __m128d v_zero = _mm_setzero_pd();
  for(int i=0; i<new_size*new_size; i+=10) {
    _mm_store_pd(C+i,   v_zero);
    _mm_store_pd(C+i+2, v_zero);
    _mm_store_pd(C+i+4, v_zero);
    _mm_store_pd(C+i+6, v_zero);
    _mm_store_pd(C+i+8, v_zero);
  }
  lda = new_size;

#ifdef TRANSPOSE
  for (int i = 0; i < lda; ++i)
    for (int j = i+1; j < lda; ++j) {
      double t = B[i*lda+j];
      B[i*lda+j] = B[j*lda+i];
      B[j*lda+i] = t;
    }
#endif

  /* For each L1-block-row of A */
  for (int i = 0; i < lda; i += L2_BLOCK_SIZE) {
    int M = min (L2_BLOCK_SIZE, lda-i);
    /* For each L1-block-column of B */
    for (int j = 0; j < lda; j += L2_BLOCK_SIZE) {
      int N = min (L2_BLOCK_SIZE, lda-j);
      /* Accumulate L1-block dgemms into block of C */
      for (int k = 0; k < lda; k += L2_BLOCK_SIZE) {
        /* Correct block dimensions if block "goes off edge of" the matrix. */
        int K = min (L2_BLOCK_SIZE, lda-k);
        /* Perform individual block dgemm */
        do_l2_block(lda, M, N, K, A + i*lda + k, B + k*lda + j, C + i*lda + j);
      }
    }
  }

  // Copy computation result back to the original matrix
  copy_padding_back(old_size, old_C, lda, C);
  free(A);
  free(B);
}
int fft3a_(double *a, double *b, double *w, int *l)
{
    /* static double c31 = .86602540378443865;
       static double c32 = .5; */
    static __m128d c31, c32;
    int j, j0, j1, j2, j3, j4, j5;
    /* double x0, y0, x1, y1, x2, y2, wi1, wi2, wr1, wr2; */
    __m128d t0, t1, t2, t3, w1, w2;

    c31 = _mm_set1_pd(0.86602540378443865);
    c32 = _mm_set1_pd(0.5);

    for (j = 0; j < *l; j++) {
        j0 = j << 1;
        j1 = j0 + (*l << 1);
        j2 = j1 + (*l << 1);
        j3 = j * 6;
        j4 = j3 + 2;
        j5 = j4 + 2;
        /* wr1 = w[j0];
           wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1;
           wi2 = wr1 * wi1 + wr1 * wi1; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        /* x0 = a[j1] + a[j2];
           y0 = a[j1 + 1] + a[j2 + 1];
           x1 = a[j0] - c32 * x0;
           y1 = a[j0 + 1] - c32 * y0;
           x2 = c31 * (a[j1 + 1] - a[j2 + 1]);
           y2 = c31 * (a[j2] - a[j1]); */
        t1 = _mm_load_pd(&a[j1]);
        t2 = _mm_load_pd(&a[j2]);
        t0 = _mm_add_pd(t1, t2);
        t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0));
        t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1));
        t3 = _mm_load_pd(&a[j0]);
        t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0));
        /* b[j3] = a[j0] + x0;
           b[j3 + 1] = a[j0 + 1] + y0;
           b[j4] = wr1 * (x1 + x2) - wi1 * (y1 + y2);
           b[j4 + 1] = wr1 * (y1 + y2) + wi1 * (x1 + x2);
           b[j5] = wr2 * (x1 - x2) - wi2 * (y1 - y2);
           b[j5 + 1] = wr2 * (y1 - y2) + wi2 * (x1 - x2); */
        _mm_store_pd(&b[j3], _mm_add_pd(t3, t0));
        _mm_store_pd(&b[j4], ZMUL(w1, _mm_add_pd(t1, t2)));
        _mm_store_pd(&b[j5], ZMUL(w2, _mm_sub_pd(t1, t2)));
    }
    return 0;
}
void increment_sse41(float arr[4])
{
    /* darr must be 16-byte aligned for _mm_store_pd; ALIGN_16 is used here
       as in the SSE4.2 variant of this check. */
    ALIGN_16 double darr[4];
    __m128d val1 = _mm_set_pd(arr[0], arr[1]);
    __m128d val2 = _mm_set_pd(arr[2], arr[3]);
    __m128d one = _mm_set_pd(1.0, 1.0);
    __m128d result = _mm_add_pd(val1, one);
    result = _mm_ceil_pd(result); /* A no-op, only here to use an SSE4.1 intrinsic. */
    _mm_store_pd(darr, result);
    result = _mm_add_pd(val2, one);
    _mm_store_pd(&darr[2], result);
    arr[0] = (float)darr[1];
    arr[1] = (float)darr[0];
    arr[2] = (float)darr[3];
    arr[3] = (float)darr[2];
}
void Matrix<double>::multiply(double value)
{
    double* y = pData;
    int n = width*height;
    int i;
    __m128d XMM7 = _mm_set1_pd(value);
    /* Processes 4 doubles per iteration: assumes n is a multiple of 4
       and pData is 16-byte aligned. */
    for (i = 0;i < (n);i += 4) {
        __m128d XMM0 = _mm_load_pd((y)+i  );
        __m128d XMM1 = _mm_load_pd((y)+i+2);
        XMM0 = _mm_mul_pd(XMM0, XMM7);
        XMM1 = _mm_mul_pd(XMM1, XMM7);
        _mm_store_pd((y)+i  , XMM0);
        _mm_store_pd((y)+i+2, XMM1);
    }
}
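/*
 * Matrix<double>::multiply above stores four doubles per iteration, so it
 * relies on width*height being a multiple of 4 and on pData being 16-byte
 * aligned (reasonable if the matrix allocator pads and aligns its storage,
 * which is an assumption here).  A hedged sketch of a remainder-safe,
 * alignment-safe equivalent as a free function:
 */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

static void scale_doubles(double *y, size_t n, double value)
{
    const __m128d v = _mm_set1_pd(value);
    size_t i = 0;
    /* peel scalar elements until the pointer is 16-byte aligned */
    for (; (((uintptr_t)(y + i)) & 15) && i < n; i++)
        y[i] *= value;
    /* aligned vector body, two doubles at a time */
    for (; i + 2 <= n; i += 2)
        _mm_store_pd(y + i, _mm_mul_pd(_mm_load_pd(y + i), v));
    /* scalar tail */
    for (; i < n; i++)
        y[i] *= value;
}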
void increment_sse42(float arr[4])
{
    ALIGN_16 double darr[4];
    __m128d val1 = _mm_set_pd(arr[0], arr[1]);
    __m128d val2 = _mm_set_pd(arr[2], arr[3]);
    __m128d one = _mm_set_pd(1.0, 1.0);
    __m128d result = _mm_add_pd(val1, one);
    _mm_store_pd(darr, result);
    result = _mm_add_pd(val2, one);
    _mm_store_pd(&darr[2], result);
    _mm_crc32_u32(42, 99); /* A no-op, only here to use an SSE4.2 instruction. */
    arr[0] = (float)darr[1];
    arr[1] = (float)darr[0];
    arr[2] = (float)darr[3];
    arr[3] = (float)darr[2];
}
void transpose_aligned(double *a, double *b, int N1, int N2, double factor)
{
    int i,j,k,k1,it,jt,itt,jtt,conflict,tmp,tmpN;
    double *pA, *pB;
    register __m128d x, y, z, w, fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector, fac_vector);

    /* buf and tilesize are file-scope; the write-back loop streams 8 doubles
       per tile row, which implies tilesize == 8. */
    for (it = 0; it < N1; it = it + tilesize) {
        for (jt = 0; jt < N2; jt = jt + tilesize) {
            k = 0;
            for (j = jt; j < jt+tilesize; j = j+2) {
                for (i = it; i < it+tilesize; i = i+2) {
                    pA = a + i*N2 + j;
                    x = _mm_load_pd(pA);
                    y = _mm_load_pd(pA + N2);
                    x = _mm_mul_pd(x, fac_vector);
                    y = _mm_mul_pd(y, fac_vector);
                    z = _mm_shuffle_pd(x, y, 0);
                    w = _mm_shuffle_pd(x, y, 3);
                    k = (j-jt)*tilesize + (i-it);
                    _mm_store_pd(buf + k, z);
                    _mm_store_pd(buf + k + tilesize, w);
                }
            }
            k = 0;
            k1 = 0;
            for (j = jt; j < jt+tilesize; j++) {
                pB = b + j*N1 + it;
                k = (j-jt)*tilesize;
                x = _mm_load_pd(&buf[k]);
                y = _mm_load_pd(&buf[k] + 2);
                z = _mm_load_pd(&buf[k] + 2*2);
                w = _mm_load_pd(&buf[k] + 3*2);
                _mm_stream_pd(pB, x);
                _mm_stream_pd(pB + 2, y);
                _mm_stream_pd(pB + 2*2, z);
                _mm_stream_pd(pB + 3*2, w);
            }
        }
    }
}
static void
clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  __m128d xmm1;
  double max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }

  xmm1 = _mm_set1_pd(max);
  for (; n >= 2; n -= 2) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_min_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
  }
  for (; n > 0; n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
void trigo_vsin_vml_sse2(double* dst, const double* src, size_t length)
{
  size_t i = length;
  while (i) {
    if (!SimdUtils::isAligned(dst, 16) || i == 1) {
      __m128d d = _mm_load_sd(src);
      _mm_store_sd(dst, sin_vml_pd(d));

      dst++;
      src++;
      if (--i == 0)
        break;
    }

    while (i >= 2) {
      __m128d d = _mm_loadu_pd(src);
      _mm_store_pd(dst, sin_vml_pd(d));

      dst += 2;
      src += 2;
      i -= 2;
    }
  }
}
// *p += (*s) * v
COREARRAY_DLL_DEFAULT double *vec_f64_addmul(double *p, const double *s, size_t n, double v)
{
#if defined(COREARRAY_SIMD_SSE2)
	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++) * v; n--; }
	case 0x00:
		for (; n >= 2; n -= 2)
		{
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p),
				_mm_mul_pd(_mm_loadu_pd(s), v2)));
			p += 2; s += 2;
		}
		break;
	default:
		for (; n >= 2; n-=2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p),
				_mm_mul_pd(_mm_loadu_pd(s), v2)));
			p += 2; s += 2;
		}
	}
#endif
	for (; n > 0; n--) (*p++) += (*s++) * v;
	return p;
}
static void filterButter(const Float_t* input, Float_t* output, size_t nSamples, const Float_t* kernel)
{
#ifdef HAVE_SSE2
    __m128d __kernel, __result, __temp;
    __declspec(align(16)) Float_t __temp2[2];

    while (nSamples--) {
        __kernel = _mm_loadr_pd(&kernel[0]);
        __temp   = _mm_loadu_pd(&input[-1]);
        __result = _mm_mul_pd(__temp, __kernel);

        __kernel = _mm_loadr_pd(&kernel[4]);
        __temp   = _mm_loadu_pd(&output[-2]);
        __temp   = _mm_mul_pd(__kernel, __temp);
        __result = _mm_sub_pd(__result, __temp);

        _mm_store_pd(__temp2, __result);
        *output = __temp2[0] + __temp2[1] + input[-2] * kernel[2];
        ++output;
        ++input;
    }
#else
    while (nSamples--) {
        *output =  input [0]  * kernel[0] - output[-1] * kernel[1]
                 + input [-1] * kernel[2] - output[-2] * kernel[3]
                 + input [-2] * kernel[4];
        ++output;
        ++input;
    }
#endif
}
static double evaluateGTRCAT_SAVE (int *cptr, int *wptr,
                                   double *x1_start, double *x2_start, double *tipVector,
                                   unsigned char *tipX1, int n, double *diagptable_start,
                                   double *x1_gapColumn, double *x2_gapColumn,
                                   unsigned int *x1_gap, unsigned int *x2_gap)
{
  double sum = 0.0, term;
  int i;
  double *diagptable, *x1, *x2, *x1_ptr = x1_start, *x2_ptr = x2_start;

  if(tipX1)
    {
      for (i = 0; i < n; i++)
        {
          double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
          __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;

          x1 = &(tipVector[4 * tipX1[i]]);
          if(isGap(x2_gap, i))
            x2 = x2_gapColumn;
          else
            {
              x2 = x2_ptr;
              x2_ptr += 4;
            }

          diagptable = &diagptable_start[4 * cptr[i]];

          x1v1 = _mm_load_pd(&x1[0]);
          x1v2 = _mm_load_pd(&x1[2]);
          x2v1 = _mm_load_pd(&x2[0]);
          x2v2 = _mm_load_pd(&x2[2]);
          dv1  = _mm_load_pd(&diagptable[0]);
          dv2  = _mm_load_pd(&diagptable[2]);

          x1v1 = _mm_mul_pd(x1v1, x2v1);
          x1v1 = _mm_mul_pd(x1v1, dv1);
          x1v2 = _mm_mul_pd(x1v2, x2v2);
          x1v2 = _mm_mul_pd(x1v2, dv2);
          x1v1 = _mm_add_pd(x1v1, x1v2);

          _mm_store_pd(t, x1v1);

          term = LOG(FABS(t[0] + t[1]));
          sum += wptr[i] * term;
        }
    }
  else
    {
      for (i = 0; i < n; i++)
        {
          /* Inner-inner case.  The original snippet is truncated here; the
             body below is reconstructed to mirror the tip branch above,
             with x1 fetched from x1_ptr / x1_gapColumn instead of tipVector. */
          double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
          __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;

          if(isGap(x1_gap, i))
            x1 = x1_gapColumn;
          else
            {
              x1 = x1_ptr;
              x1_ptr += 4;
            }

          if(isGap(x2_gap, i))
            x2 = x2_gapColumn;
          else
            {
              x2 = x2_ptr;
              x2_ptr += 4;
            }

          diagptable = &diagptable_start[4 * cptr[i]];

          x1v1 = _mm_load_pd(&x1[0]);
          x1v2 = _mm_load_pd(&x1[2]);
          x2v1 = _mm_load_pd(&x2[0]);
          x2v2 = _mm_load_pd(&x2[2]);
          dv1  = _mm_load_pd(&diagptable[0]);
          dv2  = _mm_load_pd(&diagptable[2]);

          x1v1 = _mm_mul_pd(x1v1, x2v1);
          x1v1 = _mm_mul_pd(x1v1, dv1);
          x1v2 = _mm_mul_pd(x1v2, x2v2);
          x1v2 = _mm_mul_pd(x1v2, dv2);
          x1v1 = _mm_add_pd(x1v1, x1v2);

          _mm_store_pd(t, x1v1);

          term = LOG(FABS(t[0] + t[1]));
          sum += wptr[i] * term;
        }
    }

  return sum;
}
void
interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gdouble *o = op, *a = ap, *ic = icp;
  __m128d f[4], t[4];
  const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride),
    (gdouble *) ((gint8 *) a + 1 * astride),
    (gdouble *) ((gint8 *) a + 2 * astride),
    (gdouble *) ((gint8 *) a + 3 * astride)
  };

  f[0] = _mm_load1_pd (ic + 0);
  f[1] = _mm_load1_pd (ic + 1);
  f[2] = _mm_load1_pd (ic + 2);
  f[3] = _mm_load1_pd (ic + 3);

  for (i = 0; i < len; i += 2) {
    t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
    t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
    t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
    t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
    t[0] = _mm_add_pd (t[0], t[1]);
    t[2] = _mm_add_pd (t[2], t[3]);
    _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
  }
}
void gblas_quantizer::quantization(const double* input, double* output, int rows, int cols)
{
  std::cerr << "Deprecated method: gblas_quantizer::quantization()" << std::endl;
  exit(0);

//  for (int i=0; i < rows; i++)
//  {
//    for (int j=0; j < cols; j++)
//    {
//      output[i*cols + j] = quantize_sample(&input[i*cols + j]);
//    }
//  }

//  for (int i=0; i < rows*cols; i++)
//  {
//    output[i] = (int)(input[i]/gblas_status.q_step + ZERO_DOT_FIVE);  //quantize_sample(&input[i]);
//  }

  __m128d curr;
  __m128d inv_q_step = _mm_div_pd(_mm_set1_pd(1.0), _mm_set1_pd(q_step));

  const double* in_p = input;
  double* out_p = output;
  for (int i=((rows*cols) >> 1); i > 0; i--)
  {
    curr = _mm_load_pd(in_p);   in_p += 2;
    curr = _mm_mul_pd(curr, inv_q_step);
    curr = _mm_add_pd(curr, _MM_ZERO_DOT_FIVE_D);
    curr = _mm_cvtepi32_pd(_mm_cvttpd_epi32(curr));
    _mm_store_pd(out_p, curr);  out_p += 2;
  }
}
static double* copy_block(int lda, int M, int N, double* A, double* new_A)
{
  int M_even = turn_even(M);
  int N_even = turn_even(N);
  int i_step;
  __m128d a;

  for (int j=0; j<N; j++) {
    for (int i=0; i<M; i+=I_STRIDE) {
      i_step = min(I_STRIDE, M-i);
      if (i_step==1) {
        new_A[i+j*M_even] = A[i+j*lda];
      } else {
        a = _mm_loadu_pd(A+i+j*lda);
        _mm_store_pd(new_A+i+j*M_even, a);
      }
    }
  }

  if (N % 2) {
    for (int i=0; i<M_even; i++) {
      new_A[i+(N_even-1)*M_even] = 0.0;
    }
  }
  return new_A;
}
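/*
 * copy_block above relies on three helpers that fall outside this excerpt.
 * The definitions sketched here are assumptions, not the original code; in a
 * real build they would precede copy_block.
 */
#define I_STRIDE 2                                    /* doubles moved per SSE2 store */
#define min(a, b) (((a) < (b)) ? (a) : (b))           /* clamp a block edge           */
static int turn_even(int x) { return (x + 1) & ~1; }  /* round a dimension up to even */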
double dsquared_nrm2(unsigned int N, const double *x) {
    flops_counter += (2*N) ;
#ifdef GX_SSE
    if(SSE2_supported) {
        __m128d X1, X2;
        __m128d acc1 = _mm_setzero_pd() ;
        __m128d acc2 = _mm_setzero_pd() ;
        SSE_ALIGNED(double temp[2]) ;
        unsigned int i = 0 ;
        while(i<N) {
            _mm_prefetch((const char*)(&x[i] + 128), _MM_HINT_NTA) ;
            X1 = _mm_load_pd(&x[i]) ;
            acc1 = _mm_add_pd(acc1, _mm_mul_pd(X1,X1)) ;
            i += 2 ;
            X2 = _mm_load_pd(&x[i]) ;
            acc2 = _mm_add_pd(acc2, _mm_mul_pd(X2,X2)) ;
            i += 2 ;
        }
        acc1 = _mm_add_pd(acc1, acc2) ;
        _mm_store_pd(temp, acc1) ;
        return temp[0] + temp[1] ;
    }
#endif
    double result = 0.0 ;
    for(unsigned int i=0; i<N; i++) {
        result += x[i]*x[i] ;
    }
    return result ;
}
template <>
void Matrix<double>::set(double value)
{
#ifdef _DEBUG
    if(height==0 || width==0)
        throw std::invalid_argument("Impossible to set value for ghost matrix");
#endif
    double* x = pData;
    int n = width*height;
    int i;
    __m128d XMM0 = _mm_set1_pd(value);
    /* Stores 8 doubles per iteration: assumes n is a multiple of 8
       and pData is 16-byte aligned. */
    for (i = 0;i < (n);i += 8) {
        _mm_store_pd((x)+i  , XMM0);
        _mm_store_pd((x)+i+2, XMM0);
        _mm_store_pd((x)+i+4, XMM0);
        _mm_store_pd((x)+i+6, XMM0);
    }
}
void f2 (__m128d x)
{
  struct S s;
  _mm_store_pd ((double *) &s.d, x);
  __real__ s.d *= 7.0;
  bar (s);
}
void test_mm_store_pd(double* A, __m128d B) {
  // DAG-LABEL: test_mm_store_pd
  // DAG: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16
  //
  // ASM-LABEL: test_mm_store_pd
  // ASM: movapd
  _mm_store_pd(A, B);
}
/* xvm_neg:
 *   Return the component-wise negation of the given vector:
 *       r = -x
 */
void xvm_neg(double r[], const double x[], uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	const __m128d vz = _mm_setzero_pd();
	for (uint64_t n = 0; n < N; n += 4) {
		const __m128d x0 = _mm_load_pd(x + n    );
		const __m128d x1 = _mm_load_pd(x + n + 2);
		const __m128d r0 = _mm_sub_pd(vz, x0);
		const __m128d r1 = _mm_sub_pd(vz, x1);
		_mm_store_pd(r + n,     r0);
		_mm_store_pd(r + n + 2, r1);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = -x[n];
#endif
}
/* xvm_sub:
 *   Return the difference of the two given vectors:
 *       r = x .- y
 */
void xvm_sub(double r[], const double x[], const double y[], uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	assert(y != NULL && ((uintptr_t)y % 16) == 0);
	for (uint64_t n = 0; n < N; n += 4) {
		const __m128d x0 = _mm_load_pd(x + n    );
		const __m128d x1 = _mm_load_pd(x + n + 2);
		const __m128d y0 = _mm_load_pd(y + n    );
		const __m128d y1 = _mm_load_pd(y + n + 2);
		const __m128d r0 = _mm_sub_pd(x0, y0);
		const __m128d r1 = _mm_sub_pd(x1, y1);
		_mm_store_pd(r + n,     r0);
		_mm_store_pd(r + n + 2, r1);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = x[n] - y[n];
#endif
}
il_vec2 il_vec2_div(il_vec2 a, il_vec2 b, il_vec2 vec)
{
    if (!vec) {
        vec = il_vec2_new();
    }
#ifdef IL_SSE
    _mm_store_pd(vec, _mm_div_pd(_mm_load_pd(a), _mm_load_pd(b)));
#else
    vec[0] = a[0] / b[0];
    vec[1] = a[1] / b[1];
#endif
    return vec;
}
SSE_FUNCTION static void
add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
{
  __m128d xmm0, xmm1;
  while (((long)dest & 15) && (0 < n)) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
  while (3 < n) {
    xmm0 = _mm_loadu_pd(src1);
    xmm1 = _mm_loadu_pd(src2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1+2);
    xmm1 = _mm_loadu_pd(src2+2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest+2, xmm0);
    dest += 4;
    src1 += 4;
    src2 += 4;
    n -= 4;
  }
  while (1 < n) {
    xmm0 = _mm_loadu_pd(src1);
    xmm1 = _mm_loadu_pd(src2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
    src2 += 2;
    n -= 2;
  }
  while (0 < n) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
}
int fft2_(double *a, double *b, int *m)
{
    int i, i0, i1;
    /* double x0, y0, x1, y1; */
    __m128d t0, t1;

    for (i = 0; i < *m; i++) {
        i0 = i << 1;
        i1 = i0 + (*m << 1);
        /* x0 = a[i0];
           y0 = a[i0 + 1];
           x1 = a[i1];
           y1 = a[i1 + 1]; */
        t0 = _mm_load_pd(&a[i0]);
        t1 = _mm_load_pd(&a[i1]);
        /* b[i0] = x0 + x1;
           b[i0 + 1] = y0 + y1;
           b[i1] = x0 - x1;
           b[i1 + 1] = y0 - y1; */
        _mm_store_pd(&b[i0], _mm_add_pd(t0, t1));
        _mm_store_pd(&b[i1], _mm_sub_pd(t0, t1));
    }
    return 0;
}
/* xvm_axpy:
 *   Return the sum of x scaled by a and y:
 *       r = a * x + y
 */
void xvm_axpy(double r[], double a, const double x[], const double y[], uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	assert(y != NULL && ((uintptr_t)y % 16) == 0);
	const __m128d va = _mm_set1_pd(a);
	for (uint64_t n = 0; n < N; n += 4) {
		const __m128d x0 = _mm_load_pd(x + n    );
		const __m128d x1 = _mm_load_pd(x + n + 2);
		const __m128d y0 = _mm_load_pd(y + n    );
		const __m128d y1 = _mm_load_pd(y + n + 2);
		const __m128d t0 = _mm_mul_pd(x0, va);
		const __m128d t1 = _mm_mul_pd(x1, va);
		const __m128d r0 = _mm_add_pd(t0, y0);
		const __m128d r1 = _mm_add_pd(t1, y1);
		_mm_store_pd(r + n,     r0);
		_mm_store_pd(r + n + 2, r1);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = a * x[n] + y[n];
#endif
}
void
interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gdouble *o = op, *a = ap, *ic = icp;
  __m128d f[2], t1, t2;
  const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride),
    (gdouble *) ((gint8 *) a + 1 * astride)
  };

  f[0] = _mm_load1_pd (ic + 0);
  f[1] = _mm_load1_pd (ic + 1);

  for (i = 0; i < len; i += 4) {
    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
    _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));
    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
    _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
  }
}