Example #1
static void
scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_pd1(val);
  for (; n >= 4; n -= 4) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1 + 2);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest + 2, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
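A minimal, hypothetical caller for the routine above (not part of the original example): the destination only has to reach 16-byte alignment through the scalar prologue, and the source may stay unaligned because the loads use _mm_loadu_pd. Placed in the same file (the function is static), something like this exercises all three loops:

#include <stdlib.h>

static int
test_scalarmultiply (void)
{
  enum { N = 1003 };               /* deliberately not a multiple of 4 */
  double *dest, *src, k = 2.5;
  int i;

  /* 16-byte aligned destination lets the aligned _mm_store_pd path run at once */
  if (posix_memalign ((void **) &dest, 16, N * sizeof (double)) ||
      posix_memalign ((void **) &src, 16, N * sizeof (double)))
    return 1;
  for (i = 0; i < N; i++)
    src[i] = (double) i;

  scalarmultiply_f64_ns_sse2_unroll2 (dest, src, &k, N);

  free (dest);
  free (src);
  return 0;
}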
Example #2
void CAllPassFilterPair::processBlock(double* data, int numSamples)
{
	jassert((((size_t) data) & 0xF) == 0);
	jassert((_mm_getcsr() & 0x8040) == 0x8040);

	__m128d coeff = _mm_load_pd(md.getPtr(0));
	__m128d x1 = _mm_load_pd(md.getPtr(1));
	__m128d x2 = _mm_load_pd(md.getPtr(2));
	__m128d y1 = _mm_load_pd(md.getPtr(3));
	__m128d y2 = _mm_load_pd(md.getPtr(4));

	for (int i=0; i<numSamples; ++i)
	{
		__m128d x0 = _mm_load_pd(&(data[i+i]));
		__m128d tmp = _mm_sub_pd(x0, y2);
		tmp = _mm_mul_pd(tmp, coeff);
		__m128d y0 = _mm_add_pd(x2, tmp);

		_mm_store_pd(&(data[i+i]), y0);

		x2=x1;
		x1=x0;

		y2=y1;
		y1=y0;
	}

	_mm_store_pd(md.getPtr(1), x1);
	_mm_store_pd(md.getPtr(2), x2);
	_mm_store_pd(md.getPtr(3), y1);
	_mm_store_pd(md.getPtr(4), y2);

}
Example #3
// multiply *p by v for all n elements
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)

	const __m256d v4 = _mm256_set1_pd(v);

	switch ((size_t)p & 0x1F)	/* peel scalars until p is 32-byte aligned; cases fall through */
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x10:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x18:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
		break;
	default:
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)	/* peel one scalar if p is only 8-byte aligned; cases fall through */
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 2; n-=2, p+=2)
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2));
		break;
	default:
		for (; n >= 2; n-=2, p+=2)
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2));
	}

#endif

	for (; n > 0; n--) (*p++) *= v;
}
Example #4
static void *
add_sse2_unroll(void *ptr)
{
    size_t n;
    char *p1, *p2, *p3;
    struct narray3 *a = (struct narray3*)ptr;
    dtype x, y;
    size_t i, n4;
    ssize_t s1, s2, s3;
    __m128d d1, d2, d3;
    __m128d e1, e2, e3;
    char *q1, *q2, *q3;

    p1 = a->x->ptr;
    p2 = a->y->ptr;
    p3 = a->z->ptr;
    n =  a->x->size;

    s1 = s2 = s3 = sizeof(dtype);

    q1 = p1 + s1*2;
    q2 = p2 + s2*2;
    q3 = p3 + s3*2;

    s1 = s2 = s3 = sizeof(dtype)*4;

    /* software pipelining: the q pointers run one 2-double pair ahead of the
       p pointers, so each iteration overlaps its loads with the store of the
       pair computed in the previous step */
    n4 = 3;
    n4 = (n & ~n4) - 2;
        e1 = _mm_load_pd((dtype*)q1);
        q1+=s1;
        e2 = _mm_load_pd((dtype*)q2);
        q2+=s2;
        e3 = _mm_add_pd(e1,e2);
    for (i=2; i<n4; i+=4) {
        d1 = _mm_load_pd((dtype*)p1);
        p1+=s1;
        d2 = _mm_load_pd((dtype*)p2);
        p2+=s2;
        d3 = _mm_add_pd(d1,d2);
        _mm_store_pd((dtype*)q3,e3);
        q3+=s3;
        e1 = _mm_load_pd((dtype*)q1);
        q1+=s1;
        e2 = _mm_load_pd((dtype*)q2);
        q2+=s2;
        e3 = _mm_add_pd(e1,e2);
        _mm_store_pd((dtype*)p3,d3);
        p3+=s3;
    }
    _mm_store_pd((dtype*)q3,e3);

    for (; i<n; i++) {
        x = *(dtype*)p1; p1+=s1;
        y = *(dtype*)p2; p2+=s2;
        x = x+y;
        *(dtype*)p3 = x; p3+=s3;
    }
    return 0;
}
Example #5
int fft4a_(double *a, double *b, double *w, int *l)
{
    int j, j0, j1, j2, j3, j4, j5, j6, j7;
    /* double x0, y0, x1, y1, x2, y2, x3, y3, wi1, wi2, wi3, wr1, wr2, wr3; */
    __m128d t0, t1, t2, t3, t4, w1, w2, w3;

    for (j = 0; j < *l; j++) {
	j0 = j << 1;
	j1 = j0 + (*l << 1);
	j2 = j1 + (*l << 1);
	j3 = j2 + (*l << 1);
	j4 = j << 3;
	j5 = j4 + 2;
	j6 = j5 + 2;
	j7 = j6 + 2;
	/* wr1 = w[j0];
	wi1 = w[j0 + 1];
	wr2 = wr1 * wr1 - wi1 * wi1;
	wi2 = wr1 * wi1 + wr1 * wi1;
	wr3 = wr1 * wr2 - wi1 * wi2;
	wi3 = wr1 * wi2 + wi1 * wr2; */
	w1 = _mm_load_pd(&w[j0]);
	w2 = ZMUL(w1, w1);
	w3 = ZMUL(w1, w2);
	/* x0 = a[j0] + a[j2];
	y0 = a[j0 + 1] + a[j2 + 1];
	x1 = a[j0] - a[j2];
	y1 = a[j0 + 1] - a[j2 + 1];
	x2 = a[j1] + a[j3];
	y2 = a[j1 + 1] + a[j3 + 1];
	x3 = a[j1 + 1] - a[j3 + 1];
	y3 = a[j3] - a[j1]; */
	t0 = _mm_load_pd(&a[j0]);
	t2 = _mm_load_pd(&a[j2]);
	t1 = _mm_sub_pd(t0, t2);
	t0 = _mm_add_pd(t0, t2);
	t3 = _mm_load_pd(&a[j1]);
	t4 = _mm_load_pd(&a[j3]);
	t2 = _mm_add_pd(t3, t4);
	t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
	t3 = _mm_shuffle_pd(t3, t3, 1);
	/* b[j4] = x0 + x2;
	b[j4 + 1] = y0 + y2;
	b[j6] = wr2 * (x0 - x2) - wi2 * (y0 - y2);
	b[j6 + 1] = wr2 * (y0 - y2) + wi2 * (x0 - x2);
	b[j5] = wr1 * (x1 + x3) - wi1 * (y1 + y3);
	b[j5 + 1] = wr1 * (y1 + y3) + wi1 * (x1 + x3);
	b[j7] = wr3 * (x1 - x3) - wi3 * (y1 - y3);
	b[j7 + 1] = wr3 * (y1 - y3) + wi3 * (x1 - x3); */
	_mm_store_pd(&b[j4], _mm_add_pd(t0, t2));
	_mm_store_pd(&b[j6], ZMUL(w2, _mm_sub_pd(t0, t2)));
	_mm_store_pd(&b[j5], ZMUL(w1, _mm_add_pd(t1, t3)));
	_mm_store_pd(&b[j7], ZMUL(w3, _mm_sub_pd(t1, t3)));
    }
    return 0;
}
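The FFT examples (#5 and #7 below) call a ZMUL macro that is not shown in the listing; it is evidently a packed complex multiply of two (re, im) pairs, with the real part in the low lane, since ZMUL(w1, w1) has to reproduce the commented scalar formulas wr2 = wr1*wr1 - wi1*wi1 and wi2 = 2*wr1*wi1. A minimal SSE2 sketch of such a helper, written as an assumption about what the real macro does (it uses the same _mm_set_sd(-0.0) sign trick as the examples themselves):

#include <emmintrin.h>

static inline __m128d ZMUL (__m128d a, __m128d b)
{
    __m128d ar = _mm_unpacklo_pd (a, a);                      /* (a.re, a.re) */
    __m128d ai = _mm_unpackhi_pd (a, a);                      /* (a.im, a.im) */
    __m128d t  = _mm_mul_pd (ar, b);                          /* (a.re*b.re, a.re*b.im) */
    __m128d u  = _mm_mul_pd (ai, _mm_shuffle_pd (b, b, 1));   /* (a.im*b.im, a.im*b.re) */
    u = _mm_xor_pd (u, _mm_set_sd (-0.0));                    /* negate only the low (real) term */
    return _mm_add_pd (t, u);            /* (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re) */
}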
Example #6
/* This routine performs a dgemm operation
 * C := C + A * B
 * where A, B, and C are lda-by-lda matrices stored in row-major order
 * On exit, A and B maintain their input values. */  
void square_dgemm (int lda, double* A, double* B, double* C, int block_size)
{
	/* Do matrix padding first. */
	int step_size = UNROLLING_SIZE * 2;
	int new_size = lda + step_size - lda % step_size;
	double* old_C = C;
	int old_size = lda;
	A = matrix_padding(A, lda, new_size);
	B = matrix_padding(B, lda, new_size);
	// We don't need to copy data from old C to new C,
	// So we handle it separately here.
	posix_memalign((void**)&C, 16, sizeof(double)*new_size*new_size);
	__m128d v_zero = _mm_setzero_pd();
	for(int i=0; i<new_size*new_size; i+=10) {	/* zero C; assumes new_size*new_size is a multiple of 10 */
		_mm_store_pd(C+i, v_zero);
		_mm_store_pd(C+i+2, v_zero);
		_mm_store_pd(C+i+4, v_zero);
		_mm_store_pd(C+i+6, v_zero);
		_mm_store_pd(C+i+8, v_zero);
	}
	lda = new_size;

#ifdef TRANSPOSE
	for (int i = 0; i < lda; ++i)
		for (int j = i + 1; j < lda; ++j) {	/* start at i+1 so each pair is swapped once; a full sweep would undo the transpose */
			double t = B[i*lda+j];
			B[i*lda+j] = B[j*lda+i];
			B[j*lda+i] = t;
		}
#endif

	/* For each L2-block-row of A */
	for (int i = 0; i < lda; i += L2_BLOCK_SIZE) {
		int M = min (L2_BLOCK_SIZE, lda-i);
		/* For each L2-block-column of B */
		for (int j = 0; j < lda; j += L2_BLOCK_SIZE) {
			int N = min (L2_BLOCK_SIZE, lda-j);
			/* Accumulate L2-block dgemms into this block of C */
			for (int k = 0; k < lda; k += L2_BLOCK_SIZE) {
			    /* Correct block dimensions if block "goes off edge of" the matrix. */
				int K = min (L2_BLOCK_SIZE, lda-k);
				/* Perform individual block dgemm */
				do_l2_block(lda, M, N, K, A + i*lda + k, B + k*lda + j, C + i*lda + j);
			 }
		}
	}
	// Copy computation result back to the original matrix
	copy_padding_back(old_size, old_C, lda, C);
	free(A);
	free(B);
}
Example #7
int fft3a_(double *a, double *b, double *w, int *l)
{
    /* static double c31 = .86602540378443865;
    static double c32 = .5; */
    static __m128d c31, c32;

    int j, j0, j1, j2, j3, j4, j5;
    /* double x0, y0, x1, y1, x2, y2, wi1, wi2, wr1, wr2; */
    __m128d t0, t1, t2, t3, w1, w2;

    c31 = _mm_set1_pd(0.86602540378443865);
    c32 = _mm_set1_pd(0.5);

    for (j = 0; j < *l; j++) {
        j0 = j << 1;
	j1 = j0 + (*l << 1);
	j2 = j1 + (*l << 1);
	j3 = j * 6;
	j4 = j3 + 2;
	j5 = j4 + 2;
	/* wr1 = w[j0];
	wi1 = w[j0 + 1];
	wr2 = wr1 * wr1 - wi1 * wi1;
	wi2 = wr1 * wi1 + wr1 * wi1; */
	w1 = _mm_load_pd(&w[j0]);
	w2 = ZMUL(w1, w1);
	/* x0 = a[j1] + a[j2];
	y0 = a[j1 + 1] + a[j2 + 1];
	x1 = a[j0] - c32 * x0;
	y1 = a[j0 + 1] - c32 * y0;
	x2 = c31 * (a[j1 + 1] - a[j2 + 1]);
	y2 = c31 * (a[j2] - a[j1]); */
	t1 = _mm_load_pd(&a[j1]);
	t2 = _mm_load_pd(&a[j2]);
	t0 = _mm_add_pd(t1, t2);
	t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0));
	t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1));
	t3 = _mm_load_pd(&a[j0]);
	t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0));
	/* b[j3] = a[j0] + x0;
	b[j3 + 1] = a[j0 + 1] + y0;
	b[j4] = wr1 * (x1 + x2) - wi1 * (y1 + y2);
	b[j4 + 1] = wr1 * (y1 + y2) + wi1 * (x1 + x2);
	b[j5] = wr2 * (x1 - x2) - wi2 * (y1 - y2);
	b[j5 + 1] = wr2 * (y1 - y2) + wi2 * (x1 - x2); */
	_mm_store_pd(&b[j3], _mm_add_pd(t3, t0));
	_mm_store_pd(&b[j4], ZMUL(w1, _mm_add_pd(t1, t2)));
	_mm_store_pd(&b[j5], ZMUL(w2, _mm_sub_pd(t1, t2)));
    }
    return 0;
}
Example #8
void increment_sse41(float arr[4]) {
    ALIGN_16 double darr[4];	/* _mm_store_pd below requires 16-byte alignment */
    __m128d val1 = _mm_set_pd(arr[0], arr[1]);	/* arr[0] lands in the high lane, hence the swapped copy-back below */
    __m128d val2 = _mm_set_pd(arr[2], arr[3]);
    __m128d one = _mm_set_pd(1.0, 1.0);
    __m128d result = _mm_add_pd(val1, one);
    result = _mm_ceil_pd(result); /* A no-op, only here to use a SSE4.1 intrinsic. */
    _mm_store_pd(darr, result);
    result = _mm_add_pd(val2, one);
    _mm_store_pd(&darr[2], result);
    arr[0] = (float)darr[1];
    arr[1] = (float)darr[0];
    arr[2] = (float)darr[3];
    arr[3] = (float)darr[2];
}
Example #9
void Matrix<double>::multiply(double value)
{
	double* y = pData;
	int n = width*height;
    int i;
    __m128d XMM7 = _mm_set1_pd(value);
    for (i = 0; i < n; i += 4) {	/* assumes n is a multiple of 4 and pData is 16-byte aligned */
        __m128d XMM0 = _mm_load_pd((y)+i  ); 
        __m128d XMM1 = _mm_load_pd((y)+i+2); 
        XMM0 = _mm_mul_pd(XMM0, XMM7); 
        XMM1 = _mm_mul_pd(XMM1, XMM7);
        _mm_store_pd((y)+i  , XMM0); 
        _mm_store_pd((y)+i+2, XMM1); 
    } 
}
Example #10
void increment_sse42(float arr[4]) {
    ALIGN_16 double darr[4];
    __m128d val1 = _mm_set_pd(arr[0], arr[1]);
    __m128d val2 = _mm_set_pd(arr[2], arr[3]);
    __m128d one = _mm_set_pd(1.0, 1.0);
    __m128d result = _mm_add_pd(val1, one);
    _mm_store_pd(darr, result);
    result = _mm_add_pd(val2, one);
    _mm_store_pd(&darr[2], result);
    _mm_crc32_u32(42, 99); /* A no-op, only here to use an SSE4.2 instruction. */
    arr[0] = (float)darr[1];
    arr[1] = (float)darr[0];
    arr[2] = (float)darr[3];
    arr[3] = (float)darr[2];
}
Example #11
void transpose_aligned(double *a, double *b, int N1, int N2, double factor) {

    int i,j,k,k1,it,jt,itt,jtt,conflict,tmp,tmpN;
    double *pA, *pB;


    register __m128d x, y, z, w,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    for (it = 0; it < N1; it=it+tilesize) {
        for (jt = 0; jt < N2; jt=jt+tilesize) {

            k = 0;
            for (j = jt; j < jt+tilesize; j=j+2) {
                for (i = it; i < it+tilesize; i=i+2) {
                    pA = a+i*N2+j;
                    x = _mm_load_pd(pA);
                    y = _mm_load_pd(pA + N2);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    k = (j-jt)*tilesize + (i-it);
                    _mm_store_pd(buf + k,z);
                    _mm_store_pd(buf + k + tilesize,w);
                }
            }

            k = 0;
            k1 = 0;
            for (j = jt; j < jt+tilesize; j++) {
                pB = b+j*N1+it;
                k = (j-jt)*tilesize;
                x = _mm_load_pd(&buf[k]);
                y = _mm_load_pd(&buf[k]+2);
                z = _mm_load_pd(&buf[k]+2*2);
                w = _mm_load_pd(&buf[k]+3*2);
                _mm_stream_pd(pB,x);
                _mm_stream_pd(pB+2,y);
                _mm_stream_pd(pB+2*2,z);
                _mm_stream_pd(pB+3*2,w);

            }
        }
    }
}
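transpose_aligned relies on a file-scope tilesize and scratch buffer buf that are not shown in the listing; the four paired loads and _mm_stream_pd stores in the write-back loop only cover a full row if each tile row holds 8 doubles, and the streaming stores also require b to be 16-byte aligned with N1 even. A plausible (assumed) set of the missing definitions:

#define tilesize 8	/* 8 x 8 doubles per tile; the write-back loop streams 4 pairs per row */
static double buf[tilesize * tilesize] __attribute__ ((aligned (16)));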
Example #12
static void
clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  __m128d xmm1;
  double max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set1_pd(max);
  for (; n >= 2; n -= 2) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_min_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
  }
  for (; n > 0; n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
Example #13
void trigo_vsin_vml_sse2(double* dst, const double* src, size_t length) {
    size_t i = length;

    while (i) {
        if (!SimdUtils::isAligned(dst, 16) || i == 1) {
            __m128d d = _mm_load_sd(src);
            _mm_store_sd(dst, sin_vml_pd(d));

            dst++;
            src++;

            if (--i == 0)
                break;
        }

        while (i >= 2) {
            __m128d d = _mm_loadu_pd(src);
            _mm_store_pd(dst, sin_vml_pd(d));

            dst += 2;
            src += 2;
            i -= 2;
        }
    }
}
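SimdUtils::isAligned and sin_vml_pd come from the surrounding library and are not part of the listing; the alignment test is presumably the usual mask check. A hedged sketch of that helper:

#include <cstddef>
#include <cstdint>

namespace SimdUtils {
    // Assumed helper: true when p is aligned to `alignment` bytes (power of two).
    static inline bool isAligned(const void* p, size_t alignment) {
        return (reinterpret_cast<uintptr_t>(p) & (alignment - 1)) == 0;
    }
}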
Example #14
// *p += (*s) * v
COREARRAY_DLL_DEFAULT double *vec_f64_addmul(double *p, const double *s,
	size_t n, double v)
{
#if defined(COREARRAY_SIMD_SSE2)

	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)	/* align p to 16 bytes first; cases fall through */
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++) * v; n--; }
	case 0x00:
		for (; n >= 2; n -= 2)
		{
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p),
				_mm_mul_pd(_mm_loadu_pd(s), v2)));
			p += 2; s += 2;
		}
		break;
	default:
		for (; n >= 2; n-=2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p),
				_mm_mul_pd(_mm_loadu_pd(s), v2)));
			p += 2; s += 2;
		}
	}

#endif

	for (; n > 0; n--) (*p++) += (*s++) * v;
	return p;
}
Example #15
static void
filterButter(const Float_t* input, Float_t* output, size_t nSamples, const Float_t* kernel)
{   
#ifdef HAVE_SSE2
    __m128d __kernel, __result, __temp;
    __declspec(align(16)) Float_t __temp2[2];

    while (nSamples--) {
        __kernel = _mm_loadr_pd(&kernel[0]);
        __temp = _mm_loadu_pd(&input[-1]);
        __result = _mm_mul_pd(__temp, __kernel);
        __kernel = _mm_loadr_pd(&kernel[4]);
        __temp = _mm_loadu_pd(&output[-2]);
        __temp = _mm_mul_pd(__kernel, __temp);
        __result = _mm_sub_pd(__result, __temp);
        _mm_store_pd(__temp2, __result);
        *output = __temp2[0]
                + __temp2[1]
                + input [-2] * kernel[2];
        ++output;
        ++input;
    }
#else
    while (nSamples--) {
        *output =  
               input [0]  * kernel[0] - output[-1] * kernel[1]
             + input [-1] * kernel[2] - output[-2] * kernel[3]
             + input [-2] * kernel[4];
        ++output;
        ++input;
    }
#endif
}
Example #16
static double evaluateGTRCAT_SAVE (int *cptr, int *wptr,
				   double *x1_start, double *x2_start, double *tipVector, 		      
				   unsigned char *tipX1, int n, double *diagptable_start,
				   double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
{
  double  sum = 0.0, term;       
  int     i;

  double  *diagptable, 
    *x1, 
    *x2,
    *x1_ptr = x1_start,
    *x2_ptr = x2_start;
 
  if(tipX1)
    {           
      for (i = 0; i < n; i++) 
	{	
	  double t[2] __attribute__ ((aligned (BYTE_ALIGNMENT)));
	  __m128d x1v1, x1v2, x2v1, x2v2, dv1, dv2;

	  x1 = &(tipVector[4 * tipX1[i]]);

	  if(isGap(x2_gap, i))
	    x2 = x2_gapColumn;
	  else
	    {
	      x2 = x2_ptr;
	      x2_ptr += 4;
	    }
	  
	  diagptable = &diagptable_start[4 * cptr[i]];
	  	    	  
	  x1v1 =  _mm_load_pd(&x1[0]);
	  x1v2 =  _mm_load_pd(&x1[2]);
	  x2v1 =  _mm_load_pd(&x2[0]);
	  x2v2 =  _mm_load_pd(&x2[2]);
	  dv1  =  _mm_load_pd(&diagptable[0]);
	  dv2  =  _mm_load_pd(&diagptable[2]);
	  
	  x1v1 = _mm_mul_pd(x1v1, x2v1);
	  x1v1 = _mm_mul_pd(x1v1, dv1);
	  
	  x1v2 = _mm_mul_pd(x1v2, x2v2);
	  x1v2 = _mm_mul_pd(x1v2, dv2);
	  
	  x1v1 = _mm_add_pd(x1v1, x1v2);
	  
	  _mm_store_pd(t, x1v1);
	  	  
	  term = LOG(FABS(t[0] + t[1]));
	      
	 

	  sum += wptr[i] * term;
	}	
    }               
  else
    {
      for (i = 0; i < n; i++) 
	{
	  /* inner-node case: the remainder of this example was truncated in the
	     original listing */
	}
    }

  return sum;
}
Example #17
void
interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gdouble *o = op, *a = ap, *ic = icp;
  __m128d f[4], t[4];
  const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride),
    (gdouble *) ((gint8 *) a + 1 * astride),
    (gdouble *) ((gint8 *) a + 2 * astride),
    (gdouble *) ((gint8 *) a + 3 * astride)
  };

  f[0] = _mm_load1_pd (ic + 0);
  f[1] = _mm_load1_pd (ic + 1);
  f[2] = _mm_load1_pd (ic + 2);
  f[3] = _mm_load1_pd (ic + 3);

  for (i = 0; i < len; i += 2) {
    t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
    t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
    t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
    t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
    t[0] = _mm_add_pd (t[0], t[1]);
    t[2] = _mm_add_pd (t[2], t[3]);
    _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
  }
}
Example #18
void gblas_quantizer::quantization(const double* input, double* output, int rows, int cols)
{
  std::cerr << "Deprecated method: gblas_quantizer::quantization()" << std::endl;
  exit(0);
  
  //  for (int i=0; i < rows; i++)
  //  {
  //    for (int j=0; j < cols; j++)
  //    {
  //      output[i*cols + j] = quantize_sample(&input[i*cols + j]);
  //    }
  //  }
  
  //  for (int i=0; i < rows*cols; i++)
  //  {
  //    output[i] = (int)(input[i]/gblas_status.q_step + ZERO_DOT_FIVE); //quantize_sample(&input[i]);
  //  }
  __m128d curr;
  __m128d inv_q_step  = _mm_div_pd(_mm_set1_pd(1.0), _mm_set1_pd(q_step));
  const double* in_p  = input;
  double* out_p = output;
  
  for (int i=((rows*cols) >> 1); i > 0; i--)
  {
    curr = _mm_load_pd(in_p); in_p += 2;
    curr = _mm_mul_pd(curr, inv_q_step);    
    curr = _mm_add_pd(curr, _MM_ZERO_DOT_FIVE_D);
    curr = _mm_cvtepi32_pd(_mm_cvttpd_epi32(curr));
    _mm_store_pd(out_p, curr);  out_p += 2;
  }
}
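_MM_ZERO_DOT_FIVE_D is defined elsewhere in gblas; it is presumably a packed 0.5 used to round half up before the truncating int conversion. A plausible definition, stated as an assumption:

static const __m128d _MM_ZERO_DOT_FIVE_D = _mm_set1_pd(0.5);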
Example #19
static double* copy_block(int lda, int M, int N, double* A, double* new_A) {

    int M_even = turn_even(M);
    int N_even = turn_even(N);
    int i_step;
    __m128d a;

    for (int j=0; j<N; j++) {
        for (int i=0; i<M; i+=I_STRIDE) {
            i_step = min(I_STRIDE, M-i);
            if (i_step==1) {            
                new_A[i+j*M_even] = A[i+j*lda];
            } else {
                a = _mm_loadu_pd(A+i+j*lda);
                _mm_store_pd(new_A+i+j*M_even, a);
            }
        }
    }
    if (N % 2) {
        for (int i=0; i<M_even; i++) {
            new_A[i+(N_even-1)*M_even] = 0.0;
        }
    } 
    return new_A;
}
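I_STRIDE, turn_even and min come from the surrounding dgemm source and are not shown; the aligned _mm_store_pd into new_A also presumes new_A is 16-byte aligned (M_even is even by construction). Minimal stand-ins under the obvious assumptions would be:

#define I_STRIDE 2                                   /* one _mm_store_pd covers two doubles */
#define min(a, b) (((a) < (b)) ? (a) : (b))
static int turn_even (int x) { return (x % 2) ? x + 1 : x; }   /* round up to an even count */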
Example #20
        double  dsquared_nrm2(unsigned int N, const double *x) {
            flops_counter += (2*N) ;
#ifdef GX_SSE
            if(SSE2_supported) {
                __m128d X1, X2;
                __m128d acc1 = _mm_setzero_pd() ;
                __m128d acc2 = _mm_setzero_pd() ;
                SSE_ALIGNED(double temp[2]) ;
                unsigned int i = 0 ;
                while(i<N) {
                    _mm_prefetch((const char*)(&x[i] + 128), _MM_HINT_NTA) ;
                    X1 = _mm_load_pd(&x[i]) ;
                    acc1 = _mm_add_pd(acc1, _mm_mul_pd(X1,X1)) ;
                    i += 2 ;
                    X2 = _mm_load_pd(&x[i]) ;
                    acc2 = _mm_add_pd(acc2, _mm_mul_pd(X2,X2)) ;
                    i += 2 ;
                }
                acc1 = _mm_add_pd(acc1, acc2) ;
                _mm_store_pd(temp, acc1)  ;
                return temp[0] + temp[1] ;
            }
#endif
            double result = 0.0 ;
            for(unsigned int i=0; i<N; i++) {
                result += x[i]*x[i] ;
            }
            return result ;
        }
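SSE_ALIGNED, SSE2_supported and flops_counter are provided by the enclosing library and are not shown; the loop itself assumes N is a multiple of 4 and x is 16-byte aligned. The macro presumably just forces 16-byte alignment of the local spill buffer; a GCC/Clang-style sketch (MSVC would use __declspec(align(16))):

// Assumed macro: declare a 16-byte-aligned local variable.
#define SSE_ALIGNED(decl) decl __attribute__((aligned(16)))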
Example #21
template <> void Matrix<double>::set(double value)
{
#ifdef _DEBUG
	if(height==0 || width==0)
		throw std::invalid_argument("Impossible to set value for ghost matrix");
#endif
	double* x = pData;
	int n = width*height;
    int i;
    __m128d XMM0 = _mm_set1_pd(value);
    for (i = 0; i < n; i += 8) {	/* assumes n is a multiple of 8 and pData is 16-byte aligned */
        _mm_store_pd((x)+i  , XMM0); 
        _mm_store_pd((x)+i+2, XMM0); 
        _mm_store_pd((x)+i+4, XMM0); 
        _mm_store_pd((x)+i+6, XMM0); 
    } 
}
Example #22
void
f2 (__m128d x)
{
  struct S s;
  _mm_store_pd ((double *) &s.d, x);
  __real__ s.d *= 7.0;
  bar (s);
}
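f2 is a compiler-test fragment; struct S and bar are declared elsewhere in the test file. Since __real__ is applied to s.d, the member must be a complex double. An assumed reconstruction of the missing declarations:

#include <emmintrin.h>

struct S { _Complex double d; };   /* assumed: _mm_store_pd writes the real and imaginary parts */
void bar (struct S);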
Example #23
void test_mm_store_pd(double* A, __m128d B) {
  // DAG-LABEL: test_mm_store_pd
  // DAG: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16
  //
  // ASM-LABEL: test_mm_store_pd
  // ASM: movapd
  _mm_store_pd(A, B);
}
Example #24
/* xvm_neg:
 *   Return the component-wise negation of the given vector:
 *       r = -x
 */
void xvm_neg(double r[], const double x[], uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	const __m128d vz = _mm_setzero_pd();
	for (uint64_t n = 0; n < N; n += 4) {
		const __m128d x0 = _mm_load_pd(x + n    );
		const __m128d x1 = _mm_load_pd(x + n + 2);
		const __m128d r0 = _mm_sub_pd(vz, x0);
		const __m128d r1 = _mm_sub_pd(vz, x1);
		_mm_store_pd(r + n,     r0);
		_mm_store_pd(r + n + 2, r1);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = -x[n];
#endif
}
Example #25
/* xvm_sub:
 *   Return the difference of the two given vector:
 *       r = x .- y
 */
void xvm_sub(double r[], const double x[], const double y[], uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	assert(y != NULL && ((uintptr_t)y % 16) == 0);
	for (uint64_t n = 0; n < N; n += 4) {
		const __m128d x0 = _mm_load_pd(x + n    );
		const __m128d x1 = _mm_load_pd(x + n + 2);
		const __m128d y0 = _mm_load_pd(y + n    );
		const __m128d y1 = _mm_load_pd(y + n + 2);
		const __m128d r0 = _mm_sub_pd(x0, y0);
		const __m128d r1 = _mm_sub_pd(x1, y1);
		_mm_store_pd(r + n,     r0);
		_mm_store_pd(r + n + 2, r1);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = x[n] - y[n];
#endif
}
Example #26
il_vec2 il_vec2_div(il_vec2 a, il_vec2 b, il_vec2 vec)
{
     if (!vec) {
         vec = il_vec2_new();
     }
#ifdef IL_SSE
    _mm_store_pd(vec, _mm_div_pd(_mm_load_pd(a), _mm_load_pd(b)));
#else
    vec[0] = a[0] / b[0];
    vec[1] = a[1] / b[1];
#endif
    return vec;
}
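il_vec2 and il_vec2_new belong to the surrounding vector-math library; the aligned _mm_load_pd/_mm_store_pd calls imply that an il_vec2 points at two 16-byte-aligned doubles. An assumed minimal definition, only for reading the example in isolation:

#include <stdlib.h>

typedef double *il_vec2;        /* assumed: two contiguous, 16-byte-aligned doubles */

static il_vec2 il_vec2_new(void)
{
    void *v = NULL;
    posix_memalign(&v, 16, 2 * sizeof(double));
    return (il_vec2) v;
}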
Example #27
SSE_FUNCTION static void
add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
{
    __m128d xmm0, xmm1;
    while (((long)dest & 15) && (0 < n)) {
        *dest++ = *src1++ + *src2++;
        n--;
    }
    while (3 < n) {
        xmm0 = _mm_loadu_pd(src1);
        xmm1 = _mm_loadu_pd(src2);
        xmm0 = _mm_add_pd(xmm0, xmm1);
        _mm_store_pd(dest, xmm0);

        xmm0 = _mm_loadu_pd(src1+2);
        xmm1 = _mm_loadu_pd(src2+2);
        xmm0 = _mm_add_pd(xmm0, xmm1);
        _mm_store_pd(dest+2, xmm0);
        dest += 4;
        src1 += 4;
        src2 += 4;
        n -= 4;
    }
    while (1 < n) {
        xmm0 = _mm_loadu_pd(src1);
        xmm1 = _mm_loadu_pd(src2);
        xmm0 = _mm_add_pd(xmm0, xmm1);
        _mm_store_pd(dest, xmm0);
        dest += 2;
        src1 += 2;
        src2 += 2;
        n -= 2;
    }
    while (0 < n) {
        *dest++ = *src1++ + *src2++;
        n--;
    }
}
Example #28
int fft2_(double *a, double *b, int *m)
{
    int i, i0, i1;
    /* double x0, y0, x1, y1; */
    __m128d t0, t1;

    for (i = 0; i < *m ; i++) {
	i0 = i << 1;
	i1 = i0 + (*m << 1);
	/* x0 = a[i0];
	y0 = a[i0 + 1];
	x1 = a[i1];
	y1 = a[i1 + 1]; */
	t0 = _mm_load_pd(&a[i0]);
	t1 = _mm_load_pd(&a[i1]);
	/* b[i0] = x0 + x1;
	b[i0 + 1] = y0 + y1;
	b[i1] = x0 - x1;
	b[i1 + 1] = y0 - y1; */
	_mm_store_pd(&b[i0], _mm_add_pd(t0, t1));
	_mm_store_pd(&b[i1], _mm_sub_pd(t0, t1));
    }
    return 0;
}
Example #29
/* xvm_axpy:
 *   Return the sum of x scaled by a and y:
 *       r = a * x + y
 */
void xvm_axpy(double r[], double a, const double x[], const double y[],
		uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	assert(y != NULL && ((uintptr_t)y % 16) == 0);
	const __m128d va = _mm_set1_pd(a);
	for (uint64_t n = 0; n < N; n += 4) {
		const __m128d x0 = _mm_load_pd(x + n    );
		const __m128d x1 = _mm_load_pd(x + n + 2);
		const __m128d y0 = _mm_load_pd(y + n    );
		const __m128d y1 = _mm_load_pd(y + n + 2);
		const __m128d t0 = _mm_mul_pd(x0, va);
		const __m128d t1 = _mm_mul_pd(x1, va);
		const __m128d r0 = _mm_add_pd(t0, y0);
		const __m128d r1 = _mm_add_pd(t1, y1);
		_mm_store_pd(r + n,     r0);
		_mm_store_pd(r + n + 2, r1);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = a * x[n] + y[n];
#endif
}
Example #30
void
interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gdouble *o = op, *a = ap, *ic = icp;
  __m128d f[2], t1, t2;
  const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride),
    (gdouble *) ((gint8 *) a + 1 * astride)
  };

  f[0] = _mm_load1_pd (ic + 0);
  f[1] = _mm_load1_pd (ic + 1);

  for (i = 0; i < len; i += 4) {
    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
    _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));

    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
    _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
  }
}