Example No. 1
/* xvm_dot:
 *   Return the dot product of the two given vectors.
 */
double xvm_dot(const double x[], const double y[], uint64_t N) {
	double r = 0.0;
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	assert(y != NULL && ((uintptr_t)y % 16) == 0);
	uint64_t n, d = N % 4;
	__m128d s0 = _mm_setzero_pd();
	__m128d s1 = _mm_setzero_pd();
	for (n = 0; n < N - d; n += 4) {
		const __m128d x0 = _mm_load_pd(x + n    );
		const __m128d x1 = _mm_load_pd(x + n + 2);
		const __m128d y0 = _mm_load_pd(y + n    );
		const __m128d y1 = _mm_load_pd(y + n + 2);
		const __m128d r0 = _mm_mul_pd(x0, y0);
		const __m128d r1 = _mm_mul_pd(x1, y1);
		s0 = _mm_add_pd(s0, r0);
		s1 = _mm_add_pd(s1, r1);
	}
	s0 = _mm_add_pd(s0, s1);
	s1 = _mm_shuffle_pd(s0, s0, _MM_SHUFFLE2(1, 1));
	s0 = _mm_add_pd(s0, s1);
	_mm_store_sd(&r, s0);
	for ( ; n < N; n++)
		r += x[n] * y[n];
#else
	for (uint64_t n = 0; n < N; n++)
		r += x[n] * y[n];
#endif
	return r;
}
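A minimal usage sketch for xvm_dot (assuming _mm_malloc/_mm_free for the 16-byte alignment the asserts demand; the original library may supply its own aligned allocator):

#include <stdio.h>
#include <emmintrin.h>   /* SSE2 intrinsics; also brings in _mm_malloc on most toolchains */

int main(void) {
    enum { N = 8 };
    double *x = _mm_malloc(N * sizeof(double), 16);
    double *y = _mm_malloc(N * sizeof(double), 16);
    for (int i = 0; i < N; i++) { x[i] = i; y[i] = 1.0; }
    printf("%f\n", xvm_dot(x, y, N));   /* prints 28.000000 */
    _mm_free(x);
    _mm_free(y);
    return 0;
}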
Example No. 2
/* use compiler intrinsics for 2x parallel processing */
static inline double chi2_intrinsic_double(int n, const double* x, const double* y) {
    double result=0;
    const __m128d eps = _mm_set1_pd(DBL_MIN);
    const __m128d zero = _mm_setzero_pd();
    __m128d chi2 = _mm_setzero_pd();    

    for ( ; n>1; n-=2) {
        const __m128d a = _mm_loadu_pd(x);
        const __m128d b = _mm_loadu_pd(y);
        x += 2;
        y += 2;
        const __m128d a_plus_b = _mm_add_pd(a,b);
        const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b,eps);
        const __m128d a_minus_b = _mm_sub_pd(a,b);
        const __m128d a_minus_b_sq = _mm_mul_pd(a_minus_b, a_minus_b);
        const __m128d quotient = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_pd(chi2, quotient);
    }
    const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0,1));
    const __m128d sum = _mm_add_pd(chi2, shuffle);
// with SSE3, we could use hadd_pd, but the difference is negligible 

    _mm_store_sd(&result,sum);
    _mm_empty(); /* EMMS; only needed after MMX code, harmless here */
    if (n)
        result += chi2_baseline_double(n, x, y); // remaining entries
    return result;
}
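As the comment above notes, SSE3's _mm_hadd_pd can replace the shuffle-and-add horizontal sum. A minimal sketch of that variant, assuming SSE3 is available (not part of the original kernel):

#include <pmmintrin.h>   /* SSE3 */

/* horizontal sum of a __m128d: returns lane0 + lane1 */
static inline double hsum_pd_sse3(__m128d v) {
    double r;
    v = _mm_hadd_pd(v, v);   /* both lanes now hold lane0 + lane1 */
    _mm_store_sd(&r, v);
    return r;
}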
Example No. 3
        double  dsquared_nrm2(unsigned int N, const double *x) {
            flops_counter += (2*N) ;
#ifdef GX_SSE
            if(SSE2_supported) {
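                /* assumes x is 16-byte aligned and N is a multiple of 4 */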
                __m128d X1, X2;
                __m128d acc1 = _mm_setzero_pd() ;
                __m128d acc2 = _mm_setzero_pd() ;
                SSE_ALIGNED(double temp[2]) ;
                unsigned int i = 0 ;
                while(i<N) {
                    _mm_prefetch((const char*)(&x[i] + 128), _MM_HINT_NTA) ;
                    X1 = _mm_load_pd(&x[i]) ;
                    acc1 = _mm_add_pd(acc1, _mm_mul_pd(X1,X1)) ;
                    i += 2 ;
                    X2 = _mm_load_pd(&x[i]) ;
                    acc2 = _mm_add_pd(acc2, _mm_mul_pd(X2,X2)) ;
                    i += 2 ;
                }
                acc1 = _mm_add_pd(acc1, acc2) ;
                _mm_store_pd(temp, acc1)  ;
                return temp[0] + temp[1] ;
            }
#endif
            double result = 0.0 ;
            for(unsigned int i=0; i<N; i++) {
                result += x[i]*x[i] ;
            }
            return result ;
        }
Example No. 4
double	vector_ps_double (const double* pa,const double* pb,size_t n)
{
    size_t k;
    /* process the vectors 4 doubles at a time */
    size_t q = n / 4;
    size_t r = n % 4;
    double w;
    _mm_prefetch (pa,_MM_HINT_NTA);
    _mm_prefetch (pb,_MM_HINT_NTA);
    if (q > 0) {
	__m128d acc1 = _mm_setzero_pd();
	__m128d acc2 = _mm_setzero_pd();
	if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
	    for (k=0;k<q;k++) {
		/* Load 2 doubles from each array */
		__m128d i1 = _mm_load_pd(pa);
		__m128d j1 = _mm_load_pd(pb);
		__m128d i2 = _mm_load_pd(pa+2);
		__m128d j2 = _mm_load_pd(pb+2);
		/* advance by 4 doubles in total (2 for i and 2 for j) */
		/* Multiply */
		__m128d s1 = _mm_mul_pd(i1,j1);
		__m128d s2 = _mm_mul_pd(i2,j2);
		pa += 4;
		pb += 4;
		/* Accumulate */
		acc1 = _mm_add_pd(acc1,s1);
		acc2 = _mm_add_pd(acc2,s2);
	    }
	}
	else {
	    for (k=0;k<q;k++) {
		/* Load 2 doubles from each array */
		__m128d i1 = _mm_loadu_pd(pa);
		__m128d j1 = _mm_loadu_pd(pb);
		__m128d i2 = _mm_loadu_pd(pa+2);
		__m128d j2 = _mm_loadu_pd(pb+2);
		/* Multiply */
		__m128d s1 = _mm_mul_pd(i1,j1);
		__m128d s2 = _mm_mul_pd(i2,j2);
		pa += 4;
		pb += 4;
		/* Accumulate */
		acc1 = _mm_add_pd(acc1,s1);
		acc2 = _mm_add_pd(acc2,s2);
	    }
	}
	/* Final sum */
	acc1 = _mm_add_pd(acc1,acc2);
	acc1 = _mm_hadd_pd(acc1,acc1);
	_mm_store_sd(&w,acc1);
    }
    else {
	w = 0;
    }
    for (k=0;k<r;k++)
	w += (*pa++) * (*pb++);
    return w;
}
Example No. 5
	void int_to_fft(__m128d *T, int k, const unsigned __int64 *A, size_t AL)
	{
		size_t fft_length = (size_t)1 << k; /* (size_t)1 avoids shift overflow for k >= 31 */
		__m128d *Tstop = T + fft_length;
		if (fft_length < 8 * AL)
			throw "FFT length is too small.";
		for (size_t c = 0; c < AL; c++){
			unsigned __int64 word = A[c];
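			/* split the 64-bit word into eight bytes, one double per FFT point */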
			*T++ = _mm_set_sd(word & 0xff);
			word >>= 8;
			*T++ = _mm_set_sd(word & 0xff);
			word >>= 8;
			*T++ = _mm_set_sd(word & 0xff);
			word >>= 8;
			*T++ = _mm_set_sd(word & 0xff);
			word >>= 8;
			*T++ = _mm_set_sd(word & 0xff);
			word >>= 8;
			*T++ = _mm_set_sd(word & 0xff);
			word >>= 8;
			*T++ = _mm_set_sd(word & 0xff);
			word >>= 8;
			*T++ = _mm_set_sd(word);
		}
		while (T < Tstop)
			*T++ = _mm_setzero_pd();
	}
Example No. 6
static void
multsum_f64_sse2_unroll4(double *dest,
     const double *src1, int sstr1,
     const double *src2, int sstr2,
     int n)
{
  __m128d t1, t2;
  union {
    __m128d reg;
    double vals[2];
  } sum;
  int i = 0;

  sum.reg = _mm_setzero_pd();
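  /* MULTSUM_SSE2_STRIDED and the OIL_* helpers are assumed in scope
     (they look like liboil's strided-access macros) */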
  while (i < n-3) {
    MULTSUM_SSE2_STRIDED(0);
    MULTSUM_SSE2_STRIDED(2);

    OIL_INCREMENT(src1, 4*sstr1);
    OIL_INCREMENT(src2, 4*sstr2);
    i += 4;
  }
  while (i < n-1) {
    MULTSUM_SSE2_STRIDED(0);

    OIL_INCREMENT(src1, 2*sstr1);
    OIL_INCREMENT(src2, 2*sstr2);
    i+=2;
  }
  *dest = sum.vals[0] + sum.vals[1];
  if (i < n) {
    *dest += (OIL_GET(src1,0,double)*OIL_GET(src2,0,double));
  }
}
Example No. 7
static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum[2], t;
  const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_pd ();

  for (; i < len; i += 4) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
    t = _mm_loadu_pd (a + i + 2);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
  }
  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
Example No. 8
__m128d test_mm_setzero_pd() {
  // DAG-LABEL: test_mm_setzero_pd
  // DAG: store <2 x double> zeroinitializer
  //
  // ASM-LABEL: test_mm_setzero_pd
  // ASM: xorps
  return _mm_setzero_pd();
}
Example No. 9
void dotmul_intrinsic(double *A, double *B, double &C, int SIZE)
{
  register int k;
  double sarr[2] __attribute__((aligned(16))); /* the aligned store below needs 16-byte alignment */

  register __m128d partial_sum = _mm_setzero_pd();
  register __m128d catch_multiplication = _mm_setzero_pd();

  for(k = 0; k < SIZE; k += 2)
  {
    // load 64 bit data (2 x double)
    register __m128d a = _mm_load_pd(&A[k]);
    register __m128d b = _mm_load_pd(&B[k]);

    catch_multiplication = _mm_mul_pd(a, b);
    partial_sum = _mm_add_pd(partial_sum, catch_multiplication);
  }

  _mm_store_pd(sarr, partial_sum); /* plain aligned store; a non-temporal stream to a stack buffer read back immediately gains nothing */
  C = sarr[0] + sarr[1];
}
Example No. 10
/* This routine performs a dgemm operation
 * C := C + A * B
 * where A, B, and C are lda-by-lda matrices stored in row-major order
 * On exit, A and B maintain their input values. */  
void square_dgemm (int lda, double* A, double* B, double* C, int block_size)
{
	/* Do matrix padding first. */
	int step_size = UNROLLING_SIZE * 2;
	int new_size = lda + step_size - lda % step_size;
	double* old_C = C;
	int old_size = lda;
	A = matrix_padding(A, lda, new_size);
	B = matrix_padding(B, lda, new_size);
	// We don't need to copy data from old C to new C,
	// So we handle it separately here.
	posix_memalign((void**)&C, 16, sizeof(double)*new_size*new_size);
	__m128d v_zero = _mm_setzero_pd();
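	/* this zeroing loop appears to assume new_size*new_size is a multiple
	   of 10 (i.e. UNROLLING_SIZE == 5) */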
	for(int i=0; i<new_size*new_size; i+=10) {
		_mm_store_pd(C+i, v_zero);
		_mm_store_pd(C+i+2, v_zero);
		_mm_store_pd(C+i+4, v_zero);
		_mm_store_pd(C+i+6, v_zero);
		_mm_store_pd(C+i+8, v_zero);
	}
	lda = new_size;

#ifdef TRANSPOSE
	/* swap only the upper triangle with the lower one; iterating j over the
	 * full range would swap every pair twice and undo the transpose */
	for (int i = 0; i < lda; ++i)
		for (int j = i + 1; j < lda; ++j) {
			double t = B[i*lda+j];
			B[i*lda+j] = B[j*lda+i];
			B[j*lda+i] = t;
		}
#endif

	/* For each L2-block row of A */
	for (int i = 0; i < lda; i += L2_BLOCK_SIZE) {
		int M = min (L2_BLOCK_SIZE, lda-i);
		/* For each L2-block column of B */
		for (int j = 0; j < lda; j += L2_BLOCK_SIZE) {
			int N = min (L2_BLOCK_SIZE, lda-j);
			/* Accumulate L2-block dgemms into this block of C */
			for (int k = 0; k < lda; k += L2_BLOCK_SIZE) {
			    /* Correct block dimensions if block "goes off edge of" the matrix. */
				int K = min (L2_BLOCK_SIZE, lda-k);
				/* Perform individual block dgemm */
				do_l2_block(lda, M, N, K, A + i*lda + k, B + k*lda + j, C + i*lda + j);
			 }
		}
	}
	// Copy computation result back to the original matrix
	copy_padding_back(old_size, old_C, lda, C);
	free(A);
	free(B);
}
Example No. 11
        void dzero(unsigned int N, double* y) {
#ifdef GX_SSE
            if(SSE2_supported) {
                __m128d Z = _mm_setzero_pd() ;
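                /* assumes y is 16-byte aligned and N is a multiple of 4 */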
                for(unsigned int i=0; i<N; i+=4) {
                    _mm_stream_pd(&y[i], Z) ;
                    _mm_stream_pd(&y[i + 2], Z) ;
                }
                _mm_sfence() ;
                return ;
            }
#endif
            memset(y, 0, N*sizeof(double)) ;
        }
Example No. 12
dcomplex zdotc_( int*      n,
                 dcomplex* x, int* inc_x,
                 dcomplex* z, int* inc_z )
{
	dcomplex* restrict x1;
	dcomplex* restrict z1;
	int                i;
	v2df_t rho1v;
	v2df_t z11v, z12v;
	v2df_t x1v, x1rv;
	dcomplex rho;
	int    n1 = *n;
	int    incx = *inc_x;
	int    incz = *inc_z;

	x1 = x;
	z1 = z;

	rho1v.v = _mm_setzero_pd();

	{
		v2df_t bcac, adbd;
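		/* accumulate conj(x) * z: with x = xr + i*xi and z = zr + i*zi,
		 * addsub((xi*zr, xr*zr), (xr*zi, xi*zi)) = (xi*zr - xr*zi, xr*zr + xi*zi);
		 * the shuffle and sign flip after the loop turn this into (real, imag) */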

		for ( i = 0; i < n1; ++i )
		{
			z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
			z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );

			x1v.v  = _mm_load_pd( ( double* )x1 );
			x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
			bcac.v = x1rv.v * z11v.v;
			adbd.v = x1v.v  * z12v.v;
			rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );

			x1 += incx;
			z1 += incz;
		}

		rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );

		rho1v.d[1] = -rho1v.d[1];
	}

	rho.real = rho1v.d[0];
	rho.imag = rho1v.d[1];

	return rho;
}
Example No. 13
/* xvm_neg:
 *   Return the component-wise negation of the given vector:
 *       r = -x
 */
void xvm_neg(double r[], const double x[], uint64_t N) {
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(r != NULL && ((uintptr_t)r % 16) == 0);
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	const __m128d vz = _mm_setzero_pd();
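	/* assumes the caller pads N to a multiple of 4; otherwise the stores
	   below write up to 3 doubles past the end */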
	for (uint64_t n = 0; n < N; n += 4) {
		const __m128d x0 = _mm_load_pd(x + n    );
		const __m128d x1 = _mm_load_pd(x + n + 2);
		const __m128d r0 = _mm_sub_pd(vz, x0);
		const __m128d r1 = _mm_sub_pd(vz, x1);
		_mm_store_pd(r + n,     r0);
		_mm_store_pd(r + n + 2, r1);
	}
#else
	for (uint64_t n = 0; n < N; n++)
		r[n] = -x[n];
#endif
}
Example No. 14
/* This function mallocs new aligned memory for the matrix and
 * then copies the original values into it. The new size is rounded up
 * to a multiple of the unrolling step, which makes it easier to handle
 * the boundary. The new matrix's layout is:
 *		[[C O],
 *		 [O O]]
 * */
double* matrix_padding(double* old_matrix, int old_size, int new_size){
	double* new_matrix;
	/* Allocate aligned space according to the new size*/
	posix_memalign((void**)&new_matrix, 16, sizeof(double)*new_size*new_size);
	/* Copy data.
	 * Handle odd/even old sizes separately to avoid if-branches
	 * inside the loops.
	 */
	if(old_size%2 == 1) {
		for(int i=0; i<old_size; i++) {
			for(int j=0; j<old_size - 1; j+=2) {
				__m128d v1 = _mm_loadu_pd(old_matrix + i*old_size + j);
				_mm_store_pd(new_matrix + i*new_size + j, v1);
			}
			new_matrix[i*new_size+old_size-1]=old_matrix[(i+1)*old_size-1];
			for(int j=old_size; j<new_size; j++) {
				new_matrix[i*new_size + j] = 0;
			}
	     }
	}else {
	    for(int i=0; i<old_size; i++) {
			for(int j=0; j<old_size; j+=2) {
				__m128d v1 = _mm_loadu_pd(old_matrix + i*old_size + j);
				_mm_store_pd(new_matrix + i*new_size + j, v1);
			}
			for(int j=old_size; j<new_size; j++) {
				new_matrix[i*new_size + j] = 0;
			}
		}
	}
	/* Set extra space with ZERO. */
	__m128d v_zero = _mm_setzero_pd();
	for(int i=old_size; i<new_size; i++) {
		double* addr = new_matrix + i * new_size;
		for(int j=0; j<new_size; j+=10) {
			_mm_store_pd(addr+j, v_zero);
			_mm_store_pd(addr+j+2, v_zero);
			_mm_store_pd(addr+j+4, v_zero);
			_mm_store_pd(addr+j+6, v_zero);
			_mm_store_pd(addr+j+8, v_zero);
		}
	}
	return new_matrix;
}
Example No. 15
int
ResetSampleFrequency ( long samplefreq ) {
    int  i;

    // zero out initial values
    for ( i = 0; i < MAX_ORDER; i++ )
        linprebuf[i] = lstepbuf[i] = loutbuf[i] = rinprebuf[i] = rstepbuf[i] = routbuf[i] = 0.;

    switch ( (int)(samplefreq) ) {
        case 96000: freqindex = 0; break;
        case 88200: freqindex = 1; break;
        case 64000: freqindex = 2; break;
        case 48000: freqindex = 3; break;
        case 44100: freqindex = 4; break;
        case 32000: freqindex = 5; break;
        case 24000: freqindex = 6; break;
        case 22050: freqindex = 7; break;
        case 16000: freqindex = 8; break;
        case 12000: freqindex = 9; break;
        case 11025: freqindex = 10; break;
        case  8000: freqindex = 11; break;
        default:    return INIT_GAIN_ANALYSIS_ERROR;
    }

    sampleWindow = (int) ceil (samplefreq / RMS_WINDOW_TIME);

#ifdef HAVE_SSE2
    lrsum = _mm_setzero_pd();
#else
    lsum         = 0.;
    rsum         = 0.;
#endif
    totsamp      = 0;
    memset ( A, 0, sizeof(A) );

    return INIT_GAIN_ANALYSIS_OK;
}
Example No. 16
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i;
  __m128d f[2], sum[4], t;
  const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride),
    (gdouble *) ((gint8 *) b + 2 * bstride),
    (gdouble *) ((gint8 *) b + 3 * bstride)
  };

  f[0] = _mm_loadu_pd (icoeff + 0);
  f[1] = _mm_loadu_pd (icoeff + 2);
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();

  for (i = 0; i < len; i += 2) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
  }
  sum[0] =
      _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
  sum[1] =
      _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
  sum[2] =
      _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
  sum[3] =
      _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[2] = _mm_add_pd (sum[2], sum[3]);
  sum[0] = _mm_add_pd (sum[0], sum[2]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
Example No. 17
void computeVelocitySSE(const double * const currentCell, const double * const density, double *velocity)
{
    __m128d v0, v1, v2;
    int i;
    v0 = v1 = v2 = _mm_setzero_pd();
    for (i = 0; i < PARAMQ - 1; i += 2)
    {
        __m128d vc, vl0, vl1, vl2;
        __m128i vtemp;
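        /* note: each _mm_loadu_si128 reads four 32-bit lattice velocities but
           only two are converted; the final iteration may read slightly past
           the row if PARAMQ is odd */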

        vc = _mm_loadu_pd(&currentCell[i]);
        vtemp = _mm_loadu_si128((__m128i *)&LATTICEVELOCITIES2[0][i]);
        vl0 = _mm_cvtepi32_pd(vtemp);
        vtemp = _mm_loadu_si128((__m128i *)&LATTICEVELOCITIES2[1][i]);
        vl1 = _mm_cvtepi32_pd(vtemp);
        vtemp = _mm_loadu_si128((__m128i *)&LATTICEVELOCITIES2[2][i]);
        vl2 = _mm_cvtepi32_pd(vtemp);
        v0 = _mm_add_pd(v0, _mm_mul_pd(vc, vl0));
        v1 = _mm_add_pd(v1, _mm_mul_pd(vc, vl1));
        v2 = _mm_add_pd(v2, _mm_mul_pd(vc, vl2));
    }
    v0 = _mm_hadd_pd(v0, v0);
    v1 = _mm_hadd_pd(v1, v1);
    v2 = _mm_hadd_pd(v2, v2);
    _mm_store_sd (&velocity[0], v0);
    _mm_store_sd (&velocity[1], v1);
    _mm_store_sd (&velocity[2], v2);
    if (i < PARAMQ)
    {
        velocity[0] += currentCell[i] * LATTICEVELOCITIES2[0][i];
        velocity[1] += currentCell[i] * LATTICEVELOCITIES2[1][i];
        velocity[2] += currentCell[i] * LATTICEVELOCITIES2[2][i];
    }
    velocity[0] = velocity[0] / (*density);
    velocity[1] = velocity[1] / (*density);
    velocity[2] = velocity[2] / (*density);
}
Example No. 18
Float_t
GetTitleGain ( void )
{
    Float_t  retval;
    int    i;

    retval = analyzeResult ( A, sizeof(A)/sizeof(*A) );

    for ( i = 0; i < (int)(sizeof(A)/sizeof(*A)); i++ ) {
        B[i] += A[i];
        A[i]  = 0;
    }

    for ( i = 0; i < MAX_ORDER; i++ )
        linprebuf[i] = lstepbuf[i] = loutbuf[i] = rinprebuf[i] = rstepbuf[i] = routbuf[i] = 0.f;

    totsamp = 0;
#ifdef HAVE_SSE2
    lrsum = _mm_setzero_pd();
#else
    lsum    = rsum = 0.;
#endif
    return retval;
}
Example No. 19
static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum = _mm_setzero_pd ();

  for (; i < len; i += 8) {
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
            _mm_load_pd (b + i + 0)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
            _mm_load_pd (b + i + 2)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
            _mm_load_pd (b + i + 4)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
            _mm_load_pd (b + i + 6)));
  }
  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
  _mm_store_sd (o, sum);
}
Example No. 20
static double evaluateGTRCATPROT (int *cptr, int *wptr,
				  double *x1, double *x2, double *tipVector,
				  unsigned char *tipX1, int n, double *diagptable_start)
{
  double   sum = 0.0, term;
  double  *diagptable,  *left, *right;
  int     i, l;                           
  
  if(tipX1)
    {                 
      for (i = 0; i < n; i++) 
	{	       	
	  left = &(tipVector[20 * tipX1[i]]);
	  right = &(x2[20 * i]);
	  
	  diagptable = &diagptable_start[20 * cptr[i]];	           	 

	  __m128d tv = _mm_setzero_pd();	    
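	  /* accumulate sum over the 20 states of left[l] * right[l] * diagptable[l] */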
	  
	  for(l = 0; l < 20; l+=2)
	    {
	      __m128d lv = _mm_load_pd(&left[l]);
	      __m128d rv = _mm_load_pd(&right[l]);
	      __m128d mul = _mm_mul_pd(lv, rv);
	      __m128d dv = _mm_load_pd(&diagptable[l]);
	      
	      tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
	    }		 		
	  
	  tv = _mm_hadd_pd(tv, tv);
	  _mm_storel_pd(&term, tv);
  
	  
	  term = LOG(FABS(term));
	  	  
	  sum += wptr[i] * term;
	}      
    }    
  else
    {
    
      for (i = 0; i < n; i++) 
	{		       	      	      
	  left  = &x1[20 * i];
	  right = &x2[20 * i];
	  
	  diagptable = &diagptable_start[20 * cptr[i]];	  	

	  __m128d tv = _mm_setzero_pd();	    
	      	    
	  for(l = 0; l < 20; l+=2)
	    {
	      __m128d lv = _mm_load_pd(&left[l]);
	      __m128d rv = _mm_load_pd(&right[l]);
	      __m128d mul = _mm_mul_pd(lv, rv);
	      __m128d dv = _mm_load_pd(&diagptable[l]);
	      
	      tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
	    }		 		
	  
	  tv = _mm_hadd_pd(tv, tv);
	  _mm_storel_pd(&term, tv);
	  	  
	  term = LOG(FABS(term));	 
	  
	  sum += wptr[i] * term;      
	}
    }
             
  return  sum;         
} 
Example No. 21
static double evaluateGTRGAMMAPROT (int *wptr,
				    double *x1, double *x2,  
				    double *tipVector, 
				    unsigned char *tipX1, int n, double *diagptable)
{
  double   sum = 0.0, term;        
  int     i, j, l;   
  double  *left, *right;              
  
  if(tipX1)
    {               
      for (i = 0; i < n; i++) 
	{

	  __m128d tv = _mm_setzero_pd();
	  left = &(tipVector[20 * tipX1[i]]);	  	  
	  
	  for(j = 0, term = 0.0; j < 4; j++)
	    {
	      double *d = &diagptable[j * 20];
	      right = &(x2[80 * i + 20 * j]);
	      for(l = 0; l < 20; l+=2)
		{
		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
		}		 		
	    }
	  tv = _mm_hadd_pd(tv, tv);
	  _mm_storel_pd(&term, tv);
	  
	  
	 
	  term = LOG(0.25 * FABS(term));
		 
	  
	  sum += wptr[i] * term;
	}    	        
    }              
  else
    {
      for (i = 0; i < n; i++) 
	{	  	 	             
	  __m128d tv = _mm_setzero_pd();	 	  	  
	      
	  for(j = 0, term = 0.0; j < 4; j++)
	    {
	      double *d = &diagptable[j * 20];
	      left  = &(x1[80 * i + 20 * j]);
	      right = &(x2[80 * i + 20 * j]);
	      
	      for(l = 0; l < 20; l+=2)
		{
		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
		}		 		
	    }
	  tv = _mm_hadd_pd(tv, tv);
	  _mm_storel_pd(&term, tv);	  
	  
	
	  term = LOG(0.25 * FABS(term));
	  
	  
	  sum += wptr[i] * term;
	}
    }
       
  return  sum;
}
Example No. 22
static double evaluateGTRGAMMAPROT_GAPPED_SAVE (int *wptr,
						double *x1, double *x2,  
						double *tipVector, 
						unsigned char *tipX1, int n, double *diagptable, 
						double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)					   
{
  double   sum = 0.0, term;        
  int     i, j, l;   
  double  
    *left, 
    *right,
    *x1_ptr = x1,
    *x2_ptr = x2,
    *x1v,
    *x2v;              
  
  if(tipX1)
    {               
      for (i = 0; i < n; i++) 
	{
	  if(x2_gap[i / 32] & mask32[i % 32])
	    x2v = x2_gapColumn;
	  else
	    {
	      x2v = x2_ptr;
	      x2_ptr += 80;
	    }

	  __m128d tv = _mm_setzero_pd();
	  left = &(tipVector[20 * tipX1[i]]);	  	  
	  
	  for(j = 0, term = 0.0; j < 4; j++)
	    {
	      double *d = &diagptable[j * 20];
	      right = &(x2v[20 * j]);
	      for(l = 0; l < 20; l+=2)
		{
		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
		}		 		
	    }

	  tv = _mm_hadd_pd(tv, tv);
	  _mm_storel_pd(&term, tv);
	  

	  
	  term = LOG(0.25 * FABS(term));	  
	  
	  sum += wptr[i] * term;
	}    	        
    }              
  else
    {
      for (i = 0; i < n; i++) 
	{
	  if(x1_gap[i / 32] & mask32[i % 32])
	    x1v = x1_gapColumn;
	  else
	    {
	      x1v = x1_ptr;
	      x1_ptr += 80;
	    }

	  if(x2_gap[i / 32] & mask32[i % 32])
	    x2v = x2_gapColumn;
	  else
	    {
	      x2v = x2_ptr;
	      x2_ptr += 80;
	    }
	  	 	             
	  __m128d tv = _mm_setzero_pd();	 	  	  
	      
	  for(j = 0, term = 0.0; j < 4; j++)
	    {
	      double *d = &diagptable[j * 20];
	      left  = &(x1v[20 * j]);
	      right = &(x2v[20 * j]);
	      
	      for(l = 0; l < 20; l+=2)
		{
		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
		}		 		
	    }
	  tv = _mm_hadd_pd(tv, tv);
	  _mm_storel_pd(&term, tv);	  
	  
	 
	  term = LOG(0.25 * FABS(term));
	
	  
	  sum += wptr[i] * term;
	}         
    }
       
  return  sum;
}
Example No. 23
static double evaluateGTRCATPROT_SAVE (int *cptr, int *wptr,
				       double *x1, double *x2, double *tipVector,
				       unsigned char *tipX1, int n, double *diagptable_start, 
				       double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
{
  double   
    sum = 0.0, 
    term,
    *diagptable,  
    *left, 
    *right,
    *left_ptr = x1,
    *right_ptr = x2;
  
  int     
    i, 
    l;                           
  
  if(tipX1)
    {                 
      for (i = 0; i < n; i++) 
	{	       	
	  left = &(tipVector[20 * tipX1[i]]);

	  if(isGap(x2_gap, i))
	    right = x2_gapColumn;
	  else
	    {
	      right = right_ptr;
	      right_ptr += 20;
	    }	  	 
	  
	  diagptable = &diagptable_start[20 * cptr[i]];	           	 

	  __m128d tv = _mm_setzero_pd();	    
	  
	  for(l = 0; l < 20; l+=2)
	    {
	      __m128d lv = _mm_load_pd(&left[l]);
	      __m128d rv = _mm_load_pd(&right[l]);
	      __m128d mul = _mm_mul_pd(lv, rv);
	      __m128d dv = _mm_load_pd(&diagptable[l]);
	      
	      tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
	    }		 		
	  
	  tv = _mm_hadd_pd(tv, tv);
	  _mm_storel_pd(&term, tv);
    
	  
	  term = LOG(FABS(term));
	  	  
	  sum += wptr[i] * term;
	}      
    }    
  else
    {
    
      for (i = 0; i < n; i++) 
	{		       	      	      	  
	  if(isGap(x1_gap, i))
	    left = x1_gapColumn;
	  else
	    {
	      left = left_ptr;
	      left_ptr += 20;
	    }
	  
	  if(isGap(x2_gap, i))
	    right = x2_gapColumn;
	  else
	    {
	      right = right_ptr;
	      right_ptr += 20;
	    }
	  
	  diagptable = &diagptable_start[20 * cptr[i]];	  	

	  __m128d tv = _mm_setzero_pd();	    
	  
	  for(l = 0; l < 20; l+=2)
	    {
	      __m128d lv = _mm_load_pd(&left[l]);
	      __m128d rv = _mm_load_pd(&right[l]);
	      __m128d mul = _mm_mul_pd(lv, rv);
	      __m128d dv = _mm_load_pd(&diagptable[l]);
	      
	      tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));		   
	    }		 		
	  
	  tv = _mm_hadd_pd(tv, tv);
	  _mm_storel_pd(&term, tv);
	  	  
	  term = LOG(FABS(term));	 
	  
	  sum += wptr[i] * term;      
	}
    }
             
  return  sum;         
} 
Example No. 24
void inplace_center_and_trace_atom_major(float* coords, float* traces, const int n_frames, const int n_atoms)
{
    /* Center a trajectory containing multiple conformations in place.
       The coordinates are stored as float, but the accumulation is done in
       double.

       Also compute the traces of the centered conformations, which are
       necessary for RMSD.
    */
    int i, k;
    float* confp;
    __m128d sx_, sy_, sz_, trace_;
    __m128 mux_, muy_, muz_;
    float sxf, syf, szf;
    double sx[2], sy[2], sz[2], trace[2];
    __m128 x, y, z, x2, y2, z2;

    #ifdef _OPENMP
    #pragma omp parallel for default(none) shared(coords, traces) \
        private(sx_, sy_, sz_, trace_, mux_, muy_, muz_, sxf, syf, szf, \
        confp, i, x, y, z, x2, y2, z2, sx, sy, sz, trace)
    #endif
    for (k = 0; k < n_frames; k++) {
        confp = &coords[k * n_atoms * 3];
        sx_ = sy_ = sz_ = trace_ = _mm_setzero_pd();
        for (i = 0; i < n_atoms/4; i++) {
            aos_deinterleaved_loadu(confp, &x, &y, &z);

            /* accumulate the sums of each coordinate in double */
            /* get the first two values from each float4 */
            sx_ = _mm_add_pd(sx_, _mm_cvtps_pd(x));
            sy_ = _mm_add_pd(sy_, _mm_cvtps_pd(y));
            sz_ = _mm_add_pd(sz_, _mm_cvtps_pd(z));
            /* and shuffle in the second two values */
            sx_ = _mm_add_pd(sx_, _mm_cvtps_pd(_mm_movehl_ps(x, x)));
            sy_ = _mm_add_pd(sy_, _mm_cvtps_pd(_mm_movehl_ps(y, y)));
            sz_ = _mm_add_pd(sz_, _mm_cvtps_pd(_mm_movehl_ps(z, z)));
            confp += 12;
        }
        /* copy the summed coordinates out of the SSE registers */
        _mm_storeu_pd(sx, sx_);
        _mm_storeu_pd(sy, sy_);
        _mm_storeu_pd(sz, sz_);

        /* Add the last couple entries that weren't a factor of four */
        for (i = 0; i < n_atoms % 4; i++) {
            sx[0] += confp[i*3 + 0];
            sy[0] += confp[i*3 + 1];
            sz[0] += confp[i*3 + 2];
        }

        /* Put everything into the first value. We're doing this here, as */
        /* opposed to using a SSE horizontal add. */
        sx[0] += sx[1];
        sy[0] += sy[1];
        sz[0] += sz[1];

        /* Now we want mean x, y, and z positions */
        sx[0] /= n_atoms;
        sy[0] /= n_atoms;
        sz[0] /= n_atoms;

        /* Load these mean positions back into the SSE registers */
        sxf = (float) sx[0];
        syf = (float) sy[0];
        szf = (float) sz[0];
        mux_ = _mm_load1_ps(&sxf);
        muy_ = _mm_load1_ps(&syf);
        muz_ = _mm_load1_ps(&szf);

        /* And subtract them out */
        confp = &coords[k * n_atoms * 3];
        for (i = 0; i < n_atoms/4; i++) {
            aos_deinterleaved_loadu(confp, &x, &y, &z);
            x = _mm_sub_ps(x, mux_);
            y = _mm_sub_ps(y, muy_);
            z = _mm_sub_ps(z, muz_);

            x2 = _mm_mul_ps(x, x);
            y2 = _mm_mul_ps(y, y);
            z2 = _mm_mul_ps(z, z);
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(x2));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(y2));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(z2));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(_mm_movehl_ps(x2, x2)));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(_mm_movehl_ps(y2, y2)));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(_mm_movehl_ps(z2, z2)));

            aos_interleaved_storeu(confp, x, y, z);
            confp += 12;
        }
        _mm_storeu_pd(trace, trace_);

        for (i = 0; i < n_atoms % 4; i++) {
            confp[i*3 + 0] -= sxf;
            confp[i*3 + 1] -= syf;
            confp[i*3 + 2] -= szf;
            trace[0] += confp[i*3 + 0]*confp[i*3 + 0];
            trace[0] += confp[i*3 + 1]*confp[i*3 + 1];
            trace[0] += confp[i*3 + 2]*confp[i*3 + 2];
        }
        trace[0] += trace[1];
        if (traces != NULL)
            traces[k] = (float) trace[0];
    }
}
Example No. 25
void convert_dword_to_double(double* result, UINT32 i) {
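    /* note: _mm_cvtsi32_sd treats its operand as a signed 32-bit int, so
       inputs above INT32_MAX come out negative */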
    __m128d mmin, mmout;
    mmin  = _mm_setzero_pd();
    mmout = _mm_cvtsi32_sd(mmin,i);
    _mm_store_sd(result, mmout);
}
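A sketch of a companion routine that covers the full unsigned 32-bit range (an illustration, not part of the original): convert via the signed path, then add 2^32 back whenever the signed view wrapped negative.

void convert_udword_to_double(double* result, UINT32 i) {
    /* hypothetical helper: compensate for the signed conversion above */
    __m128d v = _mm_cvtsi32_sd(_mm_setzero_pd(), (int)i);
    if ((int)i < 0)                                   /* i >= 2^31 */
        v = _mm_add_sd(v, _mm_set_sd(4294967296.0));  /* += 2^32 */
    _mm_store_sd(result, v);
}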
Example No. 26
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
                /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);
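    /* _mm_nmacc_pd(a, b, c) is FMA4 for -(a*b) + c, so the expressions below
       compute the smoothstep weight s = t*t*(3 - 2*t) */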

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));

    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
Example No. 27
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
Example No. 28
static double evaluateGTRGAMMAPROT (int *ex1, int *ex2, int *wptr,
				    double *x1, double *x2,  
				    double *tipVector, 
				    unsigned char *tipX1, int n, double *diagptable, const boolean fastScaling)
{
  double   sum = 0.0, term;
  int     i, j, l;   
  double  *left, *right;

  assertionError = 0;

  if(tipX1)
    {
      for (i = 0; i < n; i++) 
	{

	  __m128d tv = _mm_setzero_pd();

	  left = &(tipVector[20 * tipX1[i]]);	  	  
	  


	  for(j = 0, term = 0.0; j < 4; j++)
	    {
	      double *d = &diagptable[j * 20];
	      right = &(x2[80 * i + 20 * j]);
	      for(l = 0; l < 20; l+=2)
		{
		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
		}		 		
	    }
	  tv = _mm_hadd_pd(tv, tv);

	  _mm_storel_pd(&term, tv);


//	  [JH] sometimes term contains -0.0000... which causes the log to become NaN
	  if(term < 0.0) {
		  assertionError = 1;
		  problemCount++;
		  printf("tipX1 i=%d, term=%E\n", i, term);
		  term = fabs(term);
	  }

	  if(fastScaling)
	    term = LOG(0.25 * term);
	  else
	    term = LOG(0.25 * term) + (ex2[i] * LOG(minlikelihood));

	  sum += wptr[i] * term;
	}    	        
    }              
  else
    {
      for (i = 0; i < n; i++) 
	{	  	 	             
	  __m128d tv = _mm_setzero_pd();	 	  	  
	      
	  for(j = 0, term = 0.0; j < 4; j++)
	    {
	      double *d = &diagptable[j * 20];
	      left  = &(x1[80 * i + 20 * j]);
	      right = &(x2[80 * i + 20 * j]);
	      
	      for(l = 0; l < 20; l+=2)
		{
		  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
		  tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l])));		   
		}		 		
	    }
	  tv = _mm_hadd_pd(tv, tv);
	  _mm_storel_pd(&term, tv);	  
	  
//	  [JH] sometimes term contains -0.0000... which causes the log to become NaN
	  if(term < 0.0) {
		  assertionError = 1;
		  problemCount++;
		  printf("nontip term=%E\n", term);
		  term = fabs(term);
	  }

	  if(fastScaling)
	    term = LOG(0.25 * term);
	  else
	    term = LOG(0.25 * term) + ((ex1[i] + ex2[i])*LOG(minlikelihood));
	  
	  sum += wptr[i] * term;
	}
    }
       
  return  sum;
}
Example No. 29
static inline void computeVectorGTRGAMMAPROT(double *lVector, int *eVector, double *gammaRates, int i, double qz, double rz,
        traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector,
        unsigned  char **yVector, int mxtips)
{
    double
    *x1,
    *x2,
    *x3;

    int
    s,
    pNumber = ti->pNumber,
    rNumber = ti->rNumber,
    qNumber = ti->qNumber,
    index1[4],
    index2[4];


    x3  = &(lVector[80 * (pNumber  - mxtips)]);

    switch(ti->tipCase)
    {
    case TIP_TIP:
        x1 = &(tipVector[20 * yVector[qNumber][i]]);
        x2 = &(tipVector[20 * yVector[rNumber][i]]);
        for(s = 0; s < 4; s++)
        {
            index1[s] = 0;
            index2[s] = 0;
        }
        break;
    case TIP_INNER:
        x1 = &(tipVector[20 * yVector[qNumber][i]]);
        x2 = &(  lVector[80 * (rNumber - mxtips)]);
        for(s = 0; s < 4; s++)
            index1[s] = 0;
        for(s = 0; s < 4; s++)
            index2[s] = s;
        break;
    case INNER_INNER:
        x1 = &(lVector[80 * (qNumber - mxtips)]);
        x2 = &(lVector[80 * (rNumber - mxtips)]);
        for(s = 0; s < 4; s++)
        {
            index1[s] = s;
            index2[s] = s;
        }
        break;
    default:
        assert(0);
    }

    {
        double
        e1[20] __attribute__ ((aligned (BYTE_ALIGNMENT))),
        e2[20] __attribute__ ((aligned (BYTE_ALIGNMENT))),
        d1[20] __attribute__ ((aligned (BYTE_ALIGNMENT))),
        d2[20] __attribute__ ((aligned (BYTE_ALIGNMENT))),
        lz1, lz2;

        int
        l,
        k,
        scale,
        j;

        for(j = 0; j < 4; j++)
        {
            lz1 = qz * gammaRates[j];
            lz2 = rz * gammaRates[j];

            e1[0] = 1.0;
            e2[0] = 1.0;

            for(l = 1; l < 20; l++)
            {
                e1[l] = EXP(EIGN[l] * lz1);
                e2[l] = EXP(EIGN[l] * lz2);
            }

            for(l = 0; l < 20; l+=2)
            {
                __m128d d1v = _mm_mul_pd(_mm_load_pd(&x1[20 * index1[j] + l]), _mm_load_pd(&e1[l]));
                __m128d d2v = _mm_mul_pd(_mm_load_pd(&x2[20 * index2[j] + l]), _mm_load_pd(&e2[l]));

                _mm_store_pd(&d1[l], d1v);
                _mm_store_pd(&d2[l], d2v);
            }

            __m128d zero = _mm_setzero_pd();

            for(l = 0; l < 20; l+=2)
                _mm_store_pd(&x3[j * 20 + l], zero);

            for(l = 0; l < 20; l++)
            {
                double *ev = &EV[l * 20];
                __m128d ump_x1v = _mm_setzero_pd();
                __m128d ump_x2v = _mm_setzero_pd();
                __m128d x1px2v;

                for(k = 0; k < 20; k+=2)
                {
                    __m128d eiv = _mm_load_pd(&EI[20 * l + k]);
                    __m128d d1v = _mm_load_pd(&d1[k]);
                    __m128d d2v = _mm_load_pd(&d2[k]);
                    ump_x1v = _mm_add_pd(ump_x1v, _mm_mul_pd(d1v, eiv));
                    ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(d2v, eiv));
                }

                ump_x1v = _mm_hadd_pd(ump_x1v, ump_x1v);
                ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);

                x1px2v = _mm_mul_pd(ump_x1v, ump_x2v);

                for(k = 0; k < 20; k+=2)
                {
                    __m128d ex3v = _mm_load_pd(&x3[j * 20 + k]);
                    __m128d EVV  = _mm_load_pd(&ev[k]);
                    ex3v = _mm_add_pd(ex3v, _mm_mul_pd(x1px2v, EVV));

                    _mm_store_pd(&x3[j * 20 + k], ex3v);
                }
            }
        }

        scale = 1;
        for(l = 0; scale && (l < 80); l++)
            scale = ((x3[l] < minlikelihood) && (x3[l] > minusminlikelihood));

        if(scale)
        {
            __m128d twoto = _mm_set_pd(twotothe256, twotothe256);

            for(l = 0; l < 80; l+=2)
            {
                __m128d ex3v = _mm_mul_pd(_mm_load_pd(&x3[l]),twoto);
                _mm_store_pd(&x3[l], ex3v);
            }

            *eVector = *eVector + 1;
        }

        return;
    }
}
Example No. 30
// 3x3 multiply: a = b^{\dagger} . c
void 
multabdag( GLU_complex a[ NCNC ] , 
	   const GLU_complex b[ NCNC ] , 
	   const GLU_complex c[ NCNC ] )
{
  // recast to alignment
  __m128d *A = (__m128d*)a ;
  const __m128d *B = (const __m128d*)b ;
  const __m128d *C = (const __m128d*)c ;
#if NC==3
  // a[0] = conj( b[0] ) * c[0] + conj( b[3] ) * c[3] + conj( b[6] ) * c[6] ;
  *( A + 0 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 0 ) , *( C + 0 ) ) ,
			   _mm_add_pd( SSE2_MULCONJ( *( B + 3 ) , *( C + 3 ) ) ,
				       SSE2_MULCONJ( *( B + 6 ) , *( C + 6 ) ) ) ) ;
  //a[1] = conj( b[0] ) * c[1] + conj( b[3] ) * c[4] + conj( b[6] ) * c[7] ;
  *( A + 1 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 0 ) , *( C + 1 ) ) ,
			   _mm_add_pd( SSE2_MULCONJ( *( B + 3 ) , *( C + 4 ) ) ,
				       SSE2_MULCONJ( *( B + 6 ) , *( C + 7 ) ) ) ) ;
  //a[2] = conj( b[0] ) * c[2] + conj( b[3] ) * c[5] + conj( b[6] ) * c[8] ;
  *( A + 2 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 0 ) , *( C + 2 ) ) ,
			   _mm_add_pd( SSE2_MULCONJ( *( B + 3 ) , *( C + 5 ) ) ,
				       SSE2_MULCONJ( *( B + 6 ) , *( C + 8 ) ) ) ) ;
  // middle row
  //a[3] = conj( b[1] ) * c[0] + conj( b[4] ) * c[3] + conj( b[7] ) * c[6] ;
  *( A + 3 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 1 ) , *( C + 0 ) ) ,
			   _mm_add_pd( SSE2_MULCONJ( *( B + 4 ) , *( C + 3 ) ) ,
				       SSE2_MULCONJ( *( B + 7 ) , *( C + 6 ) ) ) ) ;
  //a[4] = conj( b[1] ) * c[1] + conj( b[4] ) * c[4] + conj( b[7] ) * c[7] ;
  *( A + 4 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 1 ) , *( C + 1 ) ) ,
			   _mm_add_pd( SSE2_MULCONJ( *( B + 4 ) , *( C + 4 ) ) ,
				       SSE2_MULCONJ( *( B + 7 ) , *( C + 7 ) ) ) ) ;
  //a[5] = conj( b[1] ) * c[2] + conj( b[4] ) * c[5] + conj( b[7] ) * c[8] ;
  *( A + 5 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 1 ) , *( C + 2 ) ) ,
			   _mm_add_pd( SSE2_MULCONJ( *( B + 4 ) , *( C + 5 ) ) ,
				       SSE2_MULCONJ( *( B + 7 ) , *( C + 8 ) ) ) ) ;
  //a[6] = conj( b[2] ) * c[0] + conj( b[5] ) * c[3] + conj( b[8] ) * c[6] ; 
  *( A + 6 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 2 ) , *( C + 0 ) ) ,
			   _mm_add_pd( SSE2_MULCONJ( *( B + 5 ) , *( C + 3 ) ) ,
				       SSE2_MULCONJ( *( B + 8 ) , *( C + 6 ) ) ) ) ;
  //a[7] = conj( b[2] ) * c[1] + conj( b[5] ) * c[4] + conj( b[8] ) * c[7] ; 
  *( A + 7 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 2 ) , *( C + 1 ) ) ,
			   _mm_add_pd( SSE2_MULCONJ( *( B + 5 ) , *( C + 4 ) ) ,
				       SSE2_MULCONJ( *( B + 8 ) , *( C + 7 ) ) ) ) ;
  //a[8] = conj( b[2] ) * c[2] + conj( b[5] ) * c[5] + conj( b[8] ) * c[8] ;
  *( A + 8 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 2 ) , *( C + 2 ) ) ,
			   _mm_add_pd( SSE2_MULCONJ( *( B + 5 ) , *( C + 5 ) ) ,
				       SSE2_MULCONJ( *( B + 8 ) , *( C + 8 ) ) ) ) ;
#elif NC==2
  //a[0] = conj( b[0] ) * c[0] + conj( b[2] ) * c[2] ;
  *( A + 0 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 0 ) , *( C + 0 ) ) ,
			   SSE2_MULCONJ( *( B + 2 ) , *( C + 2 ) ) ) ;
  //a[1] = conj( b[0] ) * c[1] + conj( b[2] ) * c[3] ;
  *( A + 1 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 0 ) , *( C + 1 ) ) ,
			   SSE2_MULCONJ( *( B + 2 ) , *( C + 3 ) ) ) ;
  //a[2] = conj( b[1] ) * c[0] + conj( b[3] ) * c[2] ;
  *( A + 2 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 1 ) , *( C + 0 ) ) ,
			   SSE2_MULCONJ( *( B + 3 ) , *( C + 2 ) ) ) ;
  //a[3] = conj( b[1] ) * c[1] + conj( b[3] ) * c[3] ;
  *( A + 3 ) = _mm_add_pd( SSE2_MULCONJ( *( B + 1 ) , *( C + 1 ) ) ,
			   SSE2_MULCONJ( *( B + 3 ) , *( C + 3 ) ) ) ;
#else
  size_t i , j , m ;
  register __m128d sum ;
  for( i = 0 ; i < NC ; i++ ) {
    for( j = 0 ; j < NC ; j++ ) {
      sum = _mm_setzero_pd( ) ;
      for( m = 0 ; m < NC ; m++ ) {
	sum = _mm_add_pd( sum , SSE2_MULCONJ( *( B + i + NC*m ) , *( C + j + NC*m ) ) ) ;
      }
      *( A + j + NC*i ) = sum ;
    }
  }
#endif
  return ;
}