Exemplo n.º 1
0
/*
 * Dot product of two double-precision arrays, accumulated with vector
 * fused multiply-add in two independent accumulators.
 *
 * N           number of elements to read from a and b; N <= 0 yields 0.0.
 * a, b        input arrays, read as a[0..N-1] and b[0..N-1].
 * incx, incy  accepted for signature compatibility only; this routine
 *             always walks both arrays with unit stride.
 *
 * Returns the sum of a[i] * b[i] for i in [0, N).
 *
 * NOTE(review): assumes LOAD(p) reads the two consecutive doubles at p into
 * a two-lane vector and set_vector(v) broadcasts v to both lanes -- both are
 * defined outside this block; confirm against their definitions.
 */
double ddot(const int N, const double *a, const int incx, const double *b, const int incy)
{
	int i;
	/* Largest multiple of 4 not exceeding N (for N >= 0); hoisted out of the loop test. */
	const int n4 = N - N % 4;

	vtype q00 = set_vector(0.);
	vtype q01 = set_vector(0.);
	vtype q0a, q1a;
	vtype q0b, q1b;
	double c;

	/* Strides are intentionally ignored (see header comment). */
	(void)incx;
	(void)incy;

	/* Four elements per iteration: lanes i..i+1 feed q00, lanes i+2..i+3 feed q01. */
	for (i = 0; i < n4; i = i + 4)
	{
		q0a = LOAD(a + i);
		q0b = LOAD(b + i);
		q00 = vfmaq_f64(q00, q0a, q0b);

		/* The second pair must be multiplied by its own b lanes (q1b), not by q0b. */
		q1a = LOAD(a + i + 2);
		q1b = LOAD(b + i + 2);
		q01 = vfmaq_f64(q01, q1a, q1b);
	}

	/* Horizontal reduction of both accumulators. */
	c = q00[0] + q00[1] + q01[0] + q01[1];

	/* Scalar tail: the remaining N % 4 elements that the vector loop did not cover. */
	for (; i < N; i++)
	{
		c += a[i] * b[i];
	}

	return c;
}
Exemplo n.º 2
0
/*
 * Thin wrapper that forwards its three two-lane double vectors to
 * vfmaq_f64(a1, a2, a3) and returns the result unchanged.
 *
 * NOTE(review): the comments inside the body look like LLVM FileCheck
 * directives verified against the compiler's output rather than ordinary
 * remarks: they expect a call to llvm.fma.v2f64 whose operands appear in the
 * order a2, a3, a1, immediately followed by a return. Keep their text and
 * position exactly as they are -- confirm against the test's RUN line.
 */
float64x2_t test_vfmaq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
  // CHECK: test_vfmaq_f64
  return vfmaq_f64(a1, a2, a3);
  // CHECK: llvm.fma.v2f64({{.*a2, .*a3, .*a1}})
  // CHECK-NEXT: ret
}
Exemplo n.º 3
0
/*
 * Dot product kernel: returns the sum of y[k*inc_y] * x[k*inc_x] for
 * k in [0, n).
 *
 * n             number of element pairs; n < 0 returns 0.0 immediately, and
 *               n == 0 falls through the loops and also returns 0.0.
 * x, y          input arrays (only read here).
 * inc_x, inc_y  element strides. When both equal 1 a vectorised path is
 *               taken; otherwise a scalar strided loop is used, indexing
 *               from element 0 with ix += inc_x / iy += inc_y.
 *
 * NOTE(review): the fast path reinterprets x and y as float64x2_t, so FLOAT
 * is presumably double in this build -- confirm against the build flags.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;

	FLOAT  dot = 0.0 ;

	if ( n < 0 )  return(dot);

	if ( (inc_x == 1) && (inc_y == 1) )
	{
		/* Four independent two-lane accumulators: 8 elements per iteration. */
		float64x2_t vdot0 = {0.0, 0.0};
		float64x2_t vdot1 = {0.0, 0.0};
		float64x2_t vdot2 = {0.0, 0.0};
		float64x2_t vdot3 = {0.0, 0.0};
		float64x2_t *vx = (float64x2_t*)x;
		float64x2_t *vy = (float64x2_t*)y;

		/* Warm-up prefetches ahead of the first iterations. */
		prefetch(x + 2*128/sizeof(*x));
		prefetch(y + 2*128/sizeof(*y));
		prefetch(x + 3*128/sizeof(*x));
		prefetch(y + 3*128/sizeof(*y));

		/*
		 * n rounded down to a multiple of 8. Kept as BLASLONG: storing it
		 * in an int would truncate for n > INT_MAX, which could make the
		 * bound negative and send the scalar tail below out of bounds.
		 */
		BLASLONG n1 = n & -8;

		while(i < n1)
		{
			vdot0 = vy[0] * vx[0] + vdot0;
			vdot1 = vy[1] * vx[1] + vdot1;
			vdot2 = vy[2] * vx[2] + vdot2;
			vdot3 = vy[3] * vx[3] + vdot3;

			vy += 4;
			vx += 4;
			i += 8;
			/*
			 * NOTE(review): vx/vy are vector pointers, so this offset is
			 * counted in 2-element vectors, unlike the FLOAT-pointer
			 * prefetches above -- the distance differs; left unchanged as
			 * it is only a hint. Confirm whether that is intended.
			 */
			prefetch(vx + 3*128/sizeof(*x));
			prefetch(vy + 3*128/sizeof(*y));
		}

		/* Horizontal reduction, pairwise, in the same order as before. */
		dot = vaddvq_f64 (vdot0 + vdot1);
		dot += vaddvq_f64 (vdot2 + vdot3);

		/* Scalar tail: i == n1 here; handle the remaining n % 8 elements. */
		while(i < n)
		{
			dot += y[i] * x[i] ;
			i++ ;
		}
		return(dot);
	}

	/* Generic strided path. */
	while(i < n)
	{
		dot += y[iy] * x[ix] ;
		ix  += inc_x ;
		iy  += inc_y ;
		i++ ;
	}
	return(dot);

}