double ddot(const int N, const double *a, const int incx, const double *b, const int incy) { int i; vtype q00 = set_vector(0.); vtype q01 = set_vector(0.); vtype q0a, q1a; vtype q0b, q1b; // double c; // for (i = 0; i < N - N%4; i = i + 4) { q0a = LOAD(a + i); q0b = LOAD(b + i); q00 = vfmaq_f64(q00, q0a, q0b); //q0a = vmulq_f64(q0a, q0b); //q00 = vaddq_f64(q0a, q00); // q0a = LOAD(a + i + 2); q1b = LOAD(b + i + 2); q01 = vfmaq_f64(q01, q0a, q0b); //q1a = vmulq_f64(q1a, q1b); //q01 = vaddq_f64(q1a, q01); //c += a [i]*b [i]; } c = q00[0] + q00[1] + q01[0] + q01[1]; return c; }
/*
 * Codegen test wrapper: forwards its three vector arguments unchanged to the
 * vfmaq_f64 intrinsic and returns the result.
 *
 * The comments inside the body appear to be FileCheck-style directives
 * (presumably consumed by the test harness -- confirm); they expect a call to
 * llvm.fma.v2f64 with the operands ordered a2, a3, a1, followed directly by a
 * return. Keep them, and the statement they annotate, exactly as written.
 */
float64x2_t test_vfmaq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
  // CHECK: test_vfmaq_f64
  return vfmaq_f64(a1, a2, a3);
  // CHECK: llvm.fma.v2f64({{.*a2, .*a3, .*a1}})
  // CHECK-NEXT: ret
}
/*
 * Dot-product kernel: returns the sum of y[k] * x[k] over n elements.
 *
 * n             element count; a negative n returns 0.0 immediately.
 * x, y          input vectors.
 * inc_x, inc_y  element strides for x and y.
 *
 * When both strides are 1 the vectors are processed eight elements per
 * iteration using four two-lane vector accumulators, followed by a scalar
 * loop for the remaining (n mod 8) elements. Any other stride combination
 * falls back to a plain scalar loop that advances by inc_x / inc_y.
 *
 * FLOAT, BLASLONG, CNAME and prefetch are defined elsewhere in the project;
 * the vector path treats FLOAT data as float64x2_t lanes, so FLOAT is
 * presumably double here -- TODO confirm.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i = 0;
    FLOAT dot = 0.0;

    if (n < 0)
        return (dot);

    if ((inc_x == 1) && (inc_y == 1)) {
        float64x2_t vdot0 = {0.0, 0.0};
        float64x2_t vdot1 = {0.0, 0.0};
        float64x2_t vdot2 = {0.0, 0.0};
        float64x2_t vdot3 = {0.0, 0.0};
        float64x2_t *vx = (float64x2_t *)x;
        float64x2_t *vy = (float64x2_t *)y;

#if 0
        prefetch(x + 128/sizeof(*x));
        prefetch(y + 128/sizeof(*y));
#endif
        prefetch(x + 2*128/sizeof(*x));
        prefetch(y + 2*128/sizeof(*y));
        prefetch(x + 3*128/sizeof(*x));
        prefetch(y + 3*128/sizeof(*y));

        /*
         * Largest multiple of 8 not exceeding n. Kept in BLASLONG so the
         * bound is not truncated when n does not fit in an int.
         */
        BLASLONG n1 = n & -(BLASLONG)8;

        while (i < n1) {
#if 0
            vdot0 = vfmaq_f64 (vdot0, vy[0], vx[0]);
            vdot1 = vfmaq_f64 (vdot1, vy[1], vx[1]);
            vdot2 = vfmaq_f64 (vdot2, vy[2], vx[2]);
            vdot3 = vfmaq_f64 (vdot3, vy[3], vx[3]);
#else
            vdot0 = vy[0] * vx[0] + vdot0;
            vdot1 = vy[1] * vx[1] + vdot1;
            vdot2 = vy[2] * vx[2] + vdot2;
            vdot3 = vy[3] * vx[3] + vdot3;
#endif
            vy += 4;
            vx += 4;
            i += 8;

            /*
             * NOTE(review): the offset is computed with sizeof(*x) but applied
             * to a float64x2_t pointer, so it is scaled by the vector size
             * rather than the element size. Left unchanged; confirm the
             * intended prefetch distance.
             */
            prefetch(vx + 3*128/sizeof(*x));
            prefetch(vy + 3*128/sizeof(*y));
        }

        /* Reduce the four accumulators in the same pairing as before. */
        dot = vaddvq_f64(vdot0 + vdot1);
        dot += vaddvq_f64(vdot2 + vdot3);

        /* Scalar tail; i already equals n1 when the vector loop exits. */
        for (; i < n; i++) {
            dot += y[i] * x[i];
        }

        return (dot);
    }

    /* General strided path. */
    BLASLONG ix = 0;
    BLASLONG iy = 0;

    for (; i < n; i++) {
        dot += y[iy] * x[ix];
        ix += inc_x;
        iy += inc_y;
    }

    return (dot);
}