/*
 * diff - element-wise difference of two spinor fields: Q = R - S.
 *
 * Q  destination field (N spinors)
 * R  minuend field     (N spinors)
 * S  subtrahend field  (N spinors)
 * N  number of spinors to process; N <= 0 is a no-op
 *
 * Each spinor is accessed as 24 consecutive doubles, handled as 12 pairs
 * through the paired load/store intrinsics (__lfpd / __stfpd) and the
 * paired subtract (__fpsub).  All 12 pairs of R and S are loaded before
 * any pair of Q is stored.
 *
 * The work is split into a prologue (spinor 0), a main loop (spinors
 * 1 .. N-2) and an epilogue (spinor N-1) so that the following spinor can
 * be prefetched while the current one is processed.
 *
 * Q, R and S are declared 16-byte aligned to the compiler via __alignx,
 * and R and S are declared non-overlapping via #pragma disjoint; callers
 * must honour both.
 * NOTE(review): assumes sizeof(spinor) == 24 * sizeof(double) - confirm
 * against the spinor definition.
 */
void diff(spinor * const Q,spinor * const R,spinor * const S, const int N)
{
  int ix;
  double *s ALIGN;
  double *sp ALIGN;
  double *r ALIGN;
  double *rp ALIGN;
  double *q ALIGN;
  double _Complex x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11;
  double _Complex y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11;
#pragma disjoint(*R, *S)

  /* Nothing to do; in particular do not touch any of the three fields. */
  if (N <= 0) {
    return;
  }

  __alignx(16, Q);
  __alignx(16, R);
  __alignx(16, S);

  r = (double*) R;
  s = (double*) S;
  q = (double*) Q;

  /* rp/sp always point one spinor ahead of r/s. */
  rp = r + 24;
  sp = s + 24;

  /* Only prefetch the look-ahead spinor when it actually exists. */
  if (N > 1) {
    _prefetch_spinor(rp);
    _prefetch_spinor(sp);
  }

  /* ---- prologue: spinor 0 ---- */
  x00 = __lfpd(r);
  x01 = __lfpd(r+2);
  x02 = __lfpd(r+4);
  x03 = __lfpd(r+6);
  x04 = __lfpd(r+8);
  x05 = __lfpd(r+10);
  x06 = __lfpd(r+12);
  x07 = __lfpd(r+14);
  x08 = __lfpd(r+16);
  x09 = __lfpd(r+18);
  x10 = __lfpd(r+20);
  x11 = __lfpd(r+22);
  y00 = __lfpd(s);
  y01 = __lfpd(s+2);
  y02 = __lfpd(s+4);
  y03 = __lfpd(s+6);
  y04 = __lfpd(s+8);
  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12);
  y07 = __lfpd(s+14);
  y08 = __lfpd(s+16);
  y09 = __lfpd(s+18);
  y10 = __lfpd(s+20);
  y11 = __lfpd(s+22);

  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));

  /*
   * With a single spinor the epilogue below would read and write spinor
   * index 1, which is past the end of all three fields.
   */
  if (N == 1) {
    return;
  }

  s = sp;
  r = rp;
  q+=24;

  /* ---- main loop: spinors 1 .. N-2, prefetching one spinor ahead ---- */
#pragma unroll(12)
  for(ix = 1; ix < N-1; ix++) {
    rp+=24;
    sp+=24;
    _prefetch_spinor(rp);
    _prefetch_spinor(sp);

    x00 = __lfpd(r);
    x01 = __lfpd(r+2);
    x02 = __lfpd(r+4);
    x03 = __lfpd(r+6);
    x04 = __lfpd(r+8);
    x05 = __lfpd(r+10);
    x06 = __lfpd(r+12);
    x07 = __lfpd(r+14);
    x08 = __lfpd(r+16);
    x09 = __lfpd(r+18);
    x10 = __lfpd(r+20);
    x11 = __lfpd(r+22);
    y00 = __lfpd(s);
    y01 = __lfpd(s+2);
    y02 = __lfpd(s+4);
    y03 = __lfpd(s+6);
    y04 = __lfpd(s+8);
    y05 = __lfpd(s+10);
    y06 = __lfpd(s+12);
    y07 = __lfpd(s+14);
    y08 = __lfpd(s+16);
    y09 = __lfpd(s+18);
    y10 = __lfpd(s+20);
    y11 = __lfpd(s+22);

    __stfpd(q, __fpsub(x00, y00));
    __stfpd(q+2, __fpsub(x01, y01));
    __stfpd(q+4, __fpsub(x02, y02));
    __stfpd(q+6, __fpsub(x03, y03));
    __stfpd(q+8, __fpsub(x04, y04));
    __stfpd(q+10, __fpsub(x05, y05));
    __stfpd(q+12, __fpsub(x06, y06));
    __stfpd(q+14, __fpsub(x07, y07));
    __stfpd(q+16, __fpsub(x08, y08));
    __stfpd(q+18, __fpsub(x09, y09));
    __stfpd(q+20, __fpsub(x10, y10));
    __stfpd(q+22, __fpsub(x11, y11));

    s = sp;
    r = rp;
    q+=24;
  }

  /* ---- epilogue: spinor N-1 (nothing left to prefetch) ---- */
  x00 = __lfpd(r);
  x01 = __lfpd(r+2);
  x02 = __lfpd(r+4);
  x03 = __lfpd(r+6);
  x04 = __lfpd(r+8);
  x05 = __lfpd(r+10);
  x06 = __lfpd(r+12);
  x07 = __lfpd(r+14);
  x08 = __lfpd(r+16);
  x09 = __lfpd(r+18);
  x10 = __lfpd(r+20);
  x11 = __lfpd(r+22);
  y00 = __lfpd(s);
  y01 = __lfpd(s+2);
  y02 = __lfpd(s+4);
  y03 = __lfpd(s+6);
  y04 = __lfpd(s+8);
  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12);
  y07 = __lfpd(s+14);
  y08 = __lfpd(s+16);
  y09 = __lfpd(s+18);
  y10 = __lfpd(s+20);
  y11 = __lfpd(s+22);

  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));

  return;
}
/*
 * Helpers for the accumulate benchmark in main().
 *
 * Every kernel performs one pass of  dst[i] += scale * src[i]  over dim
 * doubles.  The intrinsic variants differ only in how many paired
 * (two-double) operations are issued per loop iteration.  Each kernel is
 * defined once and used for both the warm-up and the timed run so the two
 * cannot drift apart.
 *
 * NOTE(review): __fxcpmadd(c, a, scale) is taken to yield c + scale*a on
 * both halves of the pair; that is what the comparison against the scalar
 * kernel in main() checks - confirm against the XL built-in reference.
 */

/* Allocates count doubles aligned to 16*sizeof(double) bytes; NULL on failure. */
static double *dpacc_alloc(size_t count)
{
  void *mem = NULL;

  if (posix_memalign(&mem, 16*sizeof(double), count*sizeof(double)) != 0) {
    return NULL;
  }
  return mem;
}

/* Sets v[0 .. dim-1] to 0.0. */
static void dpacc_zero(double *v, int dim)
{
  int i;

  for (i = 0; i < dim; i++) {
    v[i] = 0.0;
  }
}

/* Scalar reference kernel: one double per iteration. */
static void dpacc_basic(double *dst, double *src, double scale, int dim)
{
  int i;

  for (i = 0; i < dim; i++) {
    dst[i] += scale*src[i];
  }
}

/* One paired operation (2 doubles) per iteration; dim must be a multiple of 2. */
static void dpacc_pairs_x1(double *dst, double *src, double scale, int dim)
{
  int i;

  for (i = 0; i < dim; i += 2) {
    __stfpd(&dst[i], __fxcpmadd(__lfpd(&dst[i]), __lfpd(&src[i]), scale));
  }
}

/* Two paired operations (4 doubles) per iteration; dim must be a multiple of 4. */
static void dpacc_pairs_x2(double *dst, double *src, double scale, int dim)
{
  int i;

  for (i = 0; i < dim; i += 4) {
    double _Complex a0, a2, c0, c2;

    a0 = __lfpd(&src[i  ]);
    a2 = __lfpd(&src[i+2]);
    c0 = __lfpd(&dst[i  ]);
    c2 = __lfpd(&dst[i+2]);

    c0 = __fxcpmadd(c0, a0, scale);
    c2 = __fxcpmadd(c2, a2, scale);

    __stfpd(&dst[i  ], c0);
    __stfpd(&dst[i+2], c2);
  }
}

/* Four paired operations (8 doubles) per iteration; dim must be a multiple of 8. */
static void dpacc_pairs_x4(double *dst, double *src, double scale, int dim)
{
  int i;

  for (i = 0; i < dim; i += 8) {
    double _Complex a0, a2, a4, a6;
    double _Complex c0, c2, c4, c6;

    a0 = __lfpd(&src[i  ]);
    a2 = __lfpd(&src[i+2]);
    a4 = __lfpd(&src[i+4]);
    a6 = __lfpd(&src[i+6]);
    c0 = __lfpd(&dst[i  ]);
    c2 = __lfpd(&dst[i+2]);
    c4 = __lfpd(&dst[i+4]);
    c6 = __lfpd(&dst[i+6]);

    c0 = __fxcpmadd(c0, a0, scale);
    c2 = __fxcpmadd(c2, a2, scale);
    c4 = __fxcpmadd(c4, a4, scale);
    c6 = __fxcpmadd(c6, a6, scale);

    __stfpd(&dst[i  ], c0);
    __stfpd(&dst[i+2], c2);
    __stfpd(&dst[i+4], c4);
    __stfpd(&dst[i+6], c6);
  }
}

/* Eight paired operations (16 doubles) per iteration; dim must be a multiple of 16. */
static void dpacc_pairs_x8(double *dst, double *src, double scale, int dim)
{
  int i;

  for (i = 0; i < dim; i += 16) {
    double _Complex a0, a2, a4, a6, a8, a10, a12, a14;
    double _Complex c0, c2, c4, c6, c8, c10, c12, c14;

    a0  = __lfpd(&src[i   ]);
    a2  = __lfpd(&src[i+ 2]);
    a4  = __lfpd(&src[i+ 4]);
    a6  = __lfpd(&src[i+ 6]);
    a8  = __lfpd(&src[i+ 8]);
    a10 = __lfpd(&src[i+10]);
    a12 = __lfpd(&src[i+12]);
    a14 = __lfpd(&src[i+14]);
    c0  = __lfpd(&dst[i   ]);
    c2  = __lfpd(&dst[i+ 2]);
    c4  = __lfpd(&dst[i+ 4]);
    c6  = __lfpd(&dst[i+ 6]);
    c8  = __lfpd(&dst[i+ 8]);
    c10 = __lfpd(&dst[i+10]);
    c12 = __lfpd(&dst[i+12]);
    c14 = __lfpd(&dst[i+14]);

    c0  = __fxcpmadd( c0,  a0, scale);
    c2  = __fxcpmadd( c2,  a2, scale);
    c4  = __fxcpmadd( c4,  a4, scale);
    c6  = __fxcpmadd( c6,  a6, scale);
    c8  = __fxcpmadd( c8,  a8, scale);
    c10 = __fxcpmadd(c10, a10, scale);
    c12 = __fxcpmadd(c12, a12, scale);
    c14 = __fxcpmadd(c14, a14, scale);

    __stfpd(&dst[i   ],  c0);
    __stfpd(&dst[i+ 2],  c2);
    __stfpd(&dst[i+ 4],  c4);
    __stfpd(&dst[i+ 6],  c6);
    __stfpd(&dst[i+ 8],  c8);
    __stfpd(&dst[i+10], c10);
    __stfpd(&dst[i+12], c12);
    __stfpd(&dst[i+14], c14);
  }
}

/*
 * Prints one line to stdout for every index at which the reference result
 * and the intrinsic result are not bit-for-bit equal.
 */
static void dpacc_report_mismatches(const double *expected, const double *actual, int dim)
{
  int i;

  for (i = 0; i < dim; i++) {
    if (expected[i] != actual[i]) {
      printf("%4d %30.15f %30.15f\n", i, expected[i], actual[i]);
    }
  }
}

/*
 * Benchmark driver for the double-precision accumulate b += scale*a.
 *
 * For every vector length dim = 2^6 .. 2^19 it:
 *   - fills a with random values in [-1, 1],
 *   - runs the scalar kernel and the four intrinsic kernels, each with one
 *     untimed warm-up pass followed by `count` timed passes measured with
 *     getticks(),
 *   - compares every intrinsic result against the scalar result and prints
 *     any differing element to stdout,
 *   - prints one table row of tick counts to stdout.
 * Progress messages go to stderr.
 *
 * Returns 0 on success, 1 if a buffer could not be allocated.
 */
int main(int argc, char* argv[])
{
  int k;

  (void)argc;
  (void)argv;

  fprintf(stderr,"BEGIN TESTING OF DP ACCUMULATE\n");

  printf("%18s %18s %18s %18s %18s %18s\n","dim","basic","1-hummer","2-hummers","4-hummers","8-hummers");

  for (k = 6; k < 20; k++) {
    /* Exact integer power of two; always a multiple of 16, as the widest kernel requires. */
    const int dim = 1 << k;
    const int count = 10;
    const double scale = 0.1;
    int i, j;
    unsigned long long t0, t1;
    unsigned long long dt0, dt1, dt2, dt3, dt4;
    double *a = dpacc_alloc((size_t)dim);
    double *b = dpacc_alloc((size_t)dim);
    double *c = dpacc_alloc((size_t)dim);

    if (a == NULL || b == NULL || c == NULL) {
      fprintf(stderr,"posix_memalign failed for dim = %d\n",dim);
      free(a);
      free(b);
      free(c);
      return(1);
    }

    for (i = 0; i < dim; i++) {
      a[i] = 1.0 - 2*(double)rand()/(double)RAND_MAX;
    }

    fprintf(stderr,"BASIC VERSION\n");

    // WARM-UP
    dpacc_zero(b, dim);
    dpacc_basic(b, a, scale, dim);

    // TIMING
    dpacc_zero(b, dim);
    t0 = getticks();
    for (j = 0; j < count; j++) {
      dpacc_basic(b, a, scale, dim);
    }
    t1 = getticks();
    dt0 = t1 - t0;

    fprintf(stderr,"INTRINSICS VERSION 1\n");

    // WARM-UP
    dpacc_zero(c, dim);
    dpacc_pairs_x1(c, a, scale, dim);

    // TIMING
    dpacc_zero(c, dim);
    t0 = getticks();
    for (j = 0; j < count; j++) {
      dpacc_pairs_x1(c, a, scale, dim);
    }
    t1 = getticks();
    dt1 = t1 - t0;

    // VERIFICATION
    dpacc_report_mismatches(b, c, dim);

    fprintf(stderr,"INTRINSICS VERSION 2\n");

    // WARM-UP
    dpacc_zero(c, dim);
    dpacc_pairs_x2(c, a, scale, dim);

    // TIMING
    dpacc_zero(c, dim);
    t0 = getticks();
    for (j = 0; j < count; j++) {
      dpacc_pairs_x2(c, a, scale, dim);
    }
    t1 = getticks();
    dt2 = t1 - t0;

    // VERIFICATION
    dpacc_report_mismatches(b, c, dim);

    fprintf(stderr,"INTRINSICS VERSION 3\n");

    // WARM-UP
    dpacc_zero(c, dim);
    dpacc_pairs_x4(c, a, scale, dim);

    // TIMING
    dpacc_zero(c, dim);
    t0 = getticks();
    for (j = 0; j < count; j++) {
      dpacc_pairs_x4(c, a, scale, dim);
    }
    t1 = getticks();
    dt3 = t1 - t0;

    // VERIFICATION
    dpacc_report_mismatches(b, c, dim);

    fprintf(stderr,"INTRINSICS VERSION 4\n");

    // WARM-UP (same kernel as the timed run)
    dpacc_zero(c, dim);
    dpacc_pairs_x8(c, a, scale, dim);

    // TIMING
    dpacc_zero(c, dim);
    t0 = getticks();
    for (j = 0; j < count; j++) {
      dpacc_pairs_x8(c, a, scale, dim);
    }
    t1 = getticks();
    dt4 = t1 - t0;

    // VERIFICATION
    dpacc_report_mismatches(b, c, dim);

    printf("%18d %18llu %18llu %18llu %18llu %18llu\n",dim,dt0,dt1,dt2,dt3,dt4);

    free(a);
    free(b);
    free(c);
  }

  fprintf(stderr,"ALL DONE\n");

  return(0);
}