示例#1
0
/*
 * diff: element-wise difference of two spinor fields, Q = R - S.
 *
 * Q  destination field (N spinors)
 * R  minuend field     (N spinors)
 * S  subtrahend field  (N spinors)
 * N  number of spinors to process; N < 1 is a no-op
 *
 * Each spinor is accessed here as 24 consecutive doubles, handled as 12
 * two-double pairs through the __lfpd / __fpsub / __stfpd intrinsics.
 * All 24 doubles of a spinor are loaded from R and S before anything is
 * stored to Q.
 *
 * The first and last spinor are peeled out of the main loop so that the
 * loop body can always prefetch the *next* spinor without running past the
 * end of the fields.
 */
void diff(spinor * const Q,spinor * const R,spinor * const S, const int N)
{
  int ix = 1;
  double *s ALIGN;
  double *sp ALIGN;
  double *r ALIGN;
  double *rp ALIGN;
  double *q ALIGN;
  double _Complex x00, x01, x02, x03, x04, x05, x06, x07,
    x08, x09, x10, x11;
  double _Complex y00, y01, y02, y03, y04, y05, y06, y07,
    y08, y09, y10, y11;
  /* Aliasing hint for the compiler: R and S are declared not to overlap. */
#pragma disjoint(*R, *S)

  /* Empty field: the peeled prologue below would otherwise touch element 0. */
  if(N < 1) {
    return;
  }

  __alignx(16, Q);
  __alignx(16, R);
  __alignx(16, S);
  r = (double*) R;
  s = (double*) S;
  q = (double*) Q;
  /* rp/sp always point one spinor ahead of r/s (prefetch targets). */
  rp = r + 24;
  sp = s + 24;
  /* Only prefetch the second spinor when it actually exists. */
  if(N > 1) {
    _prefetch_spinor(rp);
    _prefetch_spinor(sp);
  }

  /* Prologue: spinor 0. */
  x00 = __lfpd(r);
  x01 = __lfpd(r+2);
  x02 = __lfpd(r+4);
  x03 = __lfpd(r+6);
  x04 = __lfpd(r+8);
  x05 = __lfpd(r+10);
  x06 = __lfpd(r+12);
  x07 = __lfpd(r+14);
  x08 = __lfpd(r+16);
  x09 = __lfpd(r+18);
  x10 = __lfpd(r+20);
  x11 = __lfpd(r+22);
  y00 = __lfpd(s);
  y01 = __lfpd(s+2);
  y02 = __lfpd(s+4);
  y03 = __lfpd(s+6);
  y04 = __lfpd(s+8);
  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12);
  y07 = __lfpd(s+14);
  y08 = __lfpd(s+16);
  y09 = __lfpd(s+18);
  y10 = __lfpd(s+20);
  y11 = __lfpd(s+22);

  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));

  /*
   * A single-spinor field is complete at this point.  Without this guard
   * the peeled epilogue below would read R[1]/S[1] and write Q[1], i.e.
   * one element past the end of all three fields.
   */
  if(N < 2) {
    return;
  }

  s = sp;
  r = rp;
  q+=24;

  /* Main loop: spinors 1 .. N-2, prefetching spinor ix+1 each iteration. */
#pragma unroll(12)
  for(ix = 1; ix < N-1; ix++) {
    rp+=24;
    sp+=24;
    _prefetch_spinor(rp);
    _prefetch_spinor(sp);
    x00 = __lfpd(r);
    x01 = __lfpd(r+2);
    x02 = __lfpd(r+4);
    x03 = __lfpd(r+6);
    x04 = __lfpd(r+8);
    x05 = __lfpd(r+10);
    x06 = __lfpd(r+12);
    x07 = __lfpd(r+14);
    x08 = __lfpd(r+16);
    x09 = __lfpd(r+18);
    x10 = __lfpd(r+20);
    x11 = __lfpd(r+22);
    y00 = __lfpd(s);
    y01 = __lfpd(s+2);
    y02 = __lfpd(s+4);
    y03 = __lfpd(s+6);
    y04 = __lfpd(s+8);
    y05 = __lfpd(s+10);
    y06 = __lfpd(s+12);
    y07 = __lfpd(s+14);
    y08 = __lfpd(s+16);
    y09 = __lfpd(s+18);
    y10 = __lfpd(s+20);
    y11 = __lfpd(s+22);

    __stfpd(q, __fpsub(x00, y00));
    __stfpd(q+2, __fpsub(x01, y01));
    __stfpd(q+4, __fpsub(x02, y02));
    __stfpd(q+6, __fpsub(x03, y03));
    __stfpd(q+8, __fpsub(x04, y04));
    __stfpd(q+10, __fpsub(x05, y05));
    __stfpd(q+12, __fpsub(x06, y06));
    __stfpd(q+14, __fpsub(x07, y07));
    __stfpd(q+16, __fpsub(x08, y08));
    __stfpd(q+18, __fpsub(x09, y09));
    __stfpd(q+20, __fpsub(x10, y10));
    __stfpd(q+22, __fpsub(x11, y11));
    s = sp;
    r = rp;
    q+=24;
  }

  /* Epilogue: spinor N-1 (no further prefetch, nothing follows it). */
  x00 = __lfpd(r);
  x01 = __lfpd(r+2);
  x02 = __lfpd(r+4);
  x03 = __lfpd(r+6);
  x04 = __lfpd(r+8);
  x05 = __lfpd(r+10);
  x06 = __lfpd(r+12);
  x07 = __lfpd(r+14);
  x08 = __lfpd(r+16);
  x09 = __lfpd(r+18);
  x10 = __lfpd(r+20);
  x11 = __lfpd(r+22);
  y00 = __lfpd(s);
  y01 = __lfpd(s+2);
  y02 = __lfpd(s+4);
  y03 = __lfpd(s+6);
  y04 = __lfpd(s+8);
  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12);
  y07 = __lfpd(s+14);
  y08 = __lfpd(s+16);
  y09 = __lfpd(s+18);
  y10 = __lfpd(s+20);
  y11 = __lfpd(s+22);

  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));

  return;
}
示例#2
0
/*
 * Micro-benchmark for a double-precision accumulate, b[i] += scale * a[i].
 *
 * For vector lengths 2^6 .. 2^19 it times `count` repetitions of:
 *   - a plain scalar loop                      ("basic")
 *   - paired-double intrinsics, 1 pair / iter  ("1-hummer")
 *   - 2, 4 and 8 pairs per iteration           ("2-/4-/8-hummers")
 *
 * Every variant is run once as a warm-up, the output vector is reset, and
 * then the timed run is performed.  After each intrinsic variant the result
 * in c[] is compared element by element against the scalar result in b[];
 * mismatches are printed to stdout.  Tick counts (from getticks(), defined
 * elsewhere) are printed as one table row per vector length.
 *
 * Returns 0 on success, 1 if an aligned allocation fails.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    fprintf(stderr,"BEGIN TESTING OF DP ACCUMULATE\n");
    printf("%18s %18s %18s %18s %18s %18s\n","dim","basic","1-hummer","2-hummers","4-hummers","8-hummers");

    int k;
    for (k=6;k<20;k++)
    {
        /* Exact integer power of two; no floating-point round trip. */
        int dim = 1 << k;

        int count = 10;

        int i,j;

        unsigned long long t0, t1;
        unsigned long long dt0, dt1, dt2, dt3, dt4;

        /* NULL-initialised so the failure path can free() unconditionally. */
        double* a = NULL;
        double* b = NULL;
        double* c = NULL;

        double  scale = 0.1;

        if (posix_memalign((void**)&a, 16*sizeof(double), dim*sizeof(double)) != 0 ||
            posix_memalign((void**)&b, 16*sizeof(double), dim*sizeof(double)) != 0 ||
            posix_memalign((void**)&c, 16*sizeof(double), dim*sizeof(double)) != 0)
        {
            fprintf(stderr,"posix_memalign failed for dim = %d\n",dim);
            free(a);
            free(b);
            free(c);
            return(1);
        }

        /* Random input in [-1, 1]. */
        for (i=0;i<dim;i++) a[i] = 1.0 - 2*(double)rand()/(double)RAND_MAX;

        fprintf(stderr,"BASIC VERSION\n");

        // WARM-UP
        for (i=0;i<dim;i++) b[i] = 0.0;
        for (i=0;i<dim;i++) b[i] += scale*a[i];

        // TIMING
        for (i=0;i<dim;i++) b[i] = 0.0;
        t0 = getticks();
        for (j=0;j<count;j++)
        {
            for (i=0;i<dim;i++) b[i] += scale*a[i];
        }
        t1 = getticks();
        dt0 = t1 - t0;

        fprintf(stderr,"INTRINSICS VERSION 1\n");

        // WARM-UP
        for (i=0;i<dim;i++) c[i] = 0.0;
        for (i=0;i<dim;i+=2)
        {
            __stfpd(&c[i], __fxcpmadd( __lfpd(&c[i]), __lfpd(&a[i]), scale) );
        }

        // TIMING
        for (i=0;i<dim;i++) c[i] = 0.0;
        t0 = getticks();
        for (j=0;j<count;j++)
        {
            for (i=0;i<dim;i+=2)
            {
                __stfpd(&c[i], __fxcpmadd( __lfpd(&c[i]), __lfpd(&a[i]), scale) );
            }
        }
        t1 = getticks();
        dt1 = t1 - t0;

        // VERIFICATION
        for (i=0;i<dim;i++)
        {
            if (b[i] != c[i])
            {
                printf("%4d %30.15f %30.15f\n",i,b[i],c[i]);
            }
        }

        fprintf(stderr,"INTRINSICS VERSION 2\n");

        // WARM-UP
        for (i=0;i<dim;i++) c[i] = 0.0;
        for (i=0;i<dim;i+=4)
        {
            {
                double _Complex a0, a2, c0, c2;
                a0 = __lfpd(&a[i  ]);
                a2 = __lfpd(&a[i+2]);
                c0 = __lfpd(&c[i  ]);
                c2 = __lfpd(&c[i+2]);
                c0 = __fxcpmadd(c0,a0,scale);
                c2 = __fxcpmadd(c2,a2,scale);
                __stfpd(&c[i  ],c0);
                __stfpd(&c[i+2],c2);
            }
        }

        // TIMING
        for (i=0;i<dim;i++) c[i] = 0.0;
        t0 = getticks();
        for (j=0;j<count;j++)
        {
            for (i=0;i<dim;i+=4)
            {
                {
                    double _Complex a0, a2, c0, c2;
                    a0 = __lfpd(&a[i  ]);
                    a2 = __lfpd(&a[i+2]);
                    c0 = __lfpd(&c[i  ]);
                    c2 = __lfpd(&c[i+2]);
                    c0 = __fxcpmadd(c0,a0,scale);
                    c2 = __fxcpmadd(c2,a2,scale);
                    __stfpd(&c[i  ],c0);
                    __stfpd(&c[i+2],c2);
                }
            }
        }
        t1 = getticks();
        dt2 = t1 - t0;

        // VERIFICATION
        for (i=0;i<dim;i++)
        {
            if (b[i] != c[i])
            {
                printf("%4d %30.15f %30.15f\n",i,b[i],c[i]);
            }
        }

        fprintf(stderr,"INTRINSICS VERSION 3\n");

        // WARM-UP
        for (i=0;i<dim;i++) c[i] = 0.0;
        for (i=0;i<dim;i+=8)
        {
            {
                double _Complex a0, a2, a4, a6;
                double _Complex c0, c2, c4, c6;
                a0 = __lfpd(&a[i  ]);
                a2 = __lfpd(&a[i+2]);
                a4 = __lfpd(&a[i+4]);
                a6 = __lfpd(&a[i+6]);
                c0 = __lfpd(&c[i  ]);
                c2 = __lfpd(&c[i+2]);
                c4 = __lfpd(&c[i+4]);
                c6 = __lfpd(&c[i+6]);
                c0 = __fxcpmadd(c0,a0,scale);
                c2 = __fxcpmadd(c2,a2,scale);
                c4 = __fxcpmadd(c4,a4,scale);
                c6 = __fxcpmadd(c6,a6,scale);
                __stfpd(&c[i  ],c0);
                __stfpd(&c[i+2],c2);
                __stfpd(&c[i+4],c4);
                __stfpd(&c[i+6],c6);
            }
        }

        // TIMING
        for (i=0;i<dim;i++) c[i] = 0.0;
        t0 = getticks();
        for (j=0;j<count;j++)
        {
            for (i=0;i<dim;i+=8)
            {
                {
                    double _Complex a0, a2, a4, a6;
                    double _Complex c0, c2, c4, c6;
                    a0 = __lfpd(&a[i  ]);
                    a2 = __lfpd(&a[i+2]);
                    a4 = __lfpd(&a[i+4]);
                    a6 = __lfpd(&a[i+6]);
                    c0 = __lfpd(&c[i  ]);
                    c2 = __lfpd(&c[i+2]);
                    c4 = __lfpd(&c[i+4]);
                    c6 = __lfpd(&c[i+6]);
                    c0 = __fxcpmadd(c0,a0,scale);
                    c2 = __fxcpmadd(c2,a2,scale);
                    c4 = __fxcpmadd(c4,a4,scale);
                    c6 = __fxcpmadd(c6,a6,scale);
                    __stfpd(&c[i  ],c0);
                    __stfpd(&c[i+2],c2);
                    __stfpd(&c[i+4],c4);
                    __stfpd(&c[i+6],c6);
                }
            }
        }
        t1 = getticks();
        dt3 = t1 - t0;

        // VERIFICATION
        for (i=0;i<dim;i++)
        {
            if (b[i] != c[i])
            {
                printf("%4d %30.15f %30.15f\n",i,b[i],c[i]);
            }
        }

        fprintf(stderr,"INTRINSICS VERSION 4\n");

        // WARM-UP
        // Uses the same eight distinct register pairs as the timed loop
        // below, so the warm-up exercises exactly the code being measured.
        for (i=0;i<dim;i++) c[i] = 0.0;
        for (i=0;i<dim;i+=16)
        {
            {
                double _Complex a0, a2, a4, a6, a8, a10, a12, a14;
                double _Complex c0, c2, c4, c6, c8, c10, c12, c14;
                a0  = __lfpd(&a[i   ]);
                a2  = __lfpd(&a[i+ 2]);
                a4  = __lfpd(&a[i+ 4]);
                a6  = __lfpd(&a[i+ 6]);
                a8  = __lfpd(&a[i+ 8]);
                a10 = __lfpd(&a[i+10]);
                a12 = __lfpd(&a[i+12]);
                a14 = __lfpd(&a[i+14]);
                c0  = __lfpd(&c[i   ]);
                c2  = __lfpd(&c[i+ 2]);
                c4  = __lfpd(&c[i+ 4]);
                c6  = __lfpd(&c[i+ 6]);
                c8  = __lfpd(&c[i+ 8]);
                c10 = __lfpd(&c[i+10]);
                c12 = __lfpd(&c[i+12]);
                c14 = __lfpd(&c[i+14]);
                c0  = __fxcpmadd( c0, a0,scale);
                c2  = __fxcpmadd( c2, a2,scale);
                c4  = __fxcpmadd( c4, a4,scale);
                c6  = __fxcpmadd( c6, a6,scale);
                c8  = __fxcpmadd( c8, a8,scale);
                c10 = __fxcpmadd(c10,a10,scale);
                c12 = __fxcpmadd(c12,a12,scale);
                c14 = __fxcpmadd(c14,a14,scale);
                __stfpd(&c[i   ], c0);
                __stfpd(&c[i+ 2], c2);
                __stfpd(&c[i+ 4], c4);
                __stfpd(&c[i+ 6], c6);
                __stfpd(&c[i+ 8], c8);
                __stfpd(&c[i+10],c10);
                __stfpd(&c[i+12],c12);
                __stfpd(&c[i+14],c14);
            }
        }

        // TIMING
        for (i=0;i<dim;i++) c[i] = 0.0;
        t0 = getticks();
        for (j=0;j<count;j++)
        {
            for (i=0;i<dim;i+=16)
            {
                {
                    double _Complex a0, a2, a4, a6, a8, a10, a12, a14;
                    double _Complex c0, c2, c4, c6, c8, c10, c12, c14;
                    a0  = __lfpd(&a[i   ]);
                    a2  = __lfpd(&a[i+ 2]);
                    a4  = __lfpd(&a[i+ 4]);
                    a6  = __lfpd(&a[i+ 6]);
                    a8  = __lfpd(&a[i+ 8]);
                    a10 = __lfpd(&a[i+10]);
                    a12 = __lfpd(&a[i+12]);
                    a14 = __lfpd(&a[i+14]);
                    c0  = __lfpd(&c[i   ]);
                    c2  = __lfpd(&c[i+ 2]);
                    c4  = __lfpd(&c[i+ 4]);
                    c6  = __lfpd(&c[i+ 6]);
                    c8  = __lfpd(&c[i+ 8]);
                    c10 = __lfpd(&c[i+10]);
                    c12 = __lfpd(&c[i+12]);
                    c14 = __lfpd(&c[i+14]);
                    c0  = __fxcpmadd( c0, a0,scale);
                    c2  = __fxcpmadd( c2, a2,scale);
                    c4  = __fxcpmadd( c4, a4,scale);
                    c6  = __fxcpmadd( c6, a6,scale);
                    c8  = __fxcpmadd( c8, a8,scale);
                    c10 = __fxcpmadd(c10,a10,scale);
                    c12 = __fxcpmadd(c12,a12,scale);
                    c14 = __fxcpmadd(c14,a14,scale);
                    __stfpd(&c[i   ], c0);
                    __stfpd(&c[i+ 2], c2);
                    __stfpd(&c[i+ 4], c4);
                    __stfpd(&c[i+ 6], c6);
                    __stfpd(&c[i+ 8], c8);
                    __stfpd(&c[i+10],c10);
                    __stfpd(&c[i+12],c12);
                    __stfpd(&c[i+14],c14);
                }
            }
        }
        t1 = getticks();
        dt4 = t1 - t0;

        // VERIFICATION
        for (i=0;i<dim;i++)
        {
            if (b[i] != c[i])
            {
                printf("%4d %30.15f %30.15f\n",i,b[i],c[i]);
            }
        }

        printf("%18d %18llu %18llu %18llu %18llu %18llu\n",dim,dt0,dt1,dt2,dt3,dt4);

        free(a);
        free(b);
        free(c);

    }

    fprintf(stderr,"ALL DONE\n");

    return(0);
}