C++ (Cpp) stop_watch示例

示例#1

0

显示文件

文件： space-time-soa.c 项目： g-koutsou/CoS-2

/*
 * Benchmark driver for comp_s() on a structure-of-arrays layout.
 *
 * Usage: <prog> L
 *
 * Allocates five float arrays of L elements (x, y, z, t, s), fills x, y,
 * z and t with random numbers, then repeatedly times batches of NREP
 * calls to comp_s() until the standard error of the mean time per call
 * falls below 10% of the mean. Prints the mean, its error and a rate
 * computed as 9*L operations per call.
 *
 * Returns 0 on success, 1 on a usage error.
 */
int
main(int argc, char *argv[])
{
  if(argc != 2) {
    usage(argv);
    return 1;
  }
  int L = atoi(argv[1]);
  /* A zero or negative length would turn the byte counts below into 0
     or, after conversion to an unsigned size, a huge value */
  if(L <= 0) {
    usage(argv);
    return 1;
  }
  st_coords arr;
  arr.x = alloc(sizeof(float)*L);
  arr.y = alloc(sizeof(float)*L);
  arr.z = alloc(sizeof(float)*L);
  arr.t = alloc(sizeof(float)*L);
  /* s is not initialised here; presumably it is the output of comp_s()
     -- confirm against its definition */
  arr.s = alloc(sizeof(float)*L);
  for(int i=0; i<L; i++) {
    arr.x[i] = drand48();
    arr.y[i] = drand48();
    arr.z[i] = drand48();
    arr.t[i] = drand48()*C;
  }

  {
    /* Warm up */
    comp_s(arr, L);
    double t0acc = 0;  /* running sum of per-call times */
    double t1acc = 0;  /* running sum of squared per-call times */
    int n = 1;         /* number of samples accumulated so far */
    /*
       Loop accumulating run-time. Stop when the average time has less
       than a 10% error
    */
    while(1) {
      double t0 = stop_watch(0);
      for(int i=0; i<NREP; i++)
        comp_s(arr, L);
      t0 = stop_watch(t0)/(double)NREP;
      t0acc += t0;
      t1acc += t0*t0;
      if(n > 2) {
        double ave = t0acc/n;
        /* With (nearly) identical samples, rounding can push this
           difference slightly below zero; sqrt() would then return NaN
           and the comparison below could never succeed */
        double var = t1acc/n - ave*ave;
        if(var < 0)
          var = 0;
        double err = sqrt(var)/sqrt(n);
        if(err/ave < 0.1) {
          /* Reuse the accumulators to carry the final mean and error
             out to the report below */
          t0acc = ave;
          t1acc = err;
          break;
        }
      }
      n++;
    }
    printf(" Done L = %d, in %3.1e +/- %3.1e secs, %g Mflop/s\n",
           L, t0acc, t1acc, (double)9*L/1e6/t0acc);
  }

  free(arr.x);
  free(arr.y);
  free(arr.z);
  free(arr.t);
  free(arr.s);
  return 0;
}

示例#2

0

显示文件

文件： laplb.c 项目： g-koutsou/CoS-2

/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 *
 *  lp: lattice parameters; only lp.L is read directly here
 *  x:  on entry the initial guess, updated in place by the iteration
 *  b:  right-hand side
 *  g:  link field passed through to lapl()
 *
 * Runs at most 100 iterations and stops early once sqrt(rr/bb) drops
 * below the tolerance. Prints the relative residual at each iteration
 * and, at the end, the average time per lapl() call with the derived
 * Gflop/s and GB/s figures.
 */
void
cg(latparams lp, field **x, field **b, link **g)
{
  size_t L = lp.L;
  int max_iter = 100;
  float tol = 1e-9;

  /* Temporary fields needed for CG */
  field **r = new_field(lp);
  field **p = new_field(lp);
  field **Ap = new_field(lp);

  /* Initial residual and p-vector (xmy presumably leaves b - r in r,
     xeqy presumably copies r into p -- confirm against the helpers) */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  xeqy(lp, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(lp, r);
  float bb = xdotx(lp, b);
  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;
    /* Only the operator application is timed */
    double t = stop_watch(0);
    lapl(lp, Ap, p, g);
    t_lapl += stop_watch(t);
    float pAp = xdoty(lp, p, Ap);
    float alpha = rr/pAp;
    axpy(lp, alpha, p, x);
    axpy(lp, -alpha, Ap, r);
    float r1r1 = xdotx(lp, r);
    float beta = r1r1/rr;
    xpay(lp, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  rr = xdotx(lp, r);

  /* lapl() is timed exactly once per completed iteration, so iter is
     also the number of timed calls. It is 0 when the initial guess
     already meets the tolerance; report zeros then instead of 0/0 */
  double t_call = iter > 0 ? t_lapl/(double)iter : 0.0;
  double vol = (double)L*L*L;
  double beta_fp = t_call > 0 ? 50*vol/t_call*1e-9 : 0.0;
  double beta_io = t_call > 0 ? 40*vol/t_call*1e-9 : 0.0;
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_call, beta_fp, beta_io);

  del_field(r);
  del_field(p);
  del_field(Ap);
  return;
}

示例#3

0

显示文件

文件： lapl.c 项目： g-koutsou/LAP2015

/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 *
 *  L: lattice extent, passed to every helper; the rate figures below
 *     use L*L sites
 *  x: on entry the initial guess, updated in place by the iteration
 *  b: right-hand side
 *  u: field passed through to lapl()
 *
 * Runs at most 100 iterations and stops early once sqrt(rr/bb) drops
 * below the tolerance. Prints the relative residual at each iteration
 * and, at the end, the average time per lapl() call with the derived
 * Gflop/s and GB/s figures.
 */
void
cg(size_t L, _Complex float *x, _Complex float *b, _Complex float *u)
{
  int max_iter = 100;
  float tol = 1e-6;

  /* Temporary fields needed for CG */
  _Complex float *r = new_field(L);
  _Complex float *p = new_field(L);
  _Complex float *Ap = new_field(L);

  /* Initial residual and p-vector (xmy presumably leaves b - r in r,
     xeqy presumably copies r into p -- confirm against the helpers) */
  lapl(L, r, x, u);
  xmy(L, b, r);
  xeqy(L, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(L, r);
  float bb = xdotx(L, b);
  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;
    /* Only the operator application is timed */
    double t = stop_watch(0);
    lapl(L, Ap, p, u);
    t_lapl += stop_watch(t);
    float pAp = xdoty(L, p, Ap);
    float alpha = rr/pAp;
    axpy(L, alpha, p, x);
    axpy(L, -alpha, Ap, r);
    float r1r1 = xdotx(L, r);
    float beta = r1r1/rr;
    xpay(L, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(L, r, x, u);
  xmy(L, b, r);
  rr = xdotx(L, r);

  /* lapl() is timed exactly once per completed iteration, so iter is
     also the number of timed calls. It is 0 when the initial guess
     already meets the tolerance; report zeros then instead of 0/0 */
  double t_call = iter > 0 ? t_lapl/(double)iter : 0.0;
  double vol = (double)L*L;
  double beta_fp = t_call > 0 ? 34*vol/t_call*1e-9 : 0.0;
  double beta_io = t_call > 0 ? 32*vol/t_call*1e-9 : 0.0;
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_call, beta_fp, beta_io);

  free(r);
  free(p);
  free(Ap);
  return;
}

示例#4

0

显示文件

文件： pftest.c 项目： 0xD34D/system_extras

/*
 * Pointer-chasing micro-benchmark.
 *
 * For every entry of numPagesList, links the first word of that many
 * consecutive 4096-byte pages into a cycle, chases the cycle WARMUP
 * times untimed and WORKLOAD times timed, and prints the page count
 * together with the stop_watch() difference divided by WORKLOAD.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main()
{
    /* One page more than needed so the start can be rounded up to a
     * page boundary. The unaligned pointer is kept for free(). */
    char *base = malloc((N_PAGES+1) * 4096);
    char *mem;
    intptr_t *p;
    int i;
    unsigned int j;

    if (base == NULL) {
        perror("malloc");
        return 1;
    }

    /* Align to page start */
    mem = (char *) ((intptr_t) (base + 4096) & ~0xfff);

    for (j = 0; j < sizeof(numPagesList)/sizeof(int); j++) {
        int numPages = numPagesList[j];
        int pageIdx = 0;
        int entryOffset = 0;

        /*
         * page 0      page 1      page 2     ....     page N  
         * ------      ------      ------              ------  
         * word 0   -> word 0   -> word 0 ->  ....  -> word 0 -> (page 0/word 0)
         *   :           :           :         :         :
         * word 1023   word 1023   word 1023   :       word 1023
         */
        for (i = 0; i < numPages; i++) {
            int nextPageIdx = (pageIdx + 1) % numPages;
            /* Spreading the pointers across cache lines appeared to add
             * noise before reaching the asymptote, so the offset inside
             * each page stays fixed:
             * int nextEntryOffset = (entryOffset + 32) % 1024;
             */
            int nextEntryOffset = entryOffset;

            if (i != numPages -1) {
                *(intptr_t *) (mem + 4096 * pageIdx + entryOffset) = 
                    (intptr_t) (mem + 4096 * nextPageIdx + nextEntryOffset);
            } else {
                /* Last page - form the cycle */
                *(intptr_t *) (mem + 4096 * pageIdx + entryOffset) =
                    (intptr_t) &mem[0];
            }

            pageIdx = nextPageIdx;
            entryOffset = nextEntryOffset;
        }

        /* Starting point of the pointer chase */
        p = (intptr_t *) &mem[0];

        /* Warmup (i.e. pre-thrash the memory system) */
        for (i = 0; i < WARMUP; i++) {
            p = (intptr_t *) *p;
        }

        /* Real work */
        unsigned long long t0 = stop_watch();
        for (i = 0; i < WORKLOAD; i++) {
            p = (intptr_t *) *p;
        }
        unsigned long long t1 = stop_watch();

        /* Using p here keeps the compiler from optimizing the chase away */
        if (p) 
            printf("%d, %f\n", numPages, (float) (t1 - t0) / WORKLOAD);
    }

    free(base);
    return 0;
}

示例#5

0

显示文件

文件： main.c 项目： gpucw/cuda-lapl

/*
 * Driver that runs the same iteration on the CPU (super-site packed
 * layout) and on the GPU, timing each and writing each result to a file.
 *
 * Arguments, as read below: Lx Ly niter input_file output_prefix
 *
 * Results go to "<output_prefix>.ss<niter>" (CPU) and
 * "<output_prefix>.cu<niter>" (GPU), niter zero-padded to 8 digits.
 * Exits with status 1 on a usage error, an over-long output name or an
 * allocation failure.
 */
int
main(int argc, char *argv[]) {
  /* Check the number of command line arguments */
  if(argc != 6) {
    usage(argv);
    exit(1);
  }
  /* The length of the array in x and y is read from the command
     line */
  Lx = atoi(argv[1]);
  Ly = atoi(argv[2]);
  /* The number of iterations */
  int niter = atoi(argv[3]);
  /* Fixed "sigma" */
  float sigma = 0.01;
  printf(" Ly,Lx = %d,%d\n", Ly, Lx);
  printf(" niter = %d\n", niter);
  printf(" input file = %s\n", argv[4]);
  printf(" output file = %s\n", argv[5]);
  /* Build both output file names before doing any work, with bounded
     formatting: the prefix comes straight from the command line and
     must not be allowed to overrun the buffers */
  char fname_ss[256];
  char fname_cu[256];
  int len_ss = snprintf(fname_ss, sizeof fname_ss, "%s.ss%08d", argv[5], niter);
  int len_cu = snprintf(fname_cu, sizeof fname_cu, "%s.cu%08d", argv[5], niter);
  if(len_ss < 0 || (size_t)len_ss >= sizeof fname_ss ||
     len_cu < 0 || (size_t)len_cu >= sizeof fname_cu) {
    fprintf(stderr, " output file name too long: %s\n", argv[5]);
    exit(1);
  }
  /* Allocate the buffer for the data */
  float *arr = malloc(sizeof(float)*Lx*Ly);
  if(arr == NULL) {
    fprintf(stderr, " failed to allocate data buffer\n");
    exit(1);
  }
  /* read file to buffer */
  read_from_file(arr, argv[4]);
  /* allocate super-site buffers; Lx*Ly/4 entries each, so each
     supersite presumably packs four sites -- confirm */
  supersite *ssarr[2];
  if(posix_memalign((void**)&ssarr[0], 16, sizeof(supersite)*Lx*Ly/4) != 0 ||
     posix_memalign((void**)&ssarr[1], 16, sizeof(supersite)*Lx*Ly/4) != 0) {
    fprintf(stderr, " failed to allocate super-site buffers\n");
    exit(1);
  }
  /* convert input array to super-site packed */
  to_supersite(ssarr[0], arr);
  /* do iterations, record time; the two buffers alternate between
     source and destination */
  double t0 = stop_watch(0);
  for(int i=0; i<niter; i++) {
    lapl_iter_supersite(ssarr[(i+1)%2], sigma, ssarr[i%2]);
  }
  t0 = stop_watch(t0)/(double)niter;
  /* convert from super-site packed; after niter swaps the latest result
     is in buffer niter%2 */
  from_supersite(arr, ssarr[niter%2]);
  /* write the result after niter iterations */
  write_to_file(fname_ss, arr);
  /* write timing info */
  printf(" iters = %8d, (Lx,Ly) = %6d, %6d, t = %8.1f usec/iter, BW = %6.3f GB/s, P = %6.3f Gflop/s\n",
         niter, Lx, Ly, t0*1e6,
         Lx*Ly*sizeof(float)*2.0/(t0*1.0e9),
         (Lx*Ly*6.0)/(t0*1.0e9));
  /* free super-site buffers */
  for(int i=0; i<2; i++) {
    free(ssarr[i]);
  }
  /*
   * GPU part
   */

  /* read file again for GPU run */
  read_from_file(arr, argv[4]);
  /* Fixed number of threads per block (in x- and y-direction), number
     of blocks per direction determined by dimensions Lx, Ly.
     NOTE(review): the divisions truncate, so Lx and Ly presumably have
     to be multiples of NTX and NTY -- confirm */
  int threads[] = {1, NTY, NTX};
  int blocks[] = {1, Ly/NTY, Lx/NTX};
  /* Initialize: allocate GPU arrays and load array to GPU */
  init_lapl_cuda(arr, sigma);
  /* Do iterations on GPU, record time */
  t0 = stop_watch(0);
  for(int i=0; i<niter; i++) {
    lapl_iter_cuda(blocks, threads);
  }
  t0 = stop_watch(t0)/(double)niter;
  /* copy GPU array to main memory and free GPU arrays */
  fini_lapl_cuda(arr);
  /* write to file */
  write_to_file(fname_cu, arr);
  /* write timing info */
  printf(" iters = %8d, (Lx,Ly) = %6d, %6d, t = %8.1f usec/iter, BW = %6.3f GB/s, P = %6.3f Gflop/s\n",
         niter, Lx, Ly, t0*1e6,
         Lx*Ly*sizeof(float)*2.0/(t0*1.0e9),
         (Lx*Ly*6.0)/(t0*1.0e9));
  /* free main memory array */
  free(arr);
  return 0;
}

示例#6

0

显示文件

文件： test_memcpy.c 项目： amabdelrehim/PLQCD

/* Average time, over ntrials repetitions, of copying n complex doubles
   from src to dst with one 128-bit SSE2 load/store per element. Both
   arrays must be 16-byte aligned, as _mm_load_pd/_mm_store_pd require. */
static double avg_simd_copy_time(_Complex double *dst, _Complex double *src,
                                 int n, int ntrials)
{
   double tsum = 0.0;
   for(int j=0; j<ntrials; j++)
   {
      double ts = stop_watch(0.0);
      for(int i=0; i<n; i++)
      {
         __m128d c1 = _mm_load_pd((double *) &src[i]);
         _mm_store_pd((double *) &dst[i], c1);
      }
      tsum += stop_watch(ts);
   }
   return tsum/(double)ntrials;
}

/* Average time, over ntrials repetitions, of copying n complex doubles
   from src to dst with plain element-wise assignment. */
static double avg_direct_copy_time(_Complex double *dst, _Complex double *src,
                                   int n, int ntrials)
{
   double tsum = 0.0;
   for(int j=0; j<ntrials; j++)
   {
      double ts = stop_watch(0.0);
      for(int i=0; i<n; i++)
         dst[i] = src[i];
      tsum += stop_watch(ts);
   }
   return tsum/(double)ntrials;
}

/*
 * Compares an SSE2 load/store copy against a plain assignment copy of an
 * array of N complex doubles. Each variant is measured twice, in the
 * order SIMD, direct, SIMD, direct, and each measurement is the average
 * of 100 timed passes.
 *
 * Usage: <prog> array_size
 */
int main(int argc, char** argv){

   if(argc<2){
     printf("usage: %s array_size\n",argv[0]);
     exit(0);}

   int N=atoi(argv[1]);
   /* A non-positive size would wrap to a huge byte count below */
   if(N<=0){
     fprintf(stderr,"array_size must be a positive integer\n");
     exit(1);}

   const int ntrials=100;

   /* 16-byte alignment is required by the aligned SSE2 load/store */
   _Complex double *A = (_Complex double *) amalloc(N*sizeof(_Complex double),16);
   _Complex double *B = (_Complex double *) amalloc(N*sizeof(_Complex double),16);
   if(A==NULL || B==NULL){
     fprintf(stderr,"ERROR: insufficient memory for %d elements\n",N);
     exit(2);}

   //some initialization
   for(int i=0; i<N; i++){
      A[i]=0.1*i + I*10*i;}

   printf("SIMD copy time %f \n",avg_simd_copy_time(B,A,N,ntrials));
   printf("direct copy time %f\n",avg_direct_copy_time(B,A,N,ntrials));

   //second round of the same two measurements
   printf("SIMD copy time %f \n",avg_simd_copy_time(B,A,N,ntrials));
   printf("direct copy time %f\n",avg_direct_copy_time(B,A,N,ntrials));

   afree(A);
   afree(B);

return 0;
}

示例#7

0

显示文件

文件： test_hopping.c 项目： amabdelrehim/PLQCD

// Benchmark driver for the plqcd hopping-matrix kernels.
//
// Usage: <prog> input_file_name Nmul
//
// Initialises plqcd, allocates an input spinor, an output spinor and a
// gauge field of plqcd_g.VOLUME sites, fills them with rand() values and,
// when built with AVX and/or MIC, repacks them into the 256/512-bit
// layouts. Whichever kernel variants are enabled at compile time
// (ASSYMBLY, SSE3_INTRIN, AVX, MIC with TEST_HOPPING_MIC / TEST_SU3MUL_MIC)
// are then timed, and the process with proc_id 0 writes the results to a
// file named "test_hopping_output.procgrid.<grid>.nthreads.<n>.proc.<id>".
//
// Exits with 1 on a usage error and 2 on an allocation failure;
// returns 0 otherwise.
int main(int argc, char **argv)
{
    //initialize plqcd
    int init_status;

    if(argc < 3) {
        fprintf(stderr,"Error. Must pass the name of the input file and the number of multiplications to be performed \n");
        fprintf(stderr,"Usage: %s input_file_name Nmul\n",argv[0]);
        exit(1);
    }

    init_status = init_plqcd(argc,argv);

    // NOTE(review): a failed initialisation is only reported; execution continues.
    if(init_status != 0)
        printf("Error initializing plqcd\n");

    int proc_id;
    int i,j,k,Nmul;
    proc_id = ipr(plqcd_g.cpr);

    // Number of kernel applications per timing batch
    Nmul=atoi(argv[2]);

#if 0
    //Initialize the ranlux random number generator
    start_ranlux(0,1);
#endif

    // Total number of processes in the 4-d process grid
    int NPROCS=plqcd_g.nprocs[0]*plqcd_g.nprocs[1]*plqcd_g.nprocs[2]*plqcd_g.nprocs[3];

    // Output file name = fixed prefix + process grid / thread count / process id
    char ofname[128];

    char buff[128];

    strcpy(ofname,"test_hopping_output.procgrid.");

    sprintf(buff,"%d-%d-%d-%d.nthreads.%d.proc.%d",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3],plqcd_g.nthread,proc_id);



    strcat(ofname,buff);


    // Only opened (and only used) on the process with proc_id 0.
    // NOTE(review): the fopen() result is not checked and the stream is
    // never closed within this function.
    FILE *ofp;

    //FILE *ofp_source;

    //if(proc_id==0)
    //{
    //     ofp_source = fopen("test_rand_vals.out","w");
    //}

    if(proc_id==0)
    {
        ofp=fopen(ofname,"w");
        fprintf(ofp,"INPUT GLOBALS:\n");
        fprintf(ofp,"----------------\n");
        fprintf(ofp,"NPROC0 %d, NPROC1 %d, NPROC2 %d, NPROC3 %d, NTHREAD %d\n",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3], plqcd_g.nthread);
        fprintf(ofp,"L0 %d, L1 %d, L2 %d, L3 %d\n\n",plqcd_g.latdims[0],plqcd_g.latdims[1],plqcd_g.latdims[2],plqcd_g.latdims[3]);
        //printf("sizeof(spinor) %ld, sizeof(halfspinor) %ld, sizeof(su3) %ld \n",sizeof(spinor),sizeof(halfspinor),sizeof(su3));
    }


    // Thread count as seen by OpenMP; every thread of the parallel region
    // stores into nthr, thread 0 of process 0 logs it.
    int nthr;
#ifdef _OPENMP
    #pragma omp parallel
    {
        nthr=omp_get_num_threads();
        if(omp_get_thread_num() == 0)
            if(proc_id==0)
                fprintf(ofp,"Number of threads as returned by openmp %d\n",nthr);
    }
#endif


    /*****************************************************
     *Testing the Dirac operator interface
     ****************************************************/




    // Scalar-layout fields: one spinor per site, four su3 links per site
    spinor *pin= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN);
    if(pin==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pin.\n");
        exit(2);
    }

    spinor *pout= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN);
    if(pout==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pout.\n");
        exit(2);
    }

    su3 *ufield= (su3 *) amalloc(4*plqcd_g.VOLUME*sizeof(su3), plqcd_g.ALIGN);
    if(ufield==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for gauge field ufield.\n");
        exit(2);
    }


    //256 arrays (each element packs two sites, hence VOLUME/2 elements)
#ifdef AVX
    spinor_256 *pin_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN);
    if(pin_256==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pin_256.\n");
        exit(2);
    }


    spinor_256 *pout_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN);
    if(pout_256==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pout_256.\n");
        exit(2);
    }


    su3_256 *ufield_256= (su3_256 *) amalloc(4*plqcd_g.VOLUME/2*sizeof(su3_256), plqcd_g.ALIGN);

    if(ufield_256==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_256.\n");
        exit(2);
    }
#endif


    //512 arrays (each element packs four sites, hence VOLUME/4 elements)
#ifdef MIC
    spinor_512 *pin_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN);
    if(pin_512==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pin_512.\n");
        exit(2);
    }


    spinor_512 *pout_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN);
    if(pout_512==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pout_512.\n");
        exit(2);
    }


    su3_512 *ufield_512= (su3_512 *) amalloc(4*plqcd_g.VOLUME/4*sizeof(su3_512), plqcd_g.ALIGN);

    if(ufield_512==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_512.\n");
        exit(2);
    }
#endif





    //initialize the random number generator with a seed equal to the process rank
    srand((unsigned int) proc_id);


    //Initialize the input spinor and gauge links to random numbers



    //initialize the random number generator with a seed equal to the process rank
    //NOTE(review): this repeats the srand() call just above with the same seed.
    srand((unsigned int) proc_id);


    //Initialize the input spinor and gauge links to random numbers
    double ru[18];   // 9 complex entries of one su3 link (re,im pairs)
    double rs[24];   // 12 complex entries of one spinor (re,im pairs)

    for(i=0; i<plqcd_g.VOLUME; i++)
    {
        //ranlxd(rs,24);
        for(j=0; j<24; j++)
        {
            rs[j]= rand() / (double)RAND_MAX;
            //fprintf(stderr,"rs[%d]=%lf\n",j,rs[j]);
        }

        pin[i].s0.c0=rs[0]+I*rs[1];
        pin[i].s0.c1=rs[2]+I*rs[3];
        pin[i].s0.c2=rs[4]+I*rs[5];
        pin[i].s1.c0=rs[6]+I*rs[7];
        pin[i].s1.c1=rs[8]+I*rs[9];
        pin[i].s1.c2=rs[10]+I*rs[11];
        pin[i].s2.c0=rs[12]+I*rs[13];
        pin[i].s2.c1=rs[14]+I*rs[15];
        pin[i].s2.c2=rs[16]+I*rs[17];
        pin[i].s3.c0=rs[18]+I*rs[19];
        pin[i].s3.c1=rs[20]+I*rs[21];
        pin[i].s3.c2=rs[22]+I*rs[23];


        //ranlxd(rs,24);
        for(j=0; j<24; j++)
            rs[j]= rand() / (double)RAND_MAX;

        pout[i].s0.c0=rs[0]+I*rs[1];
        pout[i].s0.c1=rs[2]+I*rs[3];
        pout[i].s0.c2=rs[4]+I*rs[5];
        pout[i].s1.c0=rs[6]+I*rs[7];
        pout[i].s1.c1=rs[8]+I*rs[9];
        pout[i].s1.c2=rs[10]+I*rs[11];
        pout[i].s2.c0=rs[12]+I*rs[13];
        pout[i].s2.c1=rs[14]+I*rs[15];
        pout[i].s2.c2=rs[16]+I*rs[17];
        pout[i].s3.c0=rs[18]+I*rs[19];
        pout[i].s3.c1=rs[20]+I*rs[21];
        pout[i].s3.c2=rs[22]+I*rs[23];

        // Four links per site, stored contiguously at ufield[4*i .. 4*i+3]
        for(j=0; j<4; j++)
        {
            //ranlxd(ru,18);
            for(k=0; k<18; k++)
            {
                ru[k]= rand() / (double)RAND_MAX;
                //fprintf(stderr,"ru[%d]=%lf\n",k,ru[k]);
            }


            ufield[4*i+j].c00=ru[0]+I*ru[1];
            ufield[4*i+j].c01=ru[2]+I*ru[3];
            ufield[4*i+j].c02=ru[4]+I*ru[5];
            ufield[4*i+j].c10=ru[6]+I*ru[7];
            ufield[4*i+j].c11=ru[8]+I*ru[9];
            ufield[4*i+j].c12=ru[10]+I*ru[11];
            ufield[4*i+j].c20=ru[12]+I*ru[13];
            ufield[4*i+j].c21=ru[14]+I*ru[15];
            ufield[4*i+j].c22=ru[16]+I*ru[17];
        }

    }

    // Pack pairs of consecutive sites into the 256-bit layout
#ifdef AVX
    for(i=0; i<plqcd_g.VOLUME; i +=2)
    {
        for(j=0; j<4; j++)
            copy_su3_to_su3_256(ufield_256+4*i/2+j, ufield+4*i+j, ufield+4*(i+1)+j);

        copy_spinor_to_spinor_256(pin_256+i/2, pin+i, pin+i+1);
        copy_spinor_to_spinor_256(pout_256+i/2, pout+i, pout+i+1);
    }
#endif

    // Pack groups of four consecutive sites into the 512-bit layout
#ifdef MIC
    for(i=0; i<plqcd_g.VOLUME; i +=4)
    {
        for(j=0; j<4; j++)
            copy_su3_to_su3_512(ufield_512+4*i/4+j, ufield+4*i+j, ufield+4*(i+1)+j, ufield+4*(i+2)+j, ufield+4*(i+3)+j);

        copy_spinor_to_spinor_512(pin_512+i/4, pin+i, pin+i+1, pin+i+2, pin+i+3);
        copy_spinor_to_spinor_512(pout_512+i/4, pout+i, pout+i+1, pout+i+2, pout+i+3);
    }
#endif


    // Timing state shared by all sections below:
    //   mytotal - time accumulated on this process from the kernels' return values
    //   total   - sum of mytotal over all processes (averaged on process 0)
    //   matvecs - number of kernel applications counted so far
    // Each hopping section repeats batches of Nmul applications until
    // mytotal reaches 30 (reported under the "total(sec)" label). The
    // factor 1200 in the MFlops formula is presumably the flop count per
    // site -- confirm.
    double total,t1=0.0,t2=0.0,mytotal;
    int  matvecs;


#ifdef ASSYMBLY
    //---------------------------------------------
    //1: non-blocking assembly/c version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_sse3_assymbly(pin,pout,ufield);
            t2=plqcd_hopping_matrix_oe_sse3_assymbly(pin,pout,ufield);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    // Sum the per-process times onto rank 0, then send that sum to everyone
    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    // Only process 0 converts the sum into an average over processes
    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"non-blocking assymbly/c version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }
#endif


#ifdef SSE3_INTRIN
    //---------------------------------------------
    //1: non-blocking sse3 with intrinsics version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_sse3_intrin(pin,pout,ufield);
            t2=plqcd_hopping_matrix_oe_sse3_intrin(pin,pout,ufield);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"non-blocking sse3 with intrinsics version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }



    //---------------------------------------------
    //2: blocking sse3 with intrinsics version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_sse3_intrin_blocking(pin,pout,ufield);
            t2=plqcd_hopping_matrix_oe_sse3_intrin_blocking(pin,pout,ufield);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"blocking sse3 with intrinsics version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }
#endif


#ifdef AVX
    //---------------------------------------------
    //2: avx version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    // One untimed call before the measurement loop (its result is overwritten)
    t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256);
    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256);
            t2=plqcd_hopping_matrix_oe_intrin_256(pin_256,pout_256,ufield_256);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"avxversion:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }
#endif


#ifdef MIC

#ifdef TEST_HOPPING_MIC
    //---------------------------------------------
    //3: MIC version full su3 matrix
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    // One untimed call before the measurement loop (its result is overwritten)
    t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512);
            //t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512);
            // Both timed calls use the eo kernel here, and each one is
            // counted separately (matvecs advances by 2*Nmul per batch).
            t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);
            t2=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);
            mytotal += t1+t2;
        }
        matvecs += 2*Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"mic version, 3x3 links:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }


    //---------------------------------------------
    //3: MIC version full reduced su3 storage
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    // One untimed call before the measurement loop (its result is overwritten)
    t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512);
            //t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512);
            t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);
            t2=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);
            mytotal += t1+t2;
        }
        matvecs += 2*Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"mic version, 2x3 links:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }

#endif

#ifdef TEST_SU3MUL_MIC

    // Stand-alone su3 matrix times vector benchmark: for every packed
    // site, the four links at ufield_512[4*j..4*j+3] are applied to the
    // four consecutive vectors starting at pin_512[j].s0, and the results
    // are stored starting at pout_512[j].s0. Unlike the sections above
    // this runs exactly Nmul passes and is timed here with stop_watch().
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    //while(mytotal < 10)
    //{
    MPI_Barrier(MPI_COMM_WORLD);
    for(i=0; i<Nmul; i++)
    {
        t1=stop_watch(0.0);

#ifdef _OPENMP
        #pragma omp parallel
        {
#endif
            // Declared inside the parallel region so each thread has its own copies
            __m512d U[3][3], gin[3],gout[3];
            su3_512 *u0;
            su3_vector_512 *hin,*hout;
#ifdef _OPENMP
            #pragma omp for
#endif
            for(j=0; j< plqcd_g.VOLUME/4; j++)
            {
                u0  = &ufield_512[4*j];
                hin = &pin_512[j].s0;
                hout= &pout_512[j].s0;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);

                u0++;
                hin++;
                hout++;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);
                u0++;
                hin++;
                hout++;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);
                u0++;
                hin++;
                hout++;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);

            }
#ifdef _OPENMP
        }
#endif

        t2 = stop_watch(t1);
        mytotal += t2;
    }
    // NOTE(review): matvecs is an int; this product may overflow for
    // large Nmul*VOLUME -- confirm the expected ranges.
    matvecs += 4*Nmul*plqcd_g.VOLUME;
    //}

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);

    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"su3mul mic version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*66.0/total/1e+6);
    }
#endif

#endif //MIC

    // NOTE(review): none of the amalloc'ed buffers is released before
    // finalisation; presumably left to process exit -- confirm.
    finalize_plqcd();

    return 0;
}

示例#8

0

显示文件

文件： mxam.c 项目： g-koutsou/LAP2015

/*
 * Benchmark driver for mulNxN() over L matrices of size N x N.
 *
 * Usage: <prog> L nreps
 *
 * Times batches of nreps_inner calls, nreps times, and doubles
 * nreps_inner until the standard deviation of the per-call time is
 * below 1/15 of its mean. On exit from the loop tave holds the mean
 * time per call and tvar its standard deviation.
 *
 * Exits with 1 on a wrong argument count and 2 on a malformed or
 * non-positive argument.
 */
int
main(int argc, char *argv[])
{
  if(argc != 3) {
    usage(argv);
    exit(1);
  }

  char *e;
  int L = (int)strtoul(argv[1], &e, 10);
  /* Also reject L <= 0: the allocations below scale with L */
  if(*e != '\0' || L <= 0) {
    usage(argv);
    exit(2);
  }

  int nreps = (int)strtoul(argv[2], &e, 10);
  /* nreps is a divisor below; zero would make the averages NaN and the
     convergence test could then never succeed */
  if(*e != '\0' || nreps <= 0) {
    usage(argv);
    exit(2);
  }

  double *x = alloc(sizeof(double)*L*N*N);
  double *y = alloc(sizeof(double)*L*N*N);
  double *a = alloc(sizeof(double)*N*N);

  /* Each of the L matrices occupies N*N consecutive doubles, so matrix i
     starts at offset i*N*N. (This assumes randNxN() fills exactly one
     N x N matrix, as its use on `a` suggests -- confirm.) */
  randNxN(a);
  for(int i=0; i<L; i++)
    randNxN(&y[(size_t)i*N*N]);
  for(int i=0; i<L; i++)
    randNxN(&x[(size_t)i*N*N]);

  int nreps_inner = 2;
  double tave = 0;
  double tvar = 0;
  for(;;) {
    tave = 0;
    tvar = 0;
    /* Untimed call before each measurement round */
    mulNxN(L, y, a, x);
    for(int i=0; i<nreps; i++)
      {
        double t0 = stop_watch(0);
        for(int j=0; j<nreps_inner; j++)
          mulNxN(L, y, a, x);
        t0 = stop_watch(t0)/nreps_inner;
        tave += t0;
        tvar += t0*t0;
      }
    tave /= (double)nreps;
    tvar /= (double)nreps;
    /* Rounding can leave the variance marginally negative when the
       samples are (nearly) identical; clamp so sqrt() cannot give NaN */
    double var = tvar - tave*tave;
    tvar = sqrt(var > 0 ? var : 0);
    if(tvar < tave/15)
      break;
    nreps_inner = nreps_inner*2;
  }

  /*
    ___TODO_1___

    Print:
     1) Time per kernel call
     2) Sustained floating-point rate (GFlop/sec)
     3) Sustained bandwidth (GBytes/sec)
     Note: keep as function of N
   */
  free(x);
  free(y);
  free(a);
  return 0;
}

示例#9

0

显示文件

文件： SceneLoader.cpp 项目： RichardOpenGL/KRender

bool SceneLoader::LoadFromFile(const char* file_name)
{
	KTimer stop_watch(true);
	FILE* pFile = NULL;
	bool ret = false;
	int ext = _FileExtension(file_name);

	std::string file_dir;
	GetPathDir(file_name, file_dir);
	Texture::TextureManager::GetInstance()->AddSearchPath(file_dir.c_str());

	if (!mpScene)
		mpScene = new KSceneSet;
	else
		mpScene->Reset();

	mIsFromOBJ = false;
	mIsSceneLoaded = false;
	KTimer fileReadingTime(true);
	// Perform the file reading
	if (ext == FILE_EXT_OBJ) {
		KRT_ObjFileLoader OBJLoader;
		OBJLoader.mUseTexMap = USE_TEX_MAP ? true : false;

		// Create scene
		UINT32 kd_idx = 0;
		KScene* pKDScene = mpScene->AddKDScene(kd_idx);
		mpScene->SceneNode_Create(kd_idx);

		if (OBJLoader.LoadObjFile(file_name, *pKDScene)) {
			ret = true;
			mIsFromOBJ = true;
		}
		else {
			mpScene->Reset();
			ret = false;
		}
	}
	else if (ext == FILE_EXT_ABC) {
		if (mAbcLoader.Load(file_name, *mpScene))
			ret = true;
		else {
			mpScene->Reset();
			ret = false;
		}
	}

	if (ret) {
		BuildNodeIdMap();
		mIsSceneLoaded = true;
	}

	mFileLoadingTime = UINT32(fileReadingTime.Stop() * 1000);

	// End of file reading, now build the acceleration structure
	mpAccelData = new KAccelStruct_BVH(mpScene);
	mpAccelData->SceneNode_BuildAccelData(NULL);

	KBBox scene_box = mpAccelData->GetSceneBBox();
	KVec3 center = scene_box.Center();
	float radius = nvmath::length(scene_box.mMax - scene_box.mMin) * 1.0f;
	
	CameraManager* pCameraMan = CameraManager::GetInstance();
	if (pCameraMan->GetCameraCnt() == 0 && mpScene) {
		// If there's no camera, create a default light regarding the bounding box of scene
		KCamera* pPinHoleCamera = pCameraMan->OpenCamera("__default", true);
		KCamera::MotionState ms;
		ms.pos = center + KVec3(0.5,0.5,0.5)*radius;
		ms.lookat = center;
		ms.up = KVec3(0, 1.0f, 0);
		ms.xfov = 45.0f;
		ms.focal = radius * 0.5f;
		pPinHoleCamera->SetupStillCamera(ms);
	}
	
	LightScheme* pLightScheme = LightScheme::GetInstance();
	if (pLightScheme->GetLightCount() == 0 && mpScene) {
		// If there's no light source, create a default one, otherwize the scene will be entirely dark.
		PointLightBase* pLight0 = dynamic_cast<PointLightBase*>(pLightScheme->CreateLightSource(POINT_LIGHT_TYPE));
		PointLightBase* pLight1 = dynamic_cast<PointLightBase*>(pLightScheme->CreateLightSource(POINT_LIGHT_TYPE));
		pLight0->SetIntensity(KColor(0.55f, 0.55f, 0.55f));
		pLight1->SetIntensity(KColor(0.55f, 0.55f, 0.55f));
		pLight0->SetPos(center + KVec3(0,1,1)*(radius*3.0f));
		pLight1->SetPos(center + KVec3(1,1,0)*(radius*3.0f));
	}

	if (pFile)
		fclose(pFile);

	mLoadingTime = stop_watch.Stop();

	return ret;
}

示例#10

0

显示文件

文件： main.cpp 项目： bangeneticalgorithms/septaNextBus

/*
 * Program entry point.
 *
 * Reads the settings and the command-line arguments, then runs every stage
 * that was requested through the parsed Arguments flags, in this order:
 *   - read_stop_data:     fetch and store stop data (all routes or one route)
 *   - read_bus_data:      endless polling loop (1 second per tick) that
 *                         stores new bus positions
 *   - create_coeff_data:  build coefficient data (all routes or one route)
 *   - optimize_front_end: endless polling loop (1 second per tick) that keeps
 *                         the front-end bus / weather / prediction data fresh
 *
 * The two polling loops contain no exit condition, so a stage placed after an
 * enabled polling stage is never reached.
 *
 * Returns 0 when no polling stage was requested and all stages finished.
 */
int main(int argc, char **argv)
{
    Arguments args;

    Settings::readSettings();

    args.readArguments(argc, argv);

    if(args.print_mini_help) args.printMiniHelp();
    if(args.print_help) args.printHelp();

    args.printSetup();

    //load gtfs databases to mysql
    load_gtfs(args.reload_gtfs);

    //collect stop data
    if(args.read_stop_data)
    {
        if(args.route_all)
        {
            //get all the routes
            string call_data = make_curl_call("http://www3.septa.org/hackathon/TransitViewAll/");
            vector<string> route_list;
            parse_bus_data(call_data, &route_list);

            //get and store stop data for all routes
            for(size_t i=0;i<route_list.size();i++)
                get_and_store_stop_data(route_list[i]);
        }
        else
        {
            //store stop data for just the specified route
            get_and_store_stop_data(args.route_id);
        }
    }

    //collect bus data
    if(args.read_bus_data)
    {
        MysqlDB *myobj = MysqlDB::getInstance();
        GoogDir *googdir = GoogDir::getInstance();
        map<int, BusObject> old_buses_cache;

        //setup google directions api
        googdir->to_front_end = false;
        googdir->setRouteID(args.route_id, args.route_all);

        //s counts loop ticks; each tick ends with sleep(1)
        for(int s=0;1;s++)
        {
            //let googdir manage itself
            googdir->doProcess();

            //read new weather when the stored one is older than
            //WEATHER_TIME_REGRAB + 30, or, once every 15 ticks, when the
            //front end reports that a newer update is available
            if((abs(time(0) - Weather::getInstance()->last_timestamp) > WEATHER_TIME_REGRAB + 30)
                    || (!(s%15) && Weather::getInstance()->newRecentFrontEndWeatherAvailable()))
            {
                if(!Weather::getInstance()->getRecentWeather())
                    printf("Weather::getRecentWeather() failed!\n");
            }

            //read bus data every 5 ticks
            if(!(s%5))
            {
                stop_watch();

                //get all the routes
                string call_data = make_curl_call("http://www3.septa.org/hackathon/TransitViewAll/");
                vector<BusObject> bus_list = parse_bus_data(call_data);

                //needed for google directions
                googdir->makeRouteList(bus_list);

                //store every bus that passes all the filters below
                for(size_t i=0;i<bus_list.size();i++)
                {
                    BusObject &bus = bus_list[i];

                    if(!bus_is_new(bus, old_buses_cache)) continue;

                    //don't bother with buses 1 second + old
                    if(bus.offset) continue;

                    //this a route we care about?
                    if(!args.route_all && bus.route_id != args.route_id) continue;

                    //make sure weather and googdir timestamp are reasonable
                    if(!bus_weather_and_googdir_timestamp_good(bus)) continue;

                    myobj->insertBusData(bus);
                }

                stop_watch("read_bus_data");
            }

            //pause and grab data again
            sleep(1);
        }
    }

    //create coefficients
    if(args.create_coeff_data)
    {
        MysqlDB *myobj = MysqlDB::getInstance();

        if(args.route_all)
        {
            //the route list comes from the database; the live feed is still
            //fetched and parsed, but its parsed result is not used here
            //NOTE(review): presumably parse_bus_data() is called for a side
            //effect - confirm, otherwise the fetch can be dropped
            string call_data = make_curl_call("http://www3.septa.org/hackathon/TransitViewAll/");
            vector<string> route_list = myobj->getBusDataRoutes();
            parse_bus_data(call_data);

            //create_coefficient_data for all routes
            for(size_t i=0;i<route_list.size();i++)
                create_coefficient_data(route_list[i], args.stop_all, args.stop_id, args.iterations, args.retrain_time);
        }
        else
        {
            //create_coefficient_data for just the specified route
            create_coefficient_data(args.route_id, args.stop_all, args.stop_id, args.iterations, args.retrain_time);
        }
    }

    //keep front end php scripts up to date with bus / weather data
    if(args.optimize_front_end)
    {
        MysqlDB *myobj = MysqlDB::getInstance();
        GoogDir *googdir = GoogDir::getInstance();
        PredictionCache *pcache = PredictionCache::getInstance();

        pcache->setRouteID(args.route_id, args.route_all);
        pcache->setStopID(args.stop_id, args.stop_all);

        //setup google directions api
        googdir->to_front_end = true;
        googdir->setRouteID(args.route_id, args.route_all);

        //clear out old bus predictions
        myobj->clearPredictions();

        //s counts loop ticks; each tick ends with sleep(1)
        for(int s=0;1;s++)
        {
            //let googdir manage itself
            googdir->doProcess();

            //read weather data when the stored one is older than WEATHER_TIME_REGRAB
            if(abs(time(0) - Weather::getInstance()->last_timestamp) > WEATHER_TIME_REGRAB)
            {
                WeatherObject w_obj;

                if(Weather::getInstance()->getRecentWeather(w_obj, false))
                {
                    pcache->setWeather(w_obj);

                    //report the call that actually failed
                    if(!myobj->insertWeatherFrontEnd(w_obj))
                        printf("MysqlDB::insertWeatherFrontEnd() failed!\n");
                }
                else
                    printf("Weather::getRecentWeather() failed!\n");
            }

            //read bus data every 5 ticks
            if(!(s%5))
            {
                stop_watch();

                //get all the routes
                string call_data = make_curl_call("http://www3.septa.org/hackathon/TransitViewAll/");
                vector<BusObject> bus_list = parse_bus_data(call_data);

                //needed for google directions
                googdir->makeRouteList(bus_list);

                //clear old stored bus data
                myobj->clearBusFrontEnd();

                //store the buses of the selected route(s) for the front end
                for(size_t i=0;i<bus_list.size();i++)
                {
                    BusObject &bus = bus_list[i];

                    if(args.route_all || bus.route_id == args.route_id)
                        myobj->insertBusData(bus, true);
                }

                stop_watch("read_bus_data");

                stop_watch();
                pcache->processBusList(bus_list);
                stop_watch("cache_bus_predictions");
            }

            //pause and grab data again
            sleep(1);
        }
    }

    return 0;
}

示例#11

0

显示文件

文件： axpy.c 项目： g-koutsou/LAP2015

/*
 * Main: benchmark the axpy() kernel on complex single-precision vectors.
 *
 * Command line: argv[1] = vector length L, argv[2] = number of timing
 * samples (nreps). Both must be strictly positive decimal integers;
 * otherwise usage() is printed and the program exits with status 2
 * (status 1 for a wrong argument count).
 *
 * Each sample times nreps_inner back-to-back axpy() calls. nreps_inner
 * is doubled until the standard deviation of the samples drops below
 * 1/25 of their mean, then the timing and the derived rates are printed.
 */
int
main(int argc, char *argv[])
{
  if(argc != 3) {
    usage(argv);
    exit(1);
  }

  char *e;
  int L = (int)strtoul(argv[1], &e, 10);
  /* Reject empty input, trailing garbage and non-positive lengths */
  if(e == argv[1] || *e != '\0' || L <= 0) {
    usage(argv);
    exit(2);
  }

  int nreps = (int)strtoul(argv[2], &e, 10);
  /* nreps is a divisor below: zero would yield NaN and an endless loop */
  if(e == argv[2] || *e != '\0' || nreps <= 0) {
    usage(argv);
    exit(2);
  }

  _Complex float *x = alloc(sizeof(_Complex float)*L);
  _Complex float *y = alloc(sizeof(_Complex float)*L);
  _Complex float a;

  random_vec(L, x);
  random_vec(L, y);
  random_vec(1, &a);
  
  /* Untimed call before measuring (warm-up) */
  axpy(L, a, x, y);
  int nreps_inner = 2;
  double tave = 0;
  double tvar = 0;
  for(int k=0; ;k++) {
    tave = 0;
    tvar = 0;
    for(int i=0; i<nreps; i++)
      {
	double t0 = stop_watch(0);
	for(int j=0; j<nreps_inner; j++)
	  axpy(L, a, x, y);
	t0 = stop_watch(t0)/nreps_inner;
	tave += t0;
	tvar += t0*t0;
      }
    tave /= (double)nreps;
    tvar /= (double)nreps;
    /*
     * <t^2> - <t>^2 can come out slightly negative from rounding when
     * the samples are (almost) identical; clamp it so that sqrt() does
     * not return NaN, which would make the test below fail forever.
     */
    double var = tvar - tave*tave;
    tvar = sqrt(var > 0 ? var : 0);
    if(tvar < tave/25)
      break;
    nreps_inner = nreps_inner*2;    
  }

  /* 
     ___TODO_1___
     Print: 
     1) Time per kernel call with error (usec)
     2) Sustained floating-point rate (GFlop/sec)
     3) Sustained bandwidth (GBytes/sec)
  */
  /*
   * Counted as 8 flops and 3*8 bytes per element. The products are
   * formed in double precision so that large L cannot overflow int.
   */
  double beta_fp = (8.0*L/tave)*1e-9;
  double beta_io = (8.0*3*L/tave)*1e-9;
  printf(" L = %12d, %4.2e ± %4.2e usec/call, perf. = %6.4e GFlop/sec, bw = %6.4e GBytes/sec\n",
	 L, tave*1e6, tvar*1e6, beta_fp, beta_io);
  
  free(x);
  free(y);  
  return 0;
}

示例#12

0

显示文件

文件： mm.c 项目： g-koutsou/CoS-2

/*
 * Times a matrix multiplication C = A*B and, when compiled with BLCK,
 * also a blocked variant whose result is compared against the flat one.
 *
 * Command line: argv[1] = M, argv[2] = N, both strictly positive
 * integers. A and B hold M*N elements each, C (and Cb) hold M*M.
 * Returns 1 after printing usage() on bad arguments, 0 otherwise.
 */
int
main(int argc, char *argv[])
{
  int nargs = 3;  
  if(argc != nargs) {
    usage(argv);
    return 1;
  } 
  int M = atoi(argv[1]);
  int N = atoi(argv[2]);
  /* atoi() yields 0 for non-numeric input; sizes must be positive */
  if(M <= 0 || N <= 0) {
    usage(argv);
    return 1;
  }
  
  double *A = alloc(sizeof(double)*M*N);
  double *B = alloc(sizeof(double)*M*N);

  rand_mat(A, M, N);
  rand_mat(B, N, M);

  double *C = alloc(sizeof(double)*M*M);
  zero_mat(C, M, M);

  /*
   * Flop count of one product, assuming the standard algorithm: each
   * of the M*M elements of C takes N multiplications and N additions.
   * Computed in double so that large sizes cannot overflow int.
   */
  double nflop = 2.0*M*M*N;
  
  /* Untimed call before measuring (warm-up) */
  mat_mul(C, M, N, A, B);
  {
    double t0 = stop_watch(0);
    mat_mul(C, M, N, A, B);
    t0 = stop_watch(t0);
    /* Sustained rate in Mflop/s; 0 if the timer reported no elapsed time */
    double beta_fp = t0 > 0 ? nflop/t0*1e-6 : 0;
    printf(" ORIG: M = %d, N = %d,", M, N);
    printf(" took: %4.2e sec,", t0);
    printf(" P = %4.2e Mflop/s\n", beta_fp);
  }

#ifdef BLCK
  double *Cb = alloc(sizeof(double)*M*M);
  zero_mat(Cb, M, M);

  /* Untimed call before measuring (warm-up) */
  mat_mul_blocked(Cb, M, N, A, B);
  {
    double t0 = stop_watch(0);
    mat_mul_blocked(Cb, M, N, A, B);
    t0 = stop_watch(t0);
    /* Sustained rate in Mflop/s; 0 if the timer reported no elapsed time */
    double beta_fp = t0 > 0 ? nflop/t0*1e-6 : 0;
    printf(" BLCK: M = %d, N = %d,", M, N);
    printf(" took: %4.2e sec,", t0);
    printf(" P = %4.2e Mflop/s, BM = %d, BN = %d\n", beta_fp, BM, BN);
  }
#endif
  
#ifdef BLCK
  double eps = 1e-12;
  double diff = 0;
  /* Element count as size_t: M*M in int overflows for large M */
  size_t nelem = (size_t)M*M;
  for(size_t i=0; i<nelem; i++) {
    diff += fabs((C[i] - Cb[i])/C[i]);
  }
  /*
   * If the mean relative difference between the flat and blocked
   * result is larger than eps, complain to stdout and write the
   * per-element relative differences to file "diffs.out".
   */
  diff /= (double)nelem;
  if(diff > eps) {
    printf(" Non zero diff: %e\n", diff);
    FILE *fp = fopen("diffs.out", "w");
    if(fp == NULL) {
      /* Do not write through a NULL stream; report and carry on */
      perror("diffs.out");
    } else {
      for(size_t i=0; i<nelem; i++)
	fprintf(fp, "%e\n", fabs((C[i]-Cb[i])/C[i]));
      fclose(fp);
    }
  }
#endif
  
  free(A);
  free(B);
  free(C);

#ifdef BLCK
  free(Cb);
#endif 
  return 0;
}

示例#13

0

显示文件

文件： k8s.cpp 项目： thesrinivas/rakshak

// Destructor: calls stop_watch() and then cleanup(), in that order.
// NOTE(review): presumably stop_watch() halts an active watch before
// cleanup() releases resources, so the order matters - confirm against
// the definitions of both helpers.
k8s::~k8s()
{
	stop_watch();
	cleanup();
}