Пример #1
0
int
main(int argc,char *argv[])
{
  int    i,j,k,nn;
  int    mx,my,mz,it;
  float  gosa;
  double cpu,cpu0,cpu1,flop,target;

  target= 60.0;
  omega= 0.8;
  mx= MX0-1;
  my= MY0-1;
  mz= MZ0-1;
  ndx= NDX0;
  ndy= NDY0;
  ndz= NDZ0;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &npe);
  MPI_Comm_rank(MPI_COMM_WORLD, &id);

  int    namelen;
  char   processor_name[MPI_MAX_PROCESSOR_NAME];
  MPI_Get_processor_name(processor_name,&namelen);
  fprintf(stderr, "[%d] %s\n", id, processor_name);

  initcomm(ndx,ndy,ndz);
  it= initmax(mx,my,mz);

  /*
   *    Initializing matrixes
   */
  initmt(mx,it);

  float *sendp2_buf = (float*)malloc(MIMAX*MKMAX*sizeof(float)*4);
  sendp2_lo_sendbuf = &sendp2_buf[MIMAX*MKMAX*0];
  sendp2_lo_recvbuf = &sendp2_buf[MIMAX*MKMAX*1];
  sendp2_hi_sendbuf = &sendp2_buf[MIMAX*MKMAX*2];
  sendp2_hi_recvbuf = &sendp2_buf[MIMAX*MKMAX*3];
#pragma acc enter data create(sendp2_buf[0:MIMAX*MKMAX*4])

  if(id==0){
    printf("Sequential version array size\n");
    printf(" mimax = %d mjmax = %d mkmax = %d\n",MX0,MY0,MZ0);
    printf("Parallel version array size\n");
    printf(" mimax = %d mjmax = %d mkmax = %d\n",MIMAX,MJMAX,MKMAX);
    printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax);
    printf("I-decomp = %d J-decomp = %d K-decomp =%d\n",ndx,ndy,ndz);
  }

  nn= 3;
  if(id==0){
    printf(" Start rehearsal measurement process.\n");
    printf(" Measure the performance in %d times.\n\n",nn);
  }

#pragma acc data copyin(p, bnd, wrk1, wrk2, a, b, c) present(sendp2_buf[0:MIMAX*MKMAX*4])
  {
  MPI_Barrier(MPI_COMM_WORLD);
  cpu0= gettime();
  gosa= jacobi(nn);
  cpu1= gettime();
  cpu = cpu1 - cpu0;

  MPI_Allreduce(MPI_IN_PLACE,
                &cpu,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                MPI_COMM_WORLD);

  flop= fflop(mz,my,mx);

  if(id == 0){
    printf(" MFLOPS: %f time(s): %f %e\n\n",
           mflops(nn,cpu,flop),cpu,gosa);
  }

  nn= (int)(target/(cpu/3.0));
  nn= LOOP_TIMES;
  halo_time = 0.0;
  if(id == 0){
    printf(" Now, start the actual measurement process.\n");
    printf(" The loop will be excuted in %d times\n",nn);
    printf(" This will take about one minute.\n");
    printf(" Wait for a while\n\n");
  }

  /*
   *    Start measuring
   */
  MPI_Barrier(MPI_COMM_WORLD);
  cpu0= gettime();
  gosa= jacobi(nn);
  cpu1= gettime();
  cpu = cpu1 - cpu0;

  MPI_Allreduce(MPI_IN_PLACE,
                &cpu,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                MPI_COMM_WORLD);

  MPI_Allreduce(&halo_time,
                &max_halo_time,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                MPI_COMM_WORLD);

  MPI_Allreduce(&halo_time,
                &ave_halo_time,
                1,
                MPI_DOUBLE,
                MPI_SUM,
                MPI_COMM_WORLD);
  ave_halo_time /= npe;
  }//end of acc data

  if(id == 0){
    printf("cpu : %f sec. halo(AVE.) %f sec. halo(MAX) %f sec.\n", cpu, ave_halo_time, max_halo_time);
    printf("Loop executed for %d times\n",nn);
    printf("Gosa : %e \n",gosa);
    printf("MFLOPS measured : %f\n",mflops(nn,cpu,flop));
    printf("Score based on Pentium III 600MHz : %f\n",
           mflops(nn,cpu,flop)/82.84);
  }

  free(sendp2_buf);

  MPI_Finalize();

  return (0);
}
Пример #2
0
/*
 * Driver for the MPI build with checkpoint/restart support.
 *
 * Usage: ./bmt <Restart #> <Checkpoint interval (steps)>
 *
 * Stores the two command-line values in the globals restart_id and interval,
 * calls restart() when restart_id is positive, then runs jacobi() twice: a
 * 3-iteration rehearsal and a measured run whose iteration count is derived
 * from the rehearsal time so that it lasts roughly `target` seconds.
 * Rank 0 prints the configuration and the results.
 *
 * Returns 0 on success, 1 when the argument count is wrong.
 */
int
main(int argc,char *argv[])
{
  int    nn;
  int    mx,my,mz,it;
  float  gosa;
  double cpu,cpu0,cpu1,flop,target;

  target= 60.0;
  omega= 0.8;
  mx= MX0-1;
  my= MY0-1;
  mz= MZ0-1;
  ndx= NDX0;
  ndy= NDY0;
  ndz= NDZ0;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &npe);
  MPI_Comm_rank(MPI_COMM_WORLD, &id);

  hime_err_init(id);
  if (argc != 3) {
    if (id == 0) {
      printf("./bmt <Restart #> <Checkpoint interval (steps)>\n");
      printf("\n");
      printf("   Restart #:\n");
      printf("      Checkpoint id at which bmt starts\n");
      printf("   Checkpoint interval (steps):\n");
      printf("      # of Steps to skip checkpointing\n");
    }
    MPI_Finalize();
    /* A usage error is a failure: report it through the exit status. */
    return (1);
  }

  /* NOTE(review): atoi() maps non-numeric input to 0 without any diagnostic. */
  restart_id = atoi(argv[1]);
  interval   = atoi(argv[2]);

  hime_dbgi(0, "Checkpoint directory: %s", CHECKPOINT_DIR);
  hime_dbgi(0, "Checkpoint interval:  %d", interval);

  /*
   * NOTE(review): restart() runs before initcomm/initmax/initmt; confirm that
   * the initialization below does not discard the state it restores.
   */
  if (restart_id > 0) {
    hime_dbgi(0, "Restart ID:  %d", restart_id);
    restart(restart_id);
  }

  initcomm(ndx,ndy,ndz);
  it= initmax(mx,my,mz);

  /*
   *    Initializing matrixes
   */
  initmt(mx,it);

  if(id==0){
    printf("Sequential version array size\n");
    printf(" mimax = %d mjmax = %d mkmax = %d\n",MX0,MY0,MZ0);
    printf("Parallel version array size\n");
    printf(" mimax = %d mjmax = %d mkmax = %d\n",MIMAX,MJMAX,MKMAX);
    printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax);
    printf("I-decomp = %d J-decomp = %d K-decomp =%d\n",ndx,ndy,ndz);
  }

  nn= 3;
  if(id==0){
    printf(" Start rehearsal measurement process.\n");
    printf(" Measure the performance in %d times.\n\n",nn);
  }

  /* Rehearsal run: the slowest rank's elapsed time is the one that counts. */
  MPI_Barrier(MPI_COMM_WORLD);
  cpu0= MPI_Wtime();
  gosa= jacobi(nn);
  cpu1= MPI_Wtime() - cpu0;

  MPI_Allreduce(&cpu1,
                &cpu,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                MPI_COMM_WORLD);

  flop= fflop(mz,my,mx);
  if(id == 0){
    printf(" MFLOPS: %f time(s): %f %e\n\n",
           mflops(nn,cpu,flop),cpu,gosa);
  }

  /*
   * cpu/3.0 is the time per rehearsal iteration, so this is the number of
   * iterations that fits into `target` seconds.  cpu is the reduced maximum,
   * hence every rank computes the same count.
   * NOTE(review): assumes cpu > 0; a zero rehearsal time would make the
   * conversion to int undefined.
   */
  nn= (int)(target/(cpu/3.0));

  if(id == 0){
    printf(" Now, start the actual measurement process.\n");
    printf(" The loop will be excuted in %d times\n",nn);
    printf(" This will take about one minute.\n");
    printf(" Wait for a while\n\n");
  }

  /*
   *    Start measuring
   */
  MPI_Barrier(MPI_COMM_WORLD);
  cpu0 = MPI_Wtime();
  gosa = jacobi(nn);
  cpu1 = MPI_Wtime() - cpu0;

  MPI_Allreduce(&cpu1,
                &cpu,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                MPI_COMM_WORLD);

  if(id == 0){
    printf("cpu : %f sec.\n", cpu);
    printf("Loop executed for %d times\n",nn);
    printf("Gosa : %e \n",gosa);
    printf("MFLOPS measured : %f\n",mflops(nn,cpu,flop));
    printf("Score based on Pentium III 600MHz : %f\n",
           mflops(nn,cpu,flop)/82.84);
  }


  MPI_Finalize();

  return (0);
}
Пример #3
0
int
main(int argc,char *argv[])
{
  int    i,j,k,nn;
  int    mx,my,mz,it;
  float  gosa;
  double cpu,cpu0,cpu1,flop,target;

  target= 60.0;
  omega= 0.8;
  mx= MX0-1;
  my= MY0-1;
  mz= MZ0-1;
  ndx= NDX0;
  ndy= NDY0;
  ndz= NDZ0;

  MPI_Init(&argc, &argv);
#ifdef SCR_ENABLE
  SCR_Init();
#endif
  
  MPI_Comm_size(MPI_COMM_WORLD, &npe);
  MPI_Comm_rank(MPI_COMM_WORLD, &id);

  initcomm(ndx,ndy,ndz);
  it= initmax(mx,my,mz);

  /*
   *    Initializing matrixes
   */
  initmt(mx,it);

  if(id==0){
    printf("Sequential version array size\n");
    printf(" mimax = %d mjmax = %d mkmax = %d\n",MX0,MY0,MZ0);
    printf("Parallel version array size\n");
    printf(" mimax = %d mjmax = %d mkmax = %d\n",MIMAX,MJMAX,MKMAX);
    printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax);
    printf("I-decomp = %d J-decomp = %d K-decomp =%d\n",ndx,ndy,ndz);
  }

  nn= 3;
  if(id==0){
    printf(" Start rehearsal measurement process.\n");
    printf(" Measure the performance in %d times.\n\n",nn);
  }

  MPI_Barrier(MPI_COMM_WORLD);
  cpu0= MPI_Wtime();
  gosa= jacobi(nn);
  cpu1= MPI_Wtime() - cpu0;

  MPI_Allreduce(&cpu1,
                &cpu,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                MPI_COMM_WORLD);

  flop= fflop(mz,my,mx);
  if(id == 0){
    printf(" MFLOPS: %f time(s): %f %e\n\n",
           mflops(nn,cpu,flop),cpu,gosa);
  }
 nn= (int)(target/(cpu/3.0));

  if(id == 0){
    printf(" Now, start the actual measurement process.\n");
    printf(" The loop will be excuted in %d times\n",nn);
    printf(" This will take about one minute.\n");
    printf(" Wait for a while\n\n");
  }

  /*
   *    Start measuring
   */
  MPI_Barrier(MPI_COMM_WORLD);
  cpu0 = MPI_Wtime();
  //  nn = 10000000;
  gosa = jacobi(nn);
  cpu1 = MPI_Wtime() - cpu0;

  MPI_Allreduce(&cpu1,
                &cpu,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                MPI_COMM_WORLD);

  if(id == 0){
    fprintf(stderr, "cpu : %f sec.\n", cpu);
    fprintf(stderr, "Loop executed for %d times\n",nn);
    fprintf(stderr, "Gosa : %e \n",gosa);
    fprintf(stderr, "GFLOPS measured : %f\n",mflops(nn,cpu,flop)/1000.0);
    fprintf(stderr, "Score based on Pentium III 600MHz : %f\n",
           mflops(nn,cpu,flop)/82.84);
  }

#ifdef SCR_ENABLE  
  SCR_Finalize();
#endif
  MPI_Finalize();
  
  return (0);
}
Пример #4
0
/*
 * Single-process CUDA driver of the benchmark.
 *
 * Selects the GPU, creates two streams, allocates and initializes the data,
 * then launches jacobi_GPU_btm_even ITERS times while swapping the two
 * pressure buffers each iteration.  The residual (gosa) is copied back
 * asynchronously after every launch and printed every PRINT_ITER iterations.
 * Finally the elapsed wall-clock time is converted into GFLOPS and bandwidth
 * figures and printed.
 *
 * Returns EXIT_SUCCESS; the two synchronization points are checked through
 * CUDA_SAFE_CALL like the other runtime calls in this function.
 */
int main( int argc, char*argv[] )
{
  float *p_old,*p_new,*p_tmp;
  int    n,nn;
  float  gosa,gflops,thruput,thruput2;
  double time_start,time_max,target,bytes;
  cudaStream_t stream_top,stream_btm;

  NP=1;
  gpu=0;
  ME=0;

  target= 60.0;
  omega= 0.8f;
  imax = MIMAX-1;
  jmax = MJMAX-1;
  kmax = MKMAX-1;
  imax_global = NP*(imax-2)+2;
  nn = ITERS;

  if(ME==0)
  {
    printf("\n mimax = %d mjmax = %d mkmax = %d pitch = %d\n",MIMAX, MJMAX, MKMAX, PITCH);
    printf(" imax = %d jmax = %d kmax = %d\n",imax_global,jmax,kmax);
    printf(" gridX = %d  gridY = %d  blockX = %d  blockY = %d\n", GRID_X, GRID_Y, BLOCK_X, BLOCK_Y);
  }
  //printf("There are %d processes, I am process# %d using GPU %d\n",NP,ME,gpu);

  CUDA_SAFE_CALL(cudaSetDevice(gpu));
  stream_top = 0;
  stream_btm = 0;

#if (CUDART_VERSION >= 3000)
  {
#if (CUDART_VERSION > 3000)
      struct cudaDeviceProp prop;
      // display ECC configuration, only queryable post r3.0
      CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, gpu));
      printf (" ECC on GPU %d is %s\n", gpu, prop.ECCEnabled ? "ON" : "OFF");
#endif /* CUDART_VERSION > 3000 */
      // configure kernels for large shared memory to get better occupancy
      printf (" Configuring GPU L1 cache size ...\n");
      set_kernel_cache_config (cudaFuncCachePreferShared);
  }
#endif /* CUDART_VERSION >= 3000 */

  CUDA_SAFE_CALL(cudaStreamCreate(&stream_top));
  CUDA_SAFE_CALL(cudaStreamCreate(&stream_btm));

  if(ME==0) printf(" Allocating Memory...\n");
  allocate_memory();
  if(ME==0) printf(" Initializing Data...\n\n");
  initmt();

  if(ME==0)
  {
    printf(" Now, start GPU measurement process.\n");
    printf(" The loop will be excuted %d times\n",nn);
    printf(" Wait for a while\n\n");
  }

  time_start = wallclock();

  gosa = 0.0f;
  p_new = p2_d; p_old = p1_d;

  for(n=0 ; n<nn; n++)
  {
    //swap pointers
    p_tmp = p_new; p_new = p_old; p_old = p_tmp;
    jacobi_GPU_btm_even (stream_btm,a0_d,a1_d,a2_d,a3_d,b0_d,b1_d,b2_d,c0_d,
                         c1_d,c2_d,wrk_d,bnd_d,p_old,p_new,gosa_d,omega,n);

    // NOTE(review): the copy's own return code is still unchecked. It is not
    // wrapped because CUDA_SAFE_CALL is defined elsewhere and may synchronize,
    // which would distort the timed loop -- confirm before wrapping.
    cudaMemcpyAsync (gosa_btm, gosa_d, sizeof(float), cudaMemcpyDeviceToHost,
                     stream_btm);
    // Since we want to print intermediate values of gosa every PRINT_ITER
    // iterations, we need to synchronize before picking up the asynchronously
    // updated value.  The synchronization result is checked so that a failed
    // kernel or copy stops the run instead of printing a stale residual.
    if (!(n % PRINT_ITER)) {
        CUDA_SAFE_CALL(cudaStreamSynchronize(stream_btm));
        gosa = *gosa_btm;
        if(ME==0) printf(" iter: %d \tgosa: %e\n",n,gosa);
    }
  }

  // Wait for all outstanding work and check it: without this check a failed
  // run would still report timing and GFLOPS figures.
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  gosa = *gosa_btm;
  time_max = wallclock() - time_start;

  gflops   = (float)(34.0*( (double)nn*(double)(imax_global-2)*(double)(jmax-2)*(double)(kmax-2) ) / time_max * 1e-9);
  bytes    = NP*((double)nn*(56.0*(imax-2)+8.0)*(double)(jmax)*(double)(kmax));
  thruput  = (float)(bytes / time_max / 1024.0 / 1024.0 / 1024.0);   // GiB-based
  thruput2 = (float)(bytes / time_max / 1e9);                        // 10^9-based

  if(ME==0)
  {
    printf(" \nLoop executed for %d times\n",nn);
    printf(" Gosa : %e \n",gosa);
    printf(" total Compute   : %4.1f GFLOPS\ttime : %f seconds\n",gflops,time_max);
    printf(" total Bandwidth : %4.1f GB/s\n", thruput);
    printf(" total Bandwidth : %4.1f GB/s (STREAM equivalent)\n",thruput2);
    // NOTE(review): the divisor here is 82.0; confirm it is intended, since
    // the score is only comparable with results using the same baseline.
    printf(" Score based on Pentium III 600MHz : %f\n\n",1000.0*gflops/82.0);
  }
  // NOTE(review): cleanup() runs before the streams are destroyed; confirm it
  // does not tear down the CUDA context the streams belong to.
  cleanup();

  CUDA_SAFE_CALL(cudaStreamDestroy(stream_top));
  CUDA_SAFE_CALL(cudaStreamDestroy(stream_btm));

  //check_results();
  return (EXIT_SUCCESS);
}
Пример #5
0
/*
 * Driver for the XcalableMP build of the benchmark.
 *
 * Runs jacobi() for a 3-iteration rehearsal, derives the iteration count of
 * the measured run from the rehearsal time (taking the maximum over all
 * nodes), runs it, and lets the first node print the results.
 */
int
main()
{
  const double target_seconds = 60.0;  /* intended length of the measured run */
  /* xmp_node_num() is decremented to get a zero-based rank; rank 0 reports. */
  const int    is_root = (xmp_node_num() - 1 == 0);
  int          loops;
  float        residual;
  double       t_begin, t_end, elapsed, work;

  omega = 0.8;
  imax  = MIMAX;
  jmax  = MJMAX;
  kmax  = MKMAX;

  /* Set up the matrices. */
  initmt();

  /* Rehearsal: three iterations, announced together with the array sizes. */
  loops = 3;
  if (is_root) {
    printf("mimax = %d mjmax = %d mkmax = %d\n",MIMAX, MJMAX, MKMAX);
    printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax);
    printf(" Start rehearsal measurement process.\n");
    printf(" Measure the performance in %d times.\n\n",loops);
  }

  t_begin  = xmp_wtime();
  residual = jacobi(loops);
  t_end    = xmp_wtime();
  elapsed  = t_end - t_begin;

  work = fflop(imax,jmax,kmax);

  if (is_root) {
    printf(" MFLOPS: %f time(s): %f %e\n\n", mflops(loops,elapsed,work),elapsed,residual);
  }

  /*
   * Time per rehearsal iteration -> iterations that fit the target time.
   * Each node measured its own time, so the counts are reduced to their
   * maximum to make every node run the same number of iterations.
   */
  loops = (int)(target_seconds/(elapsed/3.0));
#pragma xmp reduction (max:loops)

  if (is_root) {
    printf(" Now, start the actual measurement process.\n");
    printf(" The loop will be excuted in %d times\n",loops);
    printf(" This will take about one minute.\n");
    printf(" Wait for a while\n\n");
  }

  /* Measured run. */
  t_begin  = xmp_wtime();
  residual = jacobi(loops);
  t_end    = xmp_wtime();
  elapsed  = t_end - t_begin;

  if (is_root) {
    printf(" Loop executed for %d times\n",loops);
    printf(" Gosa : %e \n",residual);
    printf(" MFLOPS measured : %f\tcpu : %f\n",mflops(loops,elapsed,work),elapsed);
    printf(" Score based on Pentium III 600MHz : %f\n",
           mflops(loops,elapsed,work)/82.84);
  }

  return (0);
}