Beispiel #1
0
int main(int argc, char **argv)
{
  // OP initialisation
  op_init(argc,argv,2);

  //MPI for user I/O
  int my_rank;
  int comm_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  int *pp;
  double *A, *r, *u, *du;

  int   nnode, nedge;

  /**------------------------BEGIN I/O and PARTITIONING ---------------------**/

  int g_nnode, g_nedge, g_n, g_e;

  g_nnode = (NN-1)*(NN-1);
  g_nedge = (NN-1)*(NN-1) + 4*(NN-1)*(NN-2);

  int *g_pp = 0;
  double *g_A = 0, *g_r = 0, *g_u = 0, *g_du = 0;

  op_printf("Global number of nodes, edges = %d, %d\n",g_nnode,g_nedge);

  if(my_rank == MPI_ROOT) {
    g_pp = (int *)malloc(sizeof(int)*2*g_nedge);

    g_A  = (double *)malloc(sizeof(double)*g_nedge);
    g_r  = (double *)malloc(sizeof(double)*g_nnode);
    g_u  = (double *)malloc(sizeof(double)*g_nnode);
    g_du = (double *)malloc(sizeof(double)*g_nnode);

    // create matrix and r.h.s., and set coordinates needed for renumbering / partitioning

    g_e = 0;

    for (int i=1; i<NN; i++) {
      for (int j=1; j<NN; j++) {
        g_n         = i-1 + (j-1)*(NN-1);
        g_r[g_n]      = 0.0f;
        g_u[g_n]      = 0.0f;
        g_du[g_n]     = 0.0f;

        g_pp[2*g_e]   = g_n;
        g_pp[2*g_e+1] = g_n;
        g_A[g_e]      = -1.0f;
        g_e++;

        for (int pass=0; pass<4; pass++) {
          int i2 = i;
          int j2 = j;
          if (pass==0) i2 += -1;
          if (pass==1) i2 +=  1;
          if (pass==2) j2 += -1;
          if (pass==3) j2 +=  1;

          if ( (i2==0) || (i2==NN) || (j2==0) || (j2==NN) ) {
            g_r[g_n] += 0.25f;
          }
          else {
            g_pp[2*g_e]   = g_n;
            g_pp[2*g_e+1] = i2-1 + (j2-1)*(NN-1);
            g_A[g_e]      = 0.25f;
            g_e++;
          }
        }
      }
    }
  }

  /* Compute local sizes */
  nnode = compute_local_size (g_nnode, comm_size, my_rank);
  nedge = compute_local_size (g_nedge, comm_size, my_rank);
  op_printf("Number of nodes, edges on process %d = %d, %d\n"
      ,my_rank,nnode,nedge);

  /*Allocate memory to hold local sets, mapping tables and data*/
  pp = (int *)malloc(2*sizeof(int)*nedge);

  A      = (double *) malloc(nedge*sizeof(double));
  r      = (double *) malloc(nnode*sizeof(double));
  u      = (double *) malloc(nnode*sizeof(double));
  du     = (double *) malloc(nnode*sizeof(double));

  /* scatter sets, mappings and data on sets*/
  scatter_int_array(g_pp, pp, comm_size, g_nedge,nedge, 2);
  scatter_double_array(g_A, A, comm_size, g_nedge,nedge, 1);
  scatter_double_array(g_r, r, comm_size, g_nnode,nnode, 1);
  scatter_double_array(g_u, u, comm_size, g_nnode,nnode, 1);
  scatter_double_array(g_du, du, comm_size, g_nnode,nnode, 1);

  /*Freeing memory allocated to gloabal arrays on rank 0
    after scattering to all processes*/
  if(my_rank == MPI_ROOT) {
    free(g_pp);
    free(g_A);
    free(g_r);
    free(g_u);
    free(g_du);
  }

  /**------------------------END I/O and PARTITIONING ---------------------**/

  // declare sets, pointers, and datasets

  op_set nodes = op_decl_set(nnode,"nodes");
  op_set edges = op_decl_set(nedge,"edges");

  op_map ppedge = op_decl_map(edges,nodes,2,pp, "ppedge");

  op_dat p_A = op_decl_dat(edges,1,"double", A,  "p_A" );
  op_dat p_r = op_decl_dat(nodes,1,"double", r,  "p_r" );
  op_dat p_u = op_decl_dat(nodes,1,"double", u,  "p_u" );
  op_dat p_du = op_decl_dat(nodes,1,"double", du,"p_du");

  alpha = 1.0f;
  op_decl_const(1,"double",&alpha);

  op_diagnostic_output();

  //trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", NULL, NULL, NULL);

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main iteration loop

  double u_sum, u_max, beta = 1.0f;

  for (int iter=0; iter<NITER; iter++) {
    op_par_loop(res,"res", edges,
        op_arg_dat(p_A,  -1,OP_ID,  1,"double", OP_READ),
        op_arg_dat(p_u,   1,ppedge, 1,"double", OP_READ),
        op_arg_dat(p_du,  0,ppedge, 1,"double", OP_INC),
        op_arg_gbl(&beta, 1,"double", OP_READ));

    u_sum = 0.0f;
    u_max = 0.0f;
    op_par_loop(update,"update", nodes,
        op_arg_dat(p_r,   -1,OP_ID, 1,"double",OP_READ),
        op_arg_dat(p_du,  -1,OP_ID, 1,"double",OP_RW),
        op_arg_dat(p_u,   -1,OP_ID, 1,"double",OP_INC),
        op_arg_gbl(&u_sum,1,"double",OP_INC),
        op_arg_gbl(&u_max,1,"double",OP_MAX));

    op_printf("\n u max/rms = %f %f \n\n",u_max, sqrt(u_sum/g_nnode));
  }

  op_timers(&cpu_t2, &wall_t2);

  //get results data array
  op_fetch_data(p_u, u);

  //output the result dat array to files
  op_print_dat_to_txtfile(p_u, "out_grid_mpi.dat"); //ASCI
  op_print_dat_to_binfile(p_u, "out_grid_mpi.bin"); //Binary

  printf("solution on rank %d\n", my_rank);
  for (int i = 0; i<nnode; i++) {
    printf(" %7.4f",u[i]);fflush(stdout);
  }
  printf("\n");

  //print each mpi process's timing info for each kernel
  op_timing_output();

  //print total time for niter interations
  op_printf("Max total runtime = %f\n",wall_t2-wall_t1);

  //gather results from all ranks and check
  double* ug = (double *)malloc(sizeof(double)*op_get_size(nodes));
  op_fetch_data_hdf5(p_u, ug, 0, op_get_size(nodes)-1);
  int result = check_result<double>(ug, NN, TOLERANCE);
  free(ug);

  op_exit();

  free(u);
  free(pp);
  free(A);
  free(r);
  free(du);

  return result;
}
Beispiel #2
0
int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int *groups = (int *)malloc(size * sizeof(int));
  int *groups2 = (int *)malloc(size * sizeof(int));
  int my_type = 1; //This is to be read from a configuration file
  MPI_Allgather(&my_type, 1, MPI_INT, groups, 1, MPI_INT, MPI_COMM_WORLD);

  int num_groups = 0;
  for (int i = 0; i < size; i++) num_groups = num_groups > groups[i] ? num_groups : groups[i];
  num_groups++;

  //The global group
  MPI_Group global_grp;
  MPI_Comm_group(MPI_COMM_WORLD, &global_grp);

  //Create sub-groups and sub-communicators
  MPI_Group mpigroups[num_groups];
  MPI_Comm mpicomms[num_groups];
  int count = 0;
  for (int i = 0; i < num_groups; ++i) {
    count = 0;
    for (int j = 0; j < size; ++j) {
      if (groups[j] == i) {
        groups2[count++] = j;
      }
    }
    MPI_Group_incl(global_grp, count, groups2, &mpigroups[i]);
    MPI_Comm_create(MPI_COMM_WORLD, mpigroups[i], &mpicomms[i]);
  }

  //coupling procs
  for (int i = 0; i < 1; ++i) {
    count = 0;
    for (int j = 0; j < size; ++j) {
      if (groups[j] == i) {
        groups2[count++] = j;
      }
    }
  }

  // OP initialisation
  op_mpi_init(argc,argv,2,MPI_COMM_WORLD, mpicomms[1]);

  int    niter;
  double  rms;

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  char file[] = "new_grid.h5";

  // declare sets, pointers, datasets and global constants

  op_set nodes  = op_decl_set_hdf5(file, "nodes");
  op_set edges  = op_decl_set_hdf5(file,  "edges");
  op_set bedges = op_decl_set_hdf5(file, "bedges");
  op_set cells  = op_decl_set_hdf5(file,  "cells");

  op_map pedge   = op_decl_map_hdf5(edges, nodes, 2, file, "pedge");
  op_map pecell  = op_decl_map_hdf5(edges, cells,2, file, "pecell");
  op_map pbedge  = op_decl_map_hdf5(bedges,nodes,2, file, "pbedge");
  op_map pbecell = op_decl_map_hdf5(bedges,cells,1, file, "pbecell");
  op_map pcell   = op_decl_map_hdf5(cells, nodes,4, file, "pcell");
  op_map pbndbnd   = op_decl_map_hdf5(bedges, bedges,1, file, "pbndbnd");

  op_map m_test  = op_decl_map_hdf5(cells, nodes,4, file, "m_test");
  if (m_test == NULL) printf("m_test not found\n");

  op_dat p_bound = op_decl_dat_hdf5(bedges,1,"int"  ,file,"p_bound");
  op_dat p_x     = op_decl_dat_hdf5(nodes ,2,"double",file,"p_x");
  op_dat p_q     = op_decl_dat_hdf5(cells ,4,"double",file,"p_q");
  op_dat p_qold  = op_decl_dat_hdf5(cells ,4,"double",file,"p_qold");
  op_dat p_adt   = op_decl_dat_hdf5(cells ,1,"double",file,"p_adt");
  op_dat p_res   = op_decl_dat_hdf5(cells ,4,"double",file,"p_res");

  op_dat p_test  = op_decl_dat_hdf5(cells ,4,"double",file,"p_test");
  if (p_test == NULL) printf("p_test not found\n");

  op_get_const_hdf5("gam", 1, "double", (char *)&gam, "new_grid.h5");
  op_get_const_hdf5("gm1", 1, "double", (char *)&gm1, "new_grid.h5");
  op_get_const_hdf5("cfl", 1, "double", (char *)&cfl, "new_grid.h5");
  op_get_const_hdf5("eps", 1, "double", (char *)&eps, "new_grid.h5");
  op_get_const_hdf5("mach", 1, "double", (char *)&mach, "new_grid.h5");
  op_get_const_hdf5("alpha", 1, "double", (char *)&alpha, "new_grid.h5");
  op_get_const_hdf5("qinf", 4, "double", (char *)&qinf, "new_grid.h5");

  op_decl_const(1,"double",&gam  );
  op_decl_const(1,"double",&gm1  );
  op_decl_const(1,"double",&cfl  );
  op_decl_const(1,"double",&eps  );
  op_decl_const(1,"double",&mach );
  op_decl_const(1,"double",&alpha);
  op_decl_const(4,"double",qinf  );

  op_diagnostic_output();

  //write back original data just to compare you read the file correctly
  //do an h5diff between new_grid_out.h5 and new_grid.h5 to
  //compare two hdf5 files
  op_dump_to_hdf5("new_grid_out.h5");

  op_write_const_hdf5("gam",1,"double",(char *)&gam,  "new_grid_out.h5");
  op_write_const_hdf5("gm1",1,"double",(char *)&gm1,  "new_grid_out.h5");
  op_write_const_hdf5("cfl",1,"double",(char *)&cfl,  "new_grid_out.h5");
  op_write_const_hdf5("eps",1,"double",(char *)&eps,  "new_grid_out.h5");
  op_write_const_hdf5("mach",1,"double",(char *)&mach,  "new_grid_out.h5");
  op_write_const_hdf5("alpha",1,"double",(char *)&alpha,  "new_grid_out.h5");
  op_write_const_hdf5("qinf",4,"double",(char *)qinf,  "new_grid_out.h5");

  //trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", edges, pecell, p_x);
  //op_partition("PARMETIS", "KWAY", edges, pecell, p_x);

  int g_ncell = op_get_size(cells);

  //create some temporaries so we can exchange data defined on the boundary
  double *ptr = NULL;
  op_dat center = op_decl_dat_temp(bedges, 3, "double", ptr, "center");
  op_dat pres = op_decl_dat_temp(bedges, 1, "double", ptr, "pres");

  int *ptr2 = NULL;
  op_dat p_bound2 = op_decl_dat_temp(bedges, 1, "int", ptr2, "p_bound2");
  op_dat center2 = op_decl_dat_temp(bedges, 3, "double", ptr, "center2");
  op_dat pres2 = op_decl_dat_temp(bedges, 1, "double", ptr, "pres2");

  //create import and export handles
  op_export_handle handle = op_export_init(count, groups2, pbndbnd);
  op_import_handle handle2 = op_import_init(count, groups2, center);

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop

  niter = 1000;

  for(int iter=1; iter<=niter; iter++) {

    //  save old flow solution

    op_par_loop(save_soln,"save_soln", cells,
        op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_READ ),
        op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_WRITE));

    //  predictor/corrector update loop

    for(int k=0; k<2; k++) {

      //    calculate area/timstep

      op_par_loop(adt_calc,"adt_calc",cells,
          op_arg_dat(p_x,   0,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,   1,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,   2,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,   3,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_q,  -1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_adt,-1,OP_ID, 1,"double",OP_WRITE));

      //    calculate flux residual

      op_par_loop(res_calc,"res_calc",edges,
          op_arg_dat(p_x,    0,pedge, 2,"double",OP_READ),
          op_arg_dat(p_x,    1,pedge, 2,"double",OP_READ),
          op_arg_dat(p_q,    0,pecell,4,"double",OP_READ),
          op_arg_dat(p_q,    1,pecell,4,"double",OP_READ),
          op_arg_dat(p_adt,  0,pecell,1,"double",OP_READ),
          op_arg_dat(p_adt,  1,pecell,1,"double",OP_READ),
          op_arg_dat(p_res,  0,pecell,4,"double",OP_INC ),
          op_arg_dat(p_res,  1,pecell,4,"double",OP_INC ));

      op_par_loop(bres_calc,"bres_calc",bedges,
          op_arg_dat(p_x,     0,pbedge, 2,"double",OP_READ),
          op_arg_dat(p_x,     1,pbedge, 2,"double",OP_READ),
          op_arg_dat(p_q,     0,pbecell,4,"double",OP_READ),
          op_arg_dat(p_adt,   0,pbecell,1,"double",OP_READ),
          op_arg_dat(p_res,   0,pbecell,4,"double",OP_INC ),
          op_arg_dat(p_bound,-1,OP_ID  ,1,"int",  OP_READ),
          op_arg_dat(center, -1, OP_ID, 3, "double", OP_WRITE),
          op_arg_dat(pres, -1, OP_ID, 1, "double", OP_WRITE));

      //    update flow field

      rms = 0.0;

      op_par_loop(update,"update",cells,
          op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_WRITE),
          op_arg_dat(p_res, -1,OP_ID, 4,"double",OP_RW   ),
          op_arg_dat(p_adt, -1,OP_ID, 1,"double",OP_READ ),
          op_arg_gbl(&rms,1,"double",OP_INC));
    }

    //  print iteration history

    rms = sqrt(rms/(double)g_ncell);

    if (iter%100 == 0) {
      op_printf(" %d  %10.5e \n",iter,rms);
      //Export data
      op_dat arr[] = {p_bound, center, pres};
      op_export_data(handle, 3, arr);
      //Import data
      op_dat arr2[] = {p_bound2, center2, pres2};
      op_import_data(handle2, 3, arr2);
      //check whether the two are the same
      op_par_loop(comparethem, "comparethem", bedges,
          op_arg_dat(p_bound,-1, OP_ID, 1, "int", OP_READ),
          op_arg_dat(p_bound2,-1, OP_ID, 1, "int", OP_READ),
          op_arg_dat(center,-1, OP_ID, 3, "double", OP_READ),
          op_arg_dat(center2,-1, OP_ID, 3, "double", OP_READ),
          op_arg_dat(pres,-1, OP_ID, 1, "double", OP_READ),
          op_arg_dat(pres2,-1, OP_ID, 1, "double", OP_READ));
    }
  }

  op_timers(&cpu_t2, &wall_t2);

  double* q = (double *)malloc(sizeof(double)*op_get_size(cells)*4);
  op_fetch_data_hdf5(p_q, q, 0, op_get_size(cells)-1);
  free(q);

  op_fetch_data_hdf5_file(p_q, "file_name.h5");

  //printf("Root process = %d\n",op_is_root());

  //output the result dat array to files
  //op_write_hdf5("new_grid_out.h5");

  //compress using
  // ~/hdf5/bin/h5repack -f GZIP=9 new_grid.h5 new_grid_pack.h5

  op_timing_output();
  op_printf("Max total runtime = \n%f\n",wall_t2-wall_t1);
  op_exit();
}
int main(int argc, char **argv)
{
  // OP initialisation
  op_init(argc,argv,2);

  int    niter;
  double  rms;

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  char file[] = "new_grid.h5";

  // declare sets, pointers, datasets and global constants

  op_set nodes  = op_decl_set_hdf5(file, "nodes");
  op_set edges  = op_decl_set_hdf5(file,  "edges");
  op_set bedges = op_decl_set_hdf5(file, "bedges");
  op_set cells  = op_decl_set_hdf5(file,  "cells");

  op_map pedge   = op_decl_map_hdf5(edges, nodes, 2, file, "pedge");
  op_map pecell  = op_decl_map_hdf5(edges, cells,2, file, "pecell");
  op_map pbedge  = op_decl_map_hdf5(bedges,nodes,2, file, "pbedge");
  op_map pbecell = op_decl_map_hdf5(bedges,cells,1, file, "pbecell");
  op_map pcell   = op_decl_map_hdf5(cells, nodes,4, file, "pcell");

  op_map m_test  = op_decl_map_hdf5(cells, nodes,4, file, "m_test");
  if (m_test == NULL) printf("m_test not found\n");

  op_dat p_bound = op_decl_dat_hdf5(bedges,1,"int"  ,file,"p_bound");
  op_dat p_x     = op_decl_dat_hdf5(nodes ,2,"double",file,"p_x");
  op_dat p_q     = op_decl_dat_hdf5(cells ,4,"double",file,"p_q");
  op_dat p_qold  = op_decl_dat_hdf5(cells ,4,"double",file,"p_qold");
  op_dat p_adt   = op_decl_dat_hdf5(cells ,1,"double",file,"p_adt");
  op_dat p_res   = op_decl_dat_hdf5(cells ,4,"double",file,"p_res");

  op_dat p_test  = op_decl_dat_hdf5(cells ,4,"double",file,"p_test");
  if (p_test == NULL) printf("p_test not found\n");

  op_get_const_hdf5("gam", 1, "double", (char *)&gam, "new_grid.h5");
  op_get_const_hdf5("gm1", 1, "double", (char *)&gm1, "new_grid.h5");
  op_get_const_hdf5("cfl", 1, "double", (char *)&cfl, "new_grid.h5");
  op_get_const_hdf5("eps", 1, "double", (char *)&eps, "new_grid.h5");
  op_get_const_hdf5("mach", 1, "double", (char *)&mach, "new_grid.h5");
  op_get_const_hdf5("alpha", 1, "double", (char *)&alpha, "new_grid.h5");
  op_get_const_hdf5("qinf", 4, "double", (char *)&qinf, "new_grid.h5");

  op_decl_const2("gam",1,"double",&gam);
  op_decl_const2("gm1",1,"double",&gm1);
  op_decl_const2("cfl",1,"double",&cfl);
  op_decl_const2("eps",1,"double",&eps);
  op_decl_const2("mach",1,"double",&mach);
  op_decl_const2("alpha",1,"double",&alpha);
  op_decl_const2("qinf",4,"double",qinf);

  op_diagnostic_output();

  //write back original data just to compare you read the file correctly
  //do an h5diff between new_grid_out.h5 and new_grid.h5 to
  //compare two hdf5 files
  op_dump_to_hdf5("new_grid_out.h5");

  op_write_const_hdf5("gam",1,"double",(char *)&gam,  "new_grid_out.h5");
  op_write_const_hdf5("gm1",1,"double",(char *)&gm1,  "new_grid_out.h5");
  op_write_const_hdf5("cfl",1,"double",(char *)&cfl,  "new_grid_out.h5");
  op_write_const_hdf5("eps",1,"double",(char *)&eps,  "new_grid_out.h5");
  op_write_const_hdf5("mach",1,"double",(char *)&mach,  "new_grid_out.h5");
  op_write_const_hdf5("alpha",1,"double",(char *)&alpha,  "new_grid_out.h5");
  op_write_const_hdf5("qinf",4,"double",(char *)qinf,  "new_grid_out.h5");

  //trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", edges, pecell, p_x);
  //op_partition("PARMETIS", "KWAY", edges, pecell, p_x);

  int g_ncell = op_get_size(cells);


  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop

  niter = 1000;

  for(int iter=1; iter<=niter; iter++) {

    //  save old flow solution

    op_par_loop_save_soln("save_soln",cells,
                op_arg_dat(p_q,-1,OP_ID,4,"double",OP_READ),
                op_arg_dat(p_qold,-1,OP_ID,4,"double",OP_WRITE));

    //  predictor/corrector update loop

    for(int k=0; k<2; k++) {

      //    calculate area/timstep

      op_par_loop_adt_calc("adt_calc",cells,
                  op_arg_dat(p_x,0,pcell,2,"double",OP_READ),
                  op_arg_dat(p_x,1,pcell,2,"double",OP_READ),
                  op_arg_dat(p_x,2,pcell,2,"double",OP_READ),
                  op_arg_dat(p_x,3,pcell,2,"double",OP_READ),
                  op_arg_dat(p_q,-1,OP_ID,4,"double",OP_READ),
                  op_arg_dat(p_adt,-1,OP_ID,1,"double",OP_WRITE));

      //    calculate flux residual

      op_par_loop_res_calc("res_calc",edges,
                  op_arg_dat(p_x,0,pedge,2,"double",OP_READ),
                  op_arg_dat(p_x,1,pedge,2,"double",OP_READ),
                  op_arg_dat(p_q,0,pecell,4,"double",OP_READ),
                  op_arg_dat(p_q,1,pecell,4,"double",OP_READ),
                  op_arg_dat(p_adt,0,pecell,1,"double",OP_READ),
                  op_arg_dat(p_adt,1,pecell,1,"double",OP_READ),
                  op_arg_dat(p_res,0,pecell,4,"double",OP_INC),
                  op_arg_dat(p_res,1,pecell,4,"double",OP_INC));

      op_par_loop_bres_calc("bres_calc",bedges,
                  op_arg_dat(p_x,0,pbedge,2,"double",OP_READ),
                  op_arg_dat(p_x,1,pbedge,2,"double",OP_READ),
                  op_arg_dat(p_q,0,pbecell,4,"double",OP_READ),
                  op_arg_dat(p_adt,0,pbecell,1,"double",OP_READ),
                  op_arg_dat(p_res,0,pbecell,4,"double",OP_INC),
                  op_arg_dat(p_bound,-1,OP_ID,1,"int",OP_READ));

      //    update flow field

      rms = 0.0;

      op_par_loop_update("update",cells,
                  op_arg_dat(p_qold,-1,OP_ID,4,"double",OP_READ),
                  op_arg_dat(p_q,-1,OP_ID,4,"double",OP_WRITE),
                  op_arg_dat(p_res,-1,OP_ID,4,"double",OP_RW),
                  op_arg_dat(p_adt,-1,OP_ID,1,"double",OP_READ),
                  op_arg_gbl(&rms,1,"double",OP_INC));
    }

    //  print iteration history

    rms = sqrt(rms/(double)g_ncell);

    if (iter%100 == 0)
      op_printf(" %d  %10.5e \n",iter,rms);
  }

  op_timers(&cpu_t2, &wall_t2);

  double* q = (double *)malloc(sizeof(double)*op_get_size(cells)*4);
  op_fetch_data_hdf5(p_q, q, 0, op_get_size(cells)-1);
  free(q);

  op_fetch_data_hdf5_file(p_q, "file_name.h5");

  //printf("Root process = %d\n",op_is_root());

  //output the result dat array to files
  //op_write_hdf5("new_grid_out.h5");

  //compress using
  // ~/hdf5/bin/h5repack -f GZIP=9 new_grid.h5 new_grid_pack.h5

  op_timing_output();
  op_printf("Max total runtime = \n%f\n",wall_t2-wall_t1);
  op_exit();
}