Example 1
static int get_solaris_eeprom_parameter(char *parameter,char *outbuffer) {

    int fd=0,status=0;
    struct openpromio *openprominfo=NULL;

    fd=open("/dev/openprom",O_RDONLY);
    if ( fd == -1 ) {
        snmp_log(LOG_ERR,"cannot open /dev/openprom\n");
        return 1;
    }

    openprominfo=(struct openpromio *)op_malloc(8192);
    if(!openprominfo) {
        close(fd);              /* do not leak the descriptor on allocation failure */
        return 1;
    }

    /* OPROMGETOPT expects the usable buffer size in oprom_size and the
       property name in oprom_array */
    openprominfo->oprom_size = 8192 - sizeof(int);
    strcpy(openprominfo->oprom_array,parameter);

    status=ioctl(fd,OPROMGETOPT,openprominfo);
    if ( status == -1 ) {
        snmp_log(LOG_ERR,"cannot read from /dev/openprom\n");
        close(fd);
        op_free(openprominfo);
        return 1;
    }
    strcpy(outbuffer,openprominfo->oprom_array);

    op_free(openprominfo);

    /* close file */
    close(fd);

    return(0);
}
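A minimal calling sketch for the routine above, assuming the usual net-snmp/Solaris headers; the "banner-name" property and the helper name are only illustrations:

static void log_banner_name(void)
{
    char name[] = "banner-name";   /* illustrative property name */
    char value[8192];

    if (get_solaris_eeprom_parameter(name, value) == 0)
        snmp_log(LOG_INFO, "%s = %s\n", name, value);
    else
        snmp_log(LOG_ERR, "could not read %s from the eeprom\n", name);
}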
Example 2
static void scatter_int_array(int *g_array, int *l_array, int comm_size,
                              int g_size, int l_size, int elem_size) {
  int *sendcnts = (int *)op_malloc(comm_size * sizeof(int));
  int *displs = (int *)op_malloc(comm_size * sizeof(int));
  int disp = 0;

  for (int i = 0; i < comm_size; i++) {
    sendcnts[i] = elem_size * compute_local_size(g_size, comm_size, i);
  }
  for (int i = 0; i < comm_size; i++) {
    displs[i] = disp;
    disp = disp + sendcnts[i];
  }

  MPI_Scatterv(g_array, sendcnts, displs, MPI_INT, l_array, l_size * elem_size,
               MPI_INT, MPI_ROOT, MPI_COMM_WORLD);

  free(sendcnts);
  free(displs);
}
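compute_local_size() and scatter_double_array() are used above and again in Example 4, but they are not part of this listing. The sketch below is an assumption reconstructed from the call sites: a block partition that spreads the remainder over the lowest ranks, and a double-precision twin of scatter_int_array(); MPI_ROOT is taken, as in these examples, to be a macro naming the reading rank (typically 0).

/* Assumed helper: number of elements of a global set owned by mpi_rank. */
int compute_local_size(int global_size, int mpi_comm_size, int mpi_rank) {
  int local_size = global_size / mpi_comm_size;
  int remainder = global_size % mpi_comm_size;
  if (mpi_rank < remainder)
    local_size++; /* the first 'remainder' ranks each take one extra element */
  return local_size;
}

/* Assumed twin of scatter_int_array() for double-precision data. */
static void scatter_double_array(double *g_array, double *l_array, int comm_size,
                                 int g_size, int l_size, int elem_size) {
  int *sendcnts = (int *)op_malloc(comm_size * sizeof(int));
  int *displs = (int *)op_malloc(comm_size * sizeof(int));
  int disp = 0;

  for (int i = 0; i < comm_size; i++)
    sendcnts[i] = elem_size * compute_local_size(g_size, comm_size, i);
  for (int i = 0; i < comm_size; i++) {
    displs[i] = disp;
    disp = disp + sendcnts[i];
  }

  MPI_Scatterv(g_array, sendcnts, displs, MPI_DOUBLE, l_array,
               l_size * elem_size, MPI_DOUBLE, MPI_ROOT, MPI_COMM_WORLD);

  free(sendcnts);
  free(displs);
}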
Example 3
int RepList::add(char * pat1, char * pat2) {
    if (pos >= size || pat1 == NULL || pat2 == NULL) return 1;
    replentry * r = (replentry *) op_malloc(sizeof(replentry));
    if (r == NULL) return 1;
    r->pattern = mystrrep(pat1, "_", " ");
    r->pattern2 = mystrrep(pat2, "_", " ");
    r->start = false;
    r->end = false;
    dat[pos++] = r;
    for (int i = pos - 1; i > 0; i--) {
      r = dat[i];
      if (op_strcmp(r->pattern, dat[i - 1]->pattern) < 0) {
          dat[i] = dat[i - 1];
          dat[i - 1] = r;
      } else break;
    }
    return 0;
}
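A short, hypothetical usage sketch of add(): arguments are stored with '_' turned into ' ' (via mystrrep) and dat[] is kept sorted by pattern through the insertion step; constructing the RepList with enough capacity is assumed to happen elsewhere.

// Hypothetical caller: add two replacement-table entries to an existing list.
// A non-zero return from add() means the table is full or an argument was NULL.
bool fill_replacement_table(RepList &list) {
    char from1[] = "q_u", to1[] = "qu";    // stored as "q u" -> "qu"
    char from2[] = "ph", to2[] = "f";
    return list.add(from1, to1) == 0 && list.add(from2, to2) == 0;
}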
Example 4
int main(int argc, char **argv)
{
  // OP initialisation
  op_init(argc,argv,2);

  //MPI for user I/O
  int my_rank;
  int comm_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  int    *becell, *ecell,  *bound, *bedge, *edge, *cell;
  double  *x, *q, *qold, *adt, *res;

  int    nnode,ncell,nedge,nbedge,niter;
  double  rms;

  /**------------------------BEGIN I/O and PARTITIONING -------------------**/

  op_timers(&cpu_t1, &wall_t1);

  /* read in grid from disk on root processor */
  FILE *fp;

  if ( (fp = fopen("new_grid.dat","r")) == NULL) {
    op_printf("can't open file new_grid.dat\n"); exit(-1);
  }

  int   g_nnode,g_ncell,g_nedge,g_nbedge;

  check_scan(fscanf(fp,"%d %d %d %d \n",&g_nnode, &g_ncell, &g_nedge, &g_nbedge), 4);

  int *g_becell = 0, *g_ecell = 0, *g_bound = 0, *g_bedge = 0, *g_edge = 0, *g_cell = 0;
  double *g_x = 0,*g_q = 0, *g_qold = 0, *g_adt = 0, *g_res = 0;

  // set constants

  op_printf("initialising flow field\n");
  gam = 1.4f;
  gm1 = gam - 1.0f;
  cfl = 0.9f;
  eps = 0.05f;

  double mach  = 0.4f;
  double alpha = 3.0f*atan(1.0f)/45.0f;
  double p     = 1.0f;
  double r     = 1.0f;
  double u     = sqrt(gam*p/r)*mach;
  double e     = p/(r*gm1) + 0.5f*u*u;

  qinf[0] = r;
  qinf[1] = r*u;
  qinf[2] = 0.0f;
  qinf[3] = r*e;

  op_printf("reading in grid \n");
  op_printf("Global number of nodes, cells, edges, bedges = %d, %d, %d, %d\n"
      ,g_nnode,g_ncell,g_nedge,g_nbedge);

  if(my_rank == MPI_ROOT) {
    g_cell   = (int *) malloc(4*g_ncell*sizeof(int));
    g_edge   = (int *) malloc(2*g_nedge*sizeof(int));
    g_ecell  = (int *) malloc(2*g_nedge*sizeof(int));
    g_bedge  = (int *) malloc(2*g_nbedge*sizeof(int));
    g_becell = (int *) malloc(  g_nbedge*sizeof(int));
    g_bound  = (int *) malloc(  g_nbedge*sizeof(int));

    g_x      = (double *) malloc(2*g_nnode*sizeof(double));
    g_q      = (double *) malloc(4*g_ncell*sizeof(double));
    g_qold   = (double *) malloc(4*g_ncell*sizeof(double));
    g_res    = (double *) malloc(4*g_ncell*sizeof(double));
    g_adt    = (double *) malloc(  g_ncell*sizeof(double));

    for (int n=0; n<g_nnode; n++){
      check_scan(fscanf(fp,"%lf %lf \n",&g_x[2*n], &g_x[2*n+1]), 2);
    }

    for (int n=0; n<g_ncell; n++) {
      check_scan(fscanf(fp,"%d %d %d %d \n",&g_cell[4*n  ], &g_cell[4*n+1],
            &g_cell[4*n+2], &g_cell[4*n+3]), 4);
    }

    for (int n=0; n<g_nedge; n++) {
      check_scan(fscanf(fp,"%d %d %d %d \n",&g_edge[2*n],&g_edge[2*n+1],
            &g_ecell[2*n],&g_ecell[2*n+1]), 4);
    }

    for (int n=0; n<g_nbedge; n++) {
      check_scan(fscanf(fp,"%d %d %d %d \n",&g_bedge[2*n],&g_bedge[2*n+1],
            &g_becell[n],&g_bound[n]), 4);
    }

    //initialise flow field and residual

    for (int n=0; n<g_ncell; n++) {
      for (int m=0; m<4; m++) {
        g_q[4*n+m] = qinf[m];
        g_res[4*n+m] = 0.0f;
      }
    }
  }

  fclose(fp);

  nnode = compute_local_size (g_nnode, comm_size, my_rank);
  ncell = compute_local_size (g_ncell, comm_size, my_rank);
  nedge = compute_local_size (g_nedge, comm_size, my_rank);
  nbedge = compute_local_size (g_nbedge, comm_size, my_rank);

  op_printf("Number of nodes, cells, edges, bedges on process %d = %d, %d, %d, %d\n"
      ,my_rank,nnode,ncell,nedge,nbedge);

  /*Allocate memory to hold local sets, mapping tables and data*/
  cell   = (int *) malloc(4*ncell*sizeof(int));
  edge   = (int *) malloc(2*nedge*sizeof(int));
  ecell  = (int *) malloc(2*nedge*sizeof(int));
  bedge  = (int *) malloc(2*nbedge*sizeof(int));
  becell = (int *) malloc(  nbedge*sizeof(int));
  bound  = (int *) malloc(  nbedge*sizeof(int));

  x      = (double *) malloc(2*nnode*sizeof(double));
  q      = (double *) malloc(4*ncell*sizeof(double));
  qold   = (double *) malloc(4*ncell*sizeof(double));
  res    = (double *) malloc(4*ncell*sizeof(double));
  adt    = (double *) malloc(  ncell*sizeof(double));

  /* scatter sets, mappings and data on sets*/
  scatter_int_array(g_cell, cell, comm_size, g_ncell,ncell, 4);
  scatter_int_array(g_edge, edge, comm_size, g_nedge,nedge, 2);
  scatter_int_array(g_ecell, ecell, comm_size, g_nedge,nedge, 2);
  scatter_int_array(g_bedge, bedge, comm_size, g_nbedge,nbedge, 2);
  scatter_int_array(g_becell, becell, comm_size, g_nbedge,nbedge, 1);
  scatter_int_array(g_bound, bound, comm_size, g_nbedge,nbedge, 1);

  scatter_double_array(g_x, x, comm_size, g_nnode,nnode, 2);
  scatter_double_array(g_q, q, comm_size, g_ncell,ncell, 4);
  scatter_double_array(g_qold, qold, comm_size, g_ncell,ncell, 4);
  scatter_double_array(g_res, res, comm_size, g_ncell,ncell, 4);
  scatter_double_array(g_adt, adt, comm_size, g_ncell,ncell, 1);

  /*Freeing memory allocated to global arrays on the root rank
    after scattering to all processes*/
  if(my_rank == MPI_ROOT) {
    free(g_cell);
    free(g_edge);
    free(g_ecell);
    free(g_bedge);
    free(g_becell);
    free(g_bound);
    free(g_x );
    free(g_q);
    free(g_qold);
    free(g_adt);
    free(g_res);
  }

  op_timers(&cpu_t2, &wall_t2);
  op_printf("Max total file read time = %f\n", wall_t2-wall_t1);

  /**------------------------END I/O and PARTITIONING -----------------------**/

  // declare sets, pointers, datasets and global constants

  op_set nodes  = op_decl_set(nnode,  "nodes");
  op_set edges  = op_decl_set(nedge,  "edges");
  op_set bedges = op_decl_set(nbedge, "bedges");
  op_set cells  = op_decl_set(ncell,  "cells");

  op_map pedge   = op_decl_map(edges, nodes,2,edge,  "pedge");
  op_map pecell  = op_decl_map(edges, cells,2,ecell, "pecell");
  op_map pbedge  = op_decl_map(bedges,nodes,2,bedge, "pbedge");
  op_map pbecell = op_decl_map(bedges,cells,1,becell,"pbecell");
  op_map pcell   = op_decl_map(cells, nodes,4,cell,  "pcell");

  op_dat p_bound = op_decl_dat(bedges,1,"int"  ,bound,"p_bound");
  op_dat p_x     = op_decl_dat(nodes ,2,"double",x    ,"p_x");
  op_dat p_q     = op_decl_dat(cells ,4,"double",q    ,"p_q");
  op_dat p_qold  = op_decl_dat(cells ,4,"double",qold ,"p_qold");
  op_dat p_adt   = op_decl_dat(cells ,1,"double",adt  ,"p_adt");
  op_dat p_res   = op_decl_dat(cells ,4,"double",res  ,"p_res");

  op_decl_const2("gam",1,"double",&gam);
  op_decl_const2("gm1",1,"double",&gm1);
  op_decl_const2("cfl",1,"double",&cfl);
  op_decl_const2("eps",1,"double",&eps);
  op_decl_const2("mach",1,"double",&mach);
  op_decl_const2("alpha",1,"double",&alpha);
  op_decl_const2("qinf",4,"double",qinf);

  op_diagnostic_output();

  //trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", cells, pecell, p_x);
  //op_partition("PARMETIS", "KWAY", cells, pecell, p_x);

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  niter = 1000;
  for(int iter=1; iter<=niter; iter++) {

    //save old flow solution
    op_par_loop_save_soln("save_soln",cells,
                op_arg_dat(p_q,-1,OP_ID,4,"double",OP_READ),
                op_arg_dat(p_qold,-1,OP_ID,4,"double",OP_WRITE));

    //  predictor/corrector update loop

    for(int k=0; k<2; k++) {

      //    calculate area/timestep
      op_par_loop_adt_calc("adt_calc",cells,
                  op_arg_dat(p_x,0,pcell,2,"double",OP_READ),
                  op_arg_dat(p_x,1,pcell,2,"double",OP_READ),
                  op_arg_dat(p_x,2,pcell,2,"double",OP_READ),
                  op_arg_dat(p_x,3,pcell,2,"double",OP_READ),
                  op_arg_dat(p_q,-1,OP_ID,4,"double",OP_READ),
                  op_arg_dat(p_adt,-1,OP_ID,1,"double",OP_WRITE));

      //    calculate flux residual
      op_par_loop_res_calc("res_calc",edges,
                  op_arg_dat(p_x,0,pedge,2,"double",OP_READ),
                  op_arg_dat(p_x,1,pedge,2,"double",OP_READ),
                  op_arg_dat(p_q,0,pecell,4,"double",OP_READ),
                  op_arg_dat(p_q,1,pecell,4,"double",OP_READ),
                  op_arg_dat(p_adt,0,pecell,1,"double",OP_READ),
                  op_arg_dat(p_adt,1,pecell,1,"double",OP_READ),
                  op_arg_dat(p_res,0,pecell,4,"double",OP_INC),
                  op_arg_dat(p_res,1,pecell,4,"double",OP_INC));

      op_par_loop_bres_calc("bres_calc",bedges,
                  op_arg_dat(p_x,0,pbedge,2,"double",OP_READ),
                  op_arg_dat(p_x,1,pbedge,2,"double",OP_READ),
                  op_arg_dat(p_q,0,pbecell,4,"double",OP_READ),
                  op_arg_dat(p_adt,0,pbecell,1,"double",OP_READ),
                  op_arg_dat(p_res,0,pbecell,4,"double",OP_INC),
                  op_arg_dat(p_bound,-1,OP_ID,1,"int",OP_READ));

      //    update flow field

      rms = 0.0;

      op_par_loop_update("update",cells,
                  op_arg_dat(p_qold,-1,OP_ID,4,"double",OP_READ),
                  op_arg_dat(p_q,-1,OP_ID,4,"double",OP_WRITE),
                  op_arg_dat(p_res,-1,OP_ID,4,"double",OP_RW),
                  op_arg_dat(p_adt,-1,OP_ID,1,"double",OP_READ),
                  op_arg_gbl(&rms,1,"double",OP_INC));
    }

    //print iteration history
    rms = sqrt(rms/(double) g_ncell);
    if (iter%100 == 0)
      op_printf("%d  %10.5e \n",iter,rms);
  }

  op_timers(&cpu_t2, &wall_t2);

  //output the result dat array to files
  op_print_dat_to_txtfile(p_q, "out_grid_mpi.dat"); //ASCII
  op_print_dat_to_binfile(p_q, "out_grid_mpi.bin"); //Binary

  //write the indicated segment of the given op_dat's data to a memory block,
  //in the order it was originally arranged (i.e. before partitioning and reordering)
  double* q_part = (double *)op_malloc(sizeof(double)*op_get_size(cells)*4);
  op_fetch_data_idx(p_q, q_part, 0, op_get_size(cells)-1);
  free(q_part);

  op_timing_output();
  op_printf("Max total runtime = %f\n",wall_t2-wall_t1);

  op_exit();

  free(cell);
  free(edge);
  free(ecell);
  free(bedge);
  free(becell);
  free(bound);
  free(x);
  free(q);
  free(qold);
  free(res);
  free(adt);
}
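check_scan() is not part of this listing; a sketch consistent with how it is called here (actual item count first, expected count second) and with the hand-written fscanf() checks in Example 9 would be:

/* Assumed helper: abort if fscanf() matched fewer items than expected. */
void check_scan(int items_read, int items_expected) {
  if (items_read != items_expected) {
    op_printf("error reading from new_grid.dat\n");
    exit(-1);
  }
}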
Example 5
int main(int argc, char **argv)
{
  // OP initialisation
  op_init(argc,argv,2);

  int    niter;
  double  rms;

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // set constants and initialise flow field and residual
  op_printf("initialising flow field \n");

  char file[] = "new_grid.h5";

  // declare sets, pointers, datasets and global constants

  op_set nodes  = op_decl_set_hdf5(file, "nodes");
  op_set edges  = op_decl_set_hdf5(file,  "edges");
  op_set bedges = op_decl_set_hdf5(file, "bedges");
  op_set cells  = op_decl_set_hdf5(file,  "cells");

  op_map pedge   = op_decl_map_hdf5(edges, nodes, 2, file, "pedge");
  op_map pecell  = op_decl_map_hdf5(edges, cells,2, file, "pecell");
  op_map pbedge  = op_decl_map_hdf5(bedges,nodes,2, file, "pbedge");
  op_map pbecell = op_decl_map_hdf5(bedges,cells,1, file, "pbecell");
  op_map pcell   = op_decl_map_hdf5(cells, nodes,4, file, "pcell");

  op_map m_test  = op_decl_map_hdf5(cells, nodes,4, file, "m_test");
  if (m_test == NULL) printf("m_test not found\n");

  op_dat p_bound = op_decl_dat_hdf5(bedges,1,"int"  ,file,"p_bound");
  op_dat p_x     = op_decl_dat_hdf5(nodes ,2,"double",file,"p_x");
  op_dat p_q     = op_decl_dat_hdf5(cells ,4,"double",file,"p_q");
  op_dat p_qold  = op_decl_dat_hdf5(cells ,4,"double",file,"p_qold");
  op_dat p_adt   = op_decl_dat_hdf5(cells ,1,"double",file,"p_adt");
  op_dat p_res   = op_decl_dat_hdf5(cells ,4,"double",file,"p_res");

  op_dat p_test  = op_decl_dat_hdf5(cells ,4,"double",file,"p_test");
  if (p_test == NULL) printf("p_test not found\n");

  op_get_const_hdf5("gam", 1, "double", (char *)&gam, "new_grid.h5");
  op_get_const_hdf5("gm1", 1, "double", (char *)&gm1, "new_grid.h5");
  op_get_const_hdf5("cfl", 1, "double", (char *)&cfl, "new_grid.h5");
  op_get_const_hdf5("eps", 1, "double", (char *)&eps, "new_grid.h5");
  op_get_const_hdf5("mach", 1, "double", (char *)&mach, "new_grid.h5");
  op_get_const_hdf5("alpha", 1, "double", (char *)&alpha, "new_grid.h5");
  op_get_const_hdf5("qinf", 4, "double", (char *)&qinf, "new_grid.h5");

  op_decl_const2("gam",1,"double",&gam);
  op_decl_const2("gm1",1,"double",&gm1);
  op_decl_const2("cfl",1,"double",&cfl);
  op_decl_const2("eps",1,"double",&eps);
  op_decl_const2("mach",1,"double",&mach);
  op_decl_const2("alpha",1,"double",&alpha);
  op_decl_const2("qinf",4,"double",qinf);

  op_diagnostic_output();

  //write back the original data just to check that the file was read correctly;
  //an h5diff between new_grid_out.h5 and new_grid.h5 compares the two hdf5 files
  op_dump_to_hdf5("new_grid_out.h5");

  op_write_const_hdf5("gam",1,"double",(char *)&gam,  "new_grid_out.h5");
  op_write_const_hdf5("gm1",1,"double",(char *)&gm1,  "new_grid_out.h5");
  op_write_const_hdf5("cfl",1,"double",(char *)&cfl,  "new_grid_out.h5");
  op_write_const_hdf5("eps",1,"double",(char *)&eps,  "new_grid_out.h5");
  op_write_const_hdf5("mach",1,"double",(char *)&mach,  "new_grid_out.h5");
  op_write_const_hdf5("alpha",1,"double",(char *)&alpha,  "new_grid_out.h5");
  op_write_const_hdf5("qinf",4,"double",(char *)qinf,  "new_grid_out.h5");

  //trigger partitioning and halo creation routines
  op_partition("PTSCOTCH", "KWAY", edges, pecell, p_x);
  //op_partition("PARMETIS", "KWAY", edges, pecell, p_x);

  int g_ncell = op_get_size(cells);


  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop

  niter = 1000;

  for(int iter=1; iter<=niter; iter++) {

    //  save old flow solution

    op_par_loop_save_soln("save_soln",cells,
                op_arg_dat(p_q,-1,OP_ID,4,"double",OP_READ),
                op_arg_dat(p_qold,-1,OP_ID,4,"double",OP_WRITE));

    //  predictor/corrector update loop

    for(int k=0; k<2; k++) {

      //    calculate area/timestep

      op_par_loop_adt_calc("adt_calc",cells,
                  op_arg_dat(p_x,0,pcell,2,"double",OP_READ),
                  op_arg_dat(p_x,1,pcell,2,"double",OP_READ),
                  op_arg_dat(p_x,2,pcell,2,"double",OP_READ),
                  op_arg_dat(p_x,3,pcell,2,"double",OP_READ),
                  op_arg_dat(p_q,-1,OP_ID,4,"double",OP_READ),
                  op_arg_dat(p_adt,-1,OP_ID,1,"double",OP_WRITE));

      //    calculate flux residual

      op_par_loop_res_calc("res_calc",edges,
                  op_arg_dat(p_x,0,pedge,2,"double",OP_READ),
                  op_arg_dat(p_x,1,pedge,2,"double",OP_READ),
                  op_arg_dat(p_q,0,pecell,4,"double",OP_READ),
                  op_arg_dat(p_q,1,pecell,4,"double",OP_READ),
                  op_arg_dat(p_adt,0,pecell,1,"double",OP_READ),
                  op_arg_dat(p_adt,1,pecell,1,"double",OP_READ),
                  op_arg_dat(p_res,0,pecell,4,"double",OP_INC),
                  op_arg_dat(p_res,1,pecell,4,"double",OP_INC));

      op_par_loop_bres_calc("bres_calc",bedges,
                  op_arg_dat(p_x,0,pbedge,2,"double",OP_READ),
                  op_arg_dat(p_x,1,pbedge,2,"double",OP_READ),
                  op_arg_dat(p_q,0,pbecell,4,"double",OP_READ),
                  op_arg_dat(p_adt,0,pbecell,1,"double",OP_READ),
                  op_arg_dat(p_res,0,pbecell,4,"double",OP_INC),
                  op_arg_dat(p_bound,-1,OP_ID,1,"int",OP_READ));

      //    update flow field

      rms = 0.0;

      op_par_loop_update("update",cells,
                  op_arg_dat(p_qold,-1,OP_ID,4,"double",OP_READ),
                  op_arg_dat(p_q,-1,OP_ID,4,"double",OP_WRITE),
                  op_arg_dat(p_res,-1,OP_ID,4,"double",OP_RW),
                  op_arg_dat(p_adt,-1,OP_ID,1,"double",OP_READ),
                  op_arg_gbl(&rms,1,"double",OP_INC));
    }

    //  print iteration history

    rms = sqrt(rms/(double)g_ncell);

    if (iter%100 == 0)
      op_printf(" %d  %10.5e \n",iter,rms);
  }

  op_timers(&cpu_t2, &wall_t2);

  //write the indicated segment of the given op_dat's data to a memory block,
  //in the order it was originally arranged (i.e. before partitioning and reordering)
  double* q = (double *)op_malloc(sizeof(double)*op_get_size(cells)*4);
  op_fetch_data_idx(p_q, q, 0, op_get_size(cells)-1);
  free(q);

  //write given op_dat's data to hdf5 file in the order it was originally arranged (i.e. before partitioning and reordering)
  op_fetch_data_hdf5_file(p_q, "file_name.h5");

  //printf("Root process = %d\n",op_is_root());

  //output the result dat array to files
  //op_dump_to_hdf5("new_grid_out.h5"); //writes data as it is held on each process (under MPI)

  //compress using
  // ~/hdf5/bin/h5repack -f GZIP=9 new_grid.h5 new_grid_pack.h5

  op_timing_output();
  op_printf("Max total runtime = %f\n",wall_t2-wall_t1);
  op_exit();
}
Example 6
op_plan *op_plan_core(char const *name, op_set set, int part_size, int nargs,
                      op_arg *args, int ninds, int *inds, int staging) {
  // set exec length
  int exec_length = set->size;
  for (int i = 0; i < nargs; i++) {
    if (args[i].opt && args[i].idx != -1 && args[i].acc != OP_READ) {
      exec_length += set->exec_size;
      break;
    }
  }

  /* first look for an existing execution plan */

  int ip = 0, match = 0;

  while (match == 0 && ip < OP_plan_index) {
    if ((strcmp(name, OP_plans[ip].name) == 0) && (set == OP_plans[ip].set) &&
        (nargs == OP_plans[ip].nargs) && (ninds == OP_plans[ip].ninds) &&
        (part_size == OP_plans[ip].part_size)) {
      match = 1;
      for (int m = 0; m < nargs; m++) {
        if (args[m].dat != NULL && OP_plans[ip].dats[m] != NULL)
          match = match && (args[m].dat->size == OP_plans[ip].dats[m]->size) &&
                  (args[m].dat->dim == OP_plans[ip].dats[m]->dim) &&
                  (args[m].map == OP_plans[ip].maps[m]) &&
                  (args[m].idx == OP_plans[ip].idxs[m]) &&
                  (args[m].acc == OP_plans[ip].accs[m]);
        else
          match = match && (args[m].dat == OP_plans[ip].dats[m]) &&
                  (args[m].map == OP_plans[ip].maps[m]) &&
                  (args[m].idx == OP_plans[ip].idxs[m]) &&
                  (args[m].acc == OP_plans[ip].accs[m]);
      }
    }
    ip++;
  }

  if (match) {
    ip--;
    if (OP_diags > 3)
      printf(" old execution plan #%d\n", ip);
    OP_plans[ip].count++;
    return &(OP_plans[ip]);
  } else {
    if (OP_diags > 1)
      printf(" new execution plan #%d for kernel %s\n", ip, name);
  }
  double wall_t1, wall_t2, cpu_t1, cpu_t2;
  op_timers_core(&cpu_t1, &wall_t1);
  /* work out worst case shared memory requirement per element */

  int halo_exchange = 0;
  for (int i = 0; i < nargs; i++) {
    if (args[i].opt && args[i].idx != -1 && args[i].acc != OP_WRITE &&
        args[i].acc != OP_INC) {
      halo_exchange = 1;
      break;
    }
  }

  int maxbytes = 0;
  for (int m = 0; m < nargs; m++) {
    if (args[m].opt && inds[m] >= 0) {
      if ((staging == OP_STAGE_INC && args[m].acc == OP_INC) ||
          (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE))
        maxbytes += args[m].dat->size;
    }
  }

  /* set blocksize and number of blocks; adaptive size based on 48kB of shared
   * memory */

  int bsize = part_size; // blocksize
  if (bsize == 0 && maxbytes > 0)
    bsize = MAX((24 * 1024 / (64 * maxbytes)) * 64,
                256); // 48kB exactly is too much, make it 24
  else if (bsize == 0 && maxbytes == 0)
    bsize = 256;

  // If we do 1 level of coloring, do it in one go
  if (staging == OP_COLOR2)
    bsize = exec_length;

  int nblocks = 0;

  int indirect_reduce = 0;
  for (int m = 0; m < nargs; m++) {
    indirect_reduce |=
        (args[m].acc != OP_READ && args[m].argtype == OP_ARG_GBL);
  }
  indirect_reduce &= (ninds > 0);

  /* Work out indirection arrays for OP_INCs */
  int ninds_staged = 0; // number of distinct indirect dats to be staged
  int *inds_staged = (int *)op_malloc(nargs * sizeof(int));
  int *inds_to_inds_staged = (int *)op_malloc(ninds * sizeof(int));

  for (int i = 0; i < nargs; i++)
    inds_staged[i] = -1;
  for (int i = 0; i < ninds; i++)
    inds_to_inds_staged[i] = -1;
  for (int i = 0; i < nargs; i++) {
    if (inds[i] >= 0 &&
        ((staging == OP_STAGE_INC && args[i].acc == OP_INC) ||
         (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE))) {
      if (inds_to_inds_staged[inds[i]] == -1) {
        inds_to_inds_staged[inds[i]] = ninds_staged;
        inds_staged[i] = ninds_staged;
        ninds_staged++;
      } else {
        inds_staged[i] = inds_to_inds_staged[inds[i]];
      }
    }
  }

  int *invinds_staged = (int *)op_malloc(ninds_staged * sizeof(int));
  for (int i = 0; i < ninds_staged; i++)
    invinds_staged[i] = -1;
  for (int i = 0; i < nargs; i++)
    if (inds[i] >= 0 &&
        ((staging == OP_STAGE_INC && args[i].acc == OP_INC) ||
         (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE)) &&
        invinds_staged[inds_staged[i]] == -1)
      invinds_staged[inds_staged[i]] = i;

  int prev_offset = 0;
  int next_offset = 0;

  while (next_offset < exec_length) {
    prev_offset = next_offset;
    if (prev_offset + bsize >= set->core_size && prev_offset < set->core_size) {
      next_offset = set->core_size;
    } else if (prev_offset + bsize >= set->size && prev_offset < set->size &&
               indirect_reduce) {
      next_offset = set->size;
    } else if (prev_offset + bsize >= exec_length &&
               prev_offset < exec_length) {
      next_offset = exec_length;
    } else {
      next_offset = prev_offset + bsize;
    }
    nblocks++;
  }

  // If we do 1 level of coloring, we have a single "block"
  if (staging == OP_COLOR2) {
    nblocks = 1;
    prev_offset = 0;
    next_offset = exec_length;
  }

  /* enlarge OP_plans array if needed */

  if (ip == OP_plan_max) {
    // printf("allocating more memory for OP_plans %d\n", OP_plan_max);
    OP_plan_max += 10;
    OP_plans = (op_plan *)op_realloc(OP_plans, OP_plan_max * sizeof(op_plan));
    if (OP_plans == NULL) {
      printf(" op_plan error -- error reallocating memory for OP_plans\n");
      exit(-1);
    }
  }

  /* allocate memory for new execution plan and store input arguments */

  OP_plans[ip].dats = (op_dat *)op_malloc(nargs * sizeof(op_dat));
  OP_plans[ip].idxs = (int *)op_malloc(nargs * sizeof(int));
  OP_plans[ip].optflags = (int *)op_malloc(nargs * sizeof(int));
  OP_plans[ip].maps = (op_map *)op_malloc(nargs * sizeof(op_map));
  OP_plans[ip].accs = (op_access *)op_malloc(nargs * sizeof(op_access));
  OP_plans[ip].inds_staged =
      (op_access *)op_malloc(ninds_staged * sizeof(op_access));

  OP_plans[ip].nthrcol = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].thrcol = (int *)op_malloc(exec_length * sizeof(int));
  OP_plans[ip].col_reord = (int *)op_malloc((exec_length + 16) * sizeof(int));
  OP_plans[ip].col_offsets = NULL;
  OP_plans[ip].offset = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].ind_maps = (int **)op_malloc(ninds_staged * sizeof(int *));
  OP_plans[ip].ind_offs =
      (int *)op_malloc(nblocks * ninds_staged * sizeof(int));
  OP_plans[ip].ind_sizes =
      (int *)op_malloc(nblocks * ninds_staged * sizeof(int));
  OP_plans[ip].nindirect = (int *)op_calloc(ninds, sizeof(int));
  OP_plans[ip].loc_maps = (short **)op_malloc(nargs * sizeof(short *));
  OP_plans[ip].nelems = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].ncolblk =
      (int *)op_calloc(exec_length, sizeof(int)); /* max possibly needed */
  OP_plans[ip].blkmap = (int *)op_calloc(nblocks, sizeof(int));

  int *offsets = (int *)op_malloc((ninds_staged + 1) * sizeof(int));
  offsets[0] = 0;
  for (int m = 0; m < ninds_staged; m++) {
    int count = 0;
    for (int m2 = 0; m2 < nargs; m2++)
      if (inds_staged[m2] == m)
        count++;
    offsets[m + 1] = offsets[m] + count;
  }
  OP_plans[ip].ind_map =
      (int *)op_malloc(offsets[ninds_staged] * exec_length * sizeof(int));
  for (int m = 0; m < ninds_staged; m++) {
    OP_plans[ip].ind_maps[m] = &OP_plans[ip].ind_map[exec_length * offsets[m]];
  }
  free(offsets);

  int counter = 0;
  for (int m = 0; m < nargs; m++) {
    if (inds_staged[m] >= 0)
      counter++;
    else
      OP_plans[ip].loc_maps[m] = NULL;

    OP_plans[ip].dats[m] = args[m].dat;
    OP_plans[ip].idxs[m] = args[m].idx;
    OP_plans[ip].optflags[m] = args[m].opt;
    OP_plans[ip].maps[m] = args[m].map;
    OP_plans[ip].accs[m] = args[m].acc;
  }

  OP_plans[ip].loc_map =
      (short *)op_malloc(counter * exec_length * sizeof(short));
  counter = 0;
  for (int m = 0; m < nargs; m++) {
    if (inds_staged[m] >= 0) {
      OP_plans[ip].loc_maps[m] = &OP_plans[ip].loc_map[exec_length * (counter)];
      counter++;
    }
  }

  OP_plans[ip].name = name;
  OP_plans[ip].set = set;
  OP_plans[ip].nargs = nargs;
  OP_plans[ip].ninds = ninds;
  OP_plans[ip].ninds_staged = ninds_staged;
  OP_plans[ip].part_size = part_size;
  OP_plans[ip].nblocks = nblocks;
  OP_plans[ip].ncolors_core = 0;
  OP_plans[ip].ncolors_owned = 0;
  OP_plans[ip].count = 1;
  OP_plans[ip].inds_staged = inds_staged;

  OP_plan_index++;

  /* define aliases */

  op_dat *dats = OP_plans[ip].dats;
  int *idxs = OP_plans[ip].idxs;
  op_map *maps = OP_plans[ip].maps;
  op_access *accs = OP_plans[ip].accs;

  int *offset = OP_plans[ip].offset;
  int *nelems = OP_plans[ip].nelems;
  int **ind_maps = OP_plans[ip].ind_maps;
  int *ind_offs = OP_plans[ip].ind_offs;
  int *ind_sizes = OP_plans[ip].ind_sizes;
  int *nindirect = OP_plans[ip].nindirect;

  /* allocate working arrays */
  uint **work;
  work = (uint **)op_malloc(ninds * sizeof(uint *));

  for (int m = 0; m < ninds; m++) {
    int m2 = 0;
    while (inds[m2] != m)
      m2++;
    if (args[m2].opt == 0) {
      work[m] = NULL;
      continue;
    }

    int to_size = (maps[m2]->to)->exec_size + (maps[m2]->to)->nonexec_size +
                  (maps[m2]->to)->size;
    work[m] = (uint *)op_malloc(to_size * sizeof(uint));
  }

  int *work2;
  work2 =
      (int *)op_malloc(nargs * bsize * sizeof(int)); /* max possibly needed */

  /* process set one block at a time */

  float total_colors = 0;

  prev_offset = 0;
  next_offset = 0;
  for (int b = 0; b < nblocks; b++) {
    prev_offset = next_offset;
    if (prev_offset + bsize >= set->core_size && prev_offset < set->core_size) {
      next_offset = set->core_size;
    } else if (prev_offset + bsize >= set->size && prev_offset < set->size &&
               indirect_reduce) {
      next_offset = set->size;
    } else if (prev_offset + bsize >= exec_length &&
               prev_offset < exec_length) {
      next_offset = exec_length;
    } else {
      next_offset = prev_offset + bsize;
    }

    if (staging == OP_COLOR2) {
      prev_offset = 0;
      next_offset = exec_length;
    }
    int bs = next_offset - prev_offset;

    offset[b] = prev_offset; /* offset for block */
    nelems[b] = bs;          /* size of block */

    /* loop over indirection sets */
    for (int m = 0; m < ninds; m++) {
      int m2 = 0;
      while (inds[m2] != m)
        m2++;
      int m3 = inds_staged[m2];
      if (m3 < 0)
        continue;
      if (args[m2].opt == 0) {
        if (b == 0) {
          ind_offs[m3 + b * ninds_staged] = 0;
          ind_sizes[m3 + b * ninds_staged] = 0;
        } else {
          ind_offs[m3 + b * ninds_staged] =
              ind_offs[m3 + (b - 1) * ninds_staged];
          ind_sizes[m3 + b * ninds_staged] = 0;
        }
        continue;
      }
      /* build the list of elements indirectly referenced in this block */

      int ne = 0; /* number of elements */
      for (int m2 = 0; m2 < nargs; m2++) {
        if (inds[m2] == m) {
          for (int e = prev_offset; e < next_offset; e++)
            work2[ne++] = maps[m2]->map[idxs[m2] + e * maps[m2]->dim];
        }
      }

      /* sort them, then eliminate duplicates */

      qsort(work2, ne, sizeof(int), comp);

      int nde = 0;
      int p = 0;
      while (p < ne) {
        work2[nde] = work2[p];
        while (p < ne && work2[p] == work2[nde])
          p++;
        nde++;
      }
      ne = nde; /* number of distinct elements */

      /*
      if (OP_diags > 5) {
        printf(" indirection set %d: ", m);
        for (int e = 0; e < ne; e++)
          printf(" %d", work2[e]);
        printf(" \n");
      }
      */

      /* store mapping and renumbered mappings in execution plan */

      for (int e = 0; e < ne; e++) {
        ind_maps[m3][nindirect[m]++] = work2[e];
        work[m][work2[e]] = e; // inverse mapping
      }

      for (int m2 = 0; m2 < nargs; m2++) {
        if (inds[m2] == m) {
          for (int e = prev_offset; e < next_offset; e++)
            OP_plans[ip].loc_maps[m2][e] =
                (short)(work[m][maps[m2]->map[idxs[m2] + e * maps[m2]->dim]]);
        }
      }

      if (b == 0) {
        ind_offs[m3 + b * ninds_staged] = 0;
        ind_sizes[m3 + b * ninds_staged] = nindirect[m];
      } else {
        ind_offs[m3 + b * ninds_staged] =
            ind_offs[m3 + (b - 1) * ninds_staged] +
            ind_sizes[m3 + (b - 1) * ninds_staged];
        ind_sizes[m3 + b * ninds_staged] =
            nindirect[m] - ind_offs[m3 + b * ninds_staged];
      }
    }

    /* now colour main set elements */

    for (int e = prev_offset; e < next_offset; e++)
      OP_plans[ip].thrcol[e] = -1;

    int repeat = 1;
    int ncolor = 0;
    int ncolors = 0;

    while (repeat) {
      repeat = 0;

      for (int m = 0; m < nargs; m++) {
        if (inds[m] >= 0 && args[m].opt)
          for (int e = prev_offset; e < next_offset; e++)
            work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] =
                0; /* zero out color array */
      }

      for (int e = prev_offset; e < next_offset; e++) {
        if (OP_plans[ip].thrcol[e] == -1) {
          int mask = 0;
          if (staging == OP_COLOR2 && halo_exchange && e >= set->core_size &&
              ncolor == 0)
            mask = 1;
          for (int m = 0; m < nargs; m++)
            if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
                args[m].opt)
              mask |=
                  work[inds[m]]
                      [maps[m]->map[idxs[m] +
                                    e * maps[m]->dim]]; /* set bits of mask */

          int color = ffs(~mask) - 1; /* find first bit not set */
          if (color == -1) {          /* run out of colors on this pass */
            repeat = 1;
          } else {
            OP_plans[ip].thrcol[e] = ncolor + color;
            mask = 1 << color;
            ncolors = MAX(ncolors, ncolor + color + 1);

            for (int m = 0; m < nargs; m++)
              if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
                  args[m].opt)
                work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] |=
                    mask; /* set color bit */
          }
        }
      }

      ncolor += 32; /* increment base level */
    }

    OP_plans[ip].nthrcol[b] =
        ncolors; /* number of thread colors in this block */
    total_colors += ncolors;

    // if(ncolors>1) printf(" number of colors in this block = %d \n",ncolors);
  }

  /* create element permutation by color */
  if (staging == OP_STAGE_PERMUTE || staging == OP_COLOR2) {
    int size_of_col_offsets = 0;
    for (int b = 0; b < nblocks; b++) {
      size_of_col_offsets += OP_plans[ip].nthrcol[b] + 1;
    }
    // allocate
    OP_plans[ip].col_offsets = (int **)op_malloc(nblocks * sizeof(int *));
    int *col_offsets = (int *)op_malloc(size_of_col_offsets * sizeof(int));

    size_of_col_offsets = 0;
    op_keyvalue *kv = (op_keyvalue *)op_malloc(bsize * sizeof(op_keyvalue));
    for (int b = 0; b < nblocks; b++) {
      int ncolor = OP_plans[ip].nthrcol[b];
      for (int e = 0; e < nelems[b]; e++) {
        kv[e].key = OP_plans[ip].thrcol[offset[b] + e];
        kv[e].value = e;
      }
      qsort(kv, nelems[b], sizeof(op_keyvalue), comp2);
      OP_plans[ip].col_offsets[b] = col_offsets + size_of_col_offsets;
      OP_plans[ip].col_offsets[b][0] = 0;
      size_of_col_offsets += (ncolor + 1);

      // Set up permutation and pointers to beginning of each color
      ncolor = 0;
      for (int e = 0; e < nelems[b]; e++) {
        OP_plans[ip].thrcol[offset[b] + e] = kv[e].key;
        OP_plans[ip].col_reord[offset[b] + e] = kv[e].value;
        if (e > 0)
          if (kv[e].key > kv[e - 1].key) {
            ncolor++;
            OP_plans[ip].col_offsets[b][ncolor] = e;
          }
      }
      OP_plans[ip].col_offsets[b][ncolor + 1] = nelems[b];
    }
    for (int i = exec_length; i < exec_length + 16; i++)
      OP_plans[ip].col_reord[i] = 0;
  }

  /* color the blocks, after initialising colors to 0 */

  int *blk_col;

  blk_col = (int *)op_malloc(nblocks * sizeof(int));
  for (int b = 0; b < nblocks; b++)
    blk_col[b] = -1;

  int repeat = 1;
  int ncolor = 0;
  int ncolors = 0;

  while (repeat) {
    repeat = 0;

    for (int m = 0; m < nargs; m++) {
      if (inds[m] >= 0 && args[m].opt) {
        int to_size = (maps[m]->to)->exec_size + (maps[m]->to)->nonexec_size +
                      (maps[m]->to)->size;
        for (int e = 0; e < to_size; e++)
          work[inds[m]][e] = 0; // zero out color arrays
      }
    }
    prev_offset = 0;
    next_offset = 0;
    for (int b = 0; b < nblocks; b++) {
      prev_offset = next_offset;

      if (prev_offset + bsize >= set->core_size &&
          prev_offset < set->core_size) {
        next_offset = set->core_size;
      } else if (prev_offset + bsize >= set->size && prev_offset < set->size &&
                 indirect_reduce) {
        next_offset = set->size;
      } else if (prev_offset + bsize >= exec_length &&
                 prev_offset < exec_length) {
        next_offset = exec_length;
      } else {
        next_offset = prev_offset + bsize;
      }
      if (blk_col[b] == -1) { // color not yet assigned to block
        uint mask = 0;
        if (next_offset > set->core_size) { // should not use block colors from
                                            // the core set when doing the
                                            // non_core ones
          if (prev_offset <= set->core_size)
            OP_plans[ip].ncolors_core = ncolors;
          for (int shifter = 0; shifter < OP_plans[ip].ncolors_core; shifter++)
            mask |= 1 << shifter;
          if (prev_offset == set->size && indirect_reduce)
            OP_plans[ip].ncolors_owned = ncolors;
          for (int shifter = OP_plans[ip].ncolors_core;
               indirect_reduce && shifter < OP_plans[ip].ncolors_owned;
               shifter++)
            mask |= 1 << shifter;
        }

        for (int m = 0; m < nargs; m++) {
          if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
              args[m].opt)
            for (int e = prev_offset; e < next_offset; e++)
              mask |= work[inds[m]]
                          [maps[m]->map[idxs[m] + e * maps[m]->dim]]; // set
                                                                      // bits of
                                                                      // mask
        }

        int color = ffs(~mask) - 1; // find first bit not set
        if (color == -1) {          // run out of colors on this pass
          repeat = 1;
        } else {
          blk_col[b] = ncolor + color;
          mask = 1 << color;
          ncolors = MAX(ncolors, ncolor + color + 1);

          for (int m = 0; m < nargs; m++) {
            if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
                args[m].opt)
              for (int e = prev_offset; e < next_offset; e++)
                work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] |= mask;
          }
        }
      }
    }

    ncolor += 32; // increment base level
  }

  /* store block mapping and number of blocks per color */

  if (indirect_reduce && OP_plans[ip].ncolors_owned == 0)
    OP_plans[ip].ncolors_owned =
        ncolors; // no MPI, so get the reduction arrays after everything is done
  OP_plans[ip].ncolors = ncolors;
  if (staging == OP_COLOR2)
    OP_plans[ip].ncolors = OP_plans[ip].nthrcol[0];

  /* ncolblk was allocated with op_calloc above, so it is already
     zero-initialised and needs no explicit clearing here */

  for (int b = 0; b < nblocks; b++)
    OP_plans[ip].ncolblk[blk_col[b]]++; // number of blocks of each color

  for (int c = 1; c < ncolors; c++)
    OP_plans[ip].ncolblk[c] += OP_plans[ip].ncolblk[c - 1]; // cumsum

  for (int c = 0; c < ncolors; c++)
    work2[c] = 0;

  for (int b = 0; b < nblocks; b++) {
    int c = blk_col[b];
    int b2 = work2[c]; // number of preceding blocks of this color
    if (c > 0)
      b2 += OP_plans[ip].ncolblk[c - 1]; // plus previous colors

    OP_plans[ip].blkmap[b2] = b;

    work2[c]++; // increment counter
  }

  for (int c = ncolors - 1; c > 0; c--)
    OP_plans[ip].ncolblk[c] -= OP_plans[ip].ncolblk[c - 1]; // undo cumsum

  /* reorder blocks by color? */

  /* work out shared memory requirements */
  OP_plans[ip].nsharedCol = (int *)op_malloc(ncolors * sizeof(int));
  float total_shared = 0;
  for (int col = 0; col < ncolors; col++) {
    OP_plans[ip].nsharedCol[col] = 0;
    for (int b = 0; b < nblocks; b++) {
      if (blk_col[b] == col) {
        int nbytes = 0;
        for (int m = 0; m < ninds_staged; m++) {
          int m2 = 0;
          while (inds_staged[m2] != m)
            m2++;
          if (args[m2].opt == 0)
            continue;

          nbytes +=
              ROUND_UP_64(ind_sizes[m + b * ninds_staged] * dats[m2]->size);
        }
        OP_plans[ip].nsharedCol[col] =
            MAX(OP_plans[ip].nsharedCol[col], nbytes);
        total_shared += nbytes;
      }
    }
  }

  OP_plans[ip].nshared = 0;
  total_shared = 0;

  for (int b = 0; b < nblocks; b++) {
    int nbytes = 0;
    for (int m = 0; m < ninds_staged; m++) {
      int m2 = 0;
      while (inds_staged[m2] != m)
        m2++;
      if (args[m2].opt == 0)
        continue;

      nbytes += ROUND_UP_64(ind_sizes[m + b * ninds_staged] * dats[m2]->size);
    }
    OP_plans[ip].nshared = MAX(OP_plans[ip].nshared, nbytes);
    total_shared += nbytes;
  }

  /* work out total bandwidth requirements */

  OP_plans[ip].transfer = 0;
  OP_plans[ip].transfer2 = 0;
  float transfer3 = 0;

  if (staging != OP_COLOR2 && staging != OP_STAGE_INC) {
    for (int b = 0; b < nblocks; b++) {
      for (int m = 0; m < nargs; m++) // for each argument
      {
        if (args[m].opt) {
          if (inds[m] < 0) // if it is directly addressed
          {
            float fac = 2.0f;
            if (accs[m] == OP_READ ||
                accs[m] == OP_WRITE) // if you only read or write it
              fac = 1.0f;
            if (dats[m] != NULL) {
              OP_plans[ip].transfer +=
                  fac * nelems[b] * dats[m]->size; // cost of reading it all
              OP_plans[ip].transfer2 += fac * nelems[b] * dats[m]->size;
              transfer3 += fac * nelems[b] * dats[m]->size;
            }
          } else // if it is indirectly addressed: cost of reading the pointer
                 // to it
          {
            OP_plans[ip].transfer += nelems[b] * sizeof(short);
            OP_plans[ip].transfer2 += nelems[b] * sizeof(short);
            transfer3 += nelems[b] * sizeof(short);
          }
        }
      }
      for (int m = 0; m < ninds; m++) // for each indirect mapping
      {
        int m2 = 0;
        while (inds[m2] != m) // find the first argument that uses this mapping
          m2++;
        if (args[m2].opt == 0)
          continue;
        float fac = 2.0f;
        if (accs[m2] == OP_READ || accs[m2] == OP_WRITE) // only read it
          fac = 1.0f;
        if (staging == OP_STAGE_INC && accs[m2] != OP_INC) {
          OP_plans[ip].transfer += 1;
          OP_plans[ip].transfer2 += 1;
          continue;
        }
        OP_plans[ip].transfer +=
            fac * ind_sizes[m + b * ninds] *
            dats[m2]->size; // simply read all data one by one

        /* work out how many cache lines are used by indirect addressing */

        int i_map, l_new, l_old;
        int e0 = ind_offs[m + b * ninds];       // where it starts
        int e1 = e0 + ind_sizes[m + b * ninds]; // where it ends

        l_old = -1;

        for (int e = e0; e < e1;
             e++) // iterate through every indirectly accessed data element
        {
          i_map = ind_maps[m][e]; // the pointer to the data element
          l_new = (i_map * dats[m2]->size) /
                  OP_cache_line_size; // which cache line it is on (full size,
                                      // dim*sizeof(type))
          if (l_new > l_old) // if it is on a further cache line (that is not
                             // yet loaded, - i_map is ordered)
            OP_plans[ip].transfer2 +=
                fac * OP_cache_line_size; // load the cache line
          l_old = l_new;
          l_new = ((i_map + 1) * dats[m2]->size - 1) /
                  OP_cache_line_size; // the last byte of the data
          OP_plans[ip].transfer2 += fac * (l_new - l_old) *
                                    OP_cache_line_size; // again, if not loaded,
                                                        // load it (can be
                                                        // multiple cache lines)
          l_old = l_new;
        }

        l_old = -1;

        for (int e = e0; e < e1; e++) {
          i_map = ind_maps[m][e]; // pointer to the data element
          l_new = (i_map * dats[m2]->size) /
                  (dats[m2]->dim * OP_cache_line_size); // which cache line the
                                                        // first dimension of
                                                        // the data is on
          if (l_new > l_old)
            transfer3 +=
                fac * dats[m2]->dim *
                OP_cache_line_size; // if not loaded yet, load all cache lines
          l_old = l_new;
          l_new =
              ((i_map + 1) * dats[m2]->size - 1) /
              (dats[m2]->dim * OP_cache_line_size); // primitive type's last byte
          transfer3 += fac * (l_new - l_old) * dats[m2]->dim *
                       OP_cache_line_size; // load it
          l_old = l_new;
        }

        /* also include mappings to load/store data */

        fac = 1.0f;
        if (accs[m2] == OP_RW)
          fac = 2.0f;
        OP_plans[ip].transfer += fac * ind_sizes[m + b * ninds] * sizeof(int);
        OP_plans[ip].transfer2 += fac * ind_sizes[m + b * ninds] * sizeof(int);
        transfer3 += fac * ind_sizes[m + b * ninds] * sizeof(int);
      }
    }
  }

  /* print out useful information */

  if (OP_diags > 1) {
    printf(" number of blocks       = %d \n", nblocks);
    printf(" number of block colors = %d \n", OP_plans[ip].ncolors);
    printf(" maximum block size     = %d \n", bsize);
    printf(" average thread colors  = %.2f \n", total_colors / nblocks);
    printf(" shared memory required = ");
    for (int i = 0; i < ncolors - 1; i++)
      printf(" %.2f KB,", OP_plans[ip].nsharedCol[i] / 1024.0f);
    printf(" %.2f KB\n", OP_plans[ip].nsharedCol[ncolors - 1] / 1024.0f);
    printf(" average data reuse     = %.2f \n",
           maxbytes * (exec_length / total_shared));
    printf(" data transfer (used)   = %.2f MB \n",
           OP_plans[ip].transfer / (1024.0f * 1024.0f));
    printf(" data transfer (total)  = %.2f MB \n",
           OP_plans[ip].transfer2 / (1024.0f * 1024.0f));
    printf(" SoA/AoS transfer ratio = %.2f \n\n",
           transfer3 / OP_plans[ip].transfer2);
  }

  /* validate plan info */

  op_plan_check(OP_plans[ip], ninds_staged, inds_staged);

  /* free work arrays */

  for (int m = 0; m < ninds; m++)
    free(work[m]);
  free(work);
  free(work2);
  free(blk_col);
  free(inds_to_inds_staged);
  free(invinds_staged);
  op_timers_core(&cpu_t2, &wall_t2);
  for (int i = 0; i < OP_kern_max; i++) {
    if (strcmp(name, OP_kernels[i].name) == 0) {
      OP_kernels[i].plan_time += wall_t2 - wall_t1;
      break;
    }
  }
  /* return pointer to plan */
  OP_plan_time += wall_t2 - wall_t1;
  return &(OP_plans[ip]);
}
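op_plan_core() leans on a few helpers that do not appear in this listing. The definitions below are assumptions reconstructed from the call sites: MAX picks the larger value, ROUND_UP_64 pads a byte count up to a 64-byte boundary for the shared-memory estimate, and comp/comp2 are the qsort comparators for plain ints and for op_keyvalue ordered by key.

/* Assumed helpers, sketched from how op_plan_core() uses them. */
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define ROUND_UP_64(bytes) (((bytes) + 63) & ~63) /* pad to a 64-byte multiple */

int comp(const void *a, const void *b) { /* ascending order of ints */
  return (*(const int *)a) - (*(const int *)b);
}

int comp2(const void *a, const void *b) { /* op_keyvalue ordered by key */
  const op_keyvalue *ka = (const op_keyvalue *)a;
  const op_keyvalue *kb = (const op_keyvalue *)b;
  return (ka->key > kb->key) - (ka->key < kb->key);
}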
Example 7
void op_plan_check(op_plan OP_plan, int ninds, int *inds) {
  // compute exec_length - which includes the exec halo under certain
  // conditions (MPI)
  int exec_length = OP_plan.set->size;
  for (int m = 0; m < OP_plan.nargs; m++) {
    if (OP_plan.idxs[m] != -1 &&
        OP_plan.accs[m] != OP_READ) // if it needs exchanging
    {
      exec_length += OP_plan.set->exec_size;
      break;
    }
  }

  int err, ntot;

  int nblock = 0;
  for (int col = 0; col < OP_plan.ncolors; col++) {
    nblock += OP_plan.ncolblk[col];
  }

  /*
   * check total size
   */

  int nelem = 0;
  for (int n = 0; n < nblock; n++)
    nelem += OP_plan.nelems[n];

  if (nelem != exec_length) {
    printf(" *** OP_plan_check: nelems error \n");
  } else if (OP_diags > 6) {
    printf(" *** OP_plan_check: nelems   OK \n");
  }

  /*
   * check offset and nelems are consistent
   */

  err = 0;
  ntot = 0;

  for (int n = 0; n < nblock; n++) {
    err += (OP_plan.offset[n] != ntot);
    ntot += OP_plan.nelems[n];
  }

  if (err != 0) {
    printf(" *** OP_plan_check: offset error \n");
  } else if (OP_diags > 6) {
    printf(" *** OP_plan_check: offset   OK \n");
  }

  /*
   * check blkmap permutation
   */

  int *blkmap = (int *)op_malloc(nblock * sizeof(int));
  for (int n = 0; n < nblock; n++)
    blkmap[n] = OP_plan.blkmap[n];
  qsort(blkmap, nblock, sizeof(int), comp);

  err = 0;
  for (int n = 0; n < nblock; n++)
    err += (blkmap[n] != n);

  free(blkmap);

  if (err != 0) {
    printf(" *** OP_plan_check: blkmap error \n");
  } else if (OP_diags > 6) {
    printf(" *** OP_plan_check: blkmap   OK \n");
  }

  /*
   * check ind_offs and ind_sizes are consistent
   */

  err = 0;

  for (int i = 0; i < ninds; i++) {
    ntot = 0;

    for (int n = 0; n < nblock; n++) {
      err += (OP_plan.ind_offs[i + n * ninds] != ntot);
      ntot += OP_plan.ind_sizes[i + n * ninds];
    }
  }

  if (err != 0) {
    printf(" *** OP_plan_check: ind_offs error \n");
  } else if (OP_diags > 6) {
    printf(" *** OP_plan_check: ind_offs OK \n");
  }

  /*
   * check ind_maps correctly ordered within each block
   * and indices within range
   */

  err = 0;

  for (int m = 0; m < ninds; m++) {
    int m2 = 0;
    while (inds[m2] != m)
      m2++;
    if (OP_plan.maps[m2] == NULL)
      continue; // it is a deactivated optional argument
    int halo_size = (OP_plan.maps[m2]->to)->exec_size +
                    (OP_plan.maps[m2]->to)->nonexec_size;
    int set_size = OP_plan.maps[m2]->to->size + halo_size;

    ntot = 0;

    for (int n = 0; n < nblock; n++) {
      int last = -1;
      for (int e = ntot; e < ntot + OP_plan.ind_sizes[m + n * ninds]; e++) {
        err += (OP_plan.ind_maps[m][e] <= last);
        last = OP_plan.ind_maps[m][e];
      }
      err += (last >= set_size);
      ntot += OP_plan.ind_sizes[m + n * ninds];
    }
  }

  if (err != 0) {
    printf(" *** OP_plan_check: ind_maps error \n");
  } else if (OP_diags > 6) {
    printf(" *** OP_plan_check: ind_maps OK \n");
  }

  /*
   * check maps (most likely source of errors)
   */

  err = 0;

  for (int m = 0; m < OP_plan.nargs; m++) {
    if (OP_plan.maps[m] != NULL && OP_plan.optflags[m] &&
        OP_plan.loc_maps[m] != NULL) {
      op_map map = OP_plan.maps[m];
      int m2 = inds[m];

      ntot = 0;
      for (int n = 0; n < nblock; n++) {
        for (int e = ntot; e < ntot + OP_plan.nelems[n]; e++) {
          int p_local = OP_plan.loc_maps[m][e];
          int p_global =
              OP_plan.ind_maps[m2][p_local + OP_plan.ind_offs[m2 + n * ninds]];
          err += (p_global != map->map[OP_plan.idxs[m] + e * map->dim]);
        }
        ntot += OP_plan.nelems[n];
      }
    }
  }

  if (err != 0) {
    printf(" *** OP_plan_check: %d maps error(s) \n", err);
  } else if (OP_diags > 6) {
    printf(" *** OP_plan_check: maps     OK \n");
  }

  /*
   * check thread and block coloring
   */

  return;
}
Example 8
void* OpMemGroup::NewGRO(size_t size)
{
	// This allocator will not accept allocations of zero byte sizes
	OP_ASSERT(size > 0);

	// This is assumed below
	OP_ASSERT(MEMORY_GROUP_SIZE > MEMORY_GROUP_UNUSABLE_SIZE);

	//
	// Round up to nearest MEMORY_ALIGNMENT boundary to assure correct
	// alignment of objects.
	//
	size_t mask = (~(size_t)0) - (MEMORY_ALIGNMENT - 1);
	size = (size + MEMORY_ALIGNMENT - 1) & mask;

	if ( primary_size >= size )
	{
		// Satisfy request from primary memory area
		void* ptr = (void*)primary_ptr;
		primary_ptr += size;
		primary_size -= size;
		return ptr;
	}

	if ( secondary_size >= size )
	{
		// Satisfy request from secondary memory area
		void* ptr = (void*)secondary_ptr;
		secondary_ptr += size;
		secondary_size -= size;
		return ptr;
	}

	//
	// None of the primary or secondary memory areas were large enough to
	// satisfy the request, so we have to allocate a chunk of memory from
	// somewhere else first.
	//
	// Since we need to allocate something, determine size of the
	// header for chaining the allocations, since this will be needed
	// later:
	//
	const int header_size = 
		(sizeof(void*) > MEMORY_ALIGNMENT) ? sizeof(void*) : MEMORY_ALIGNMENT;

	//
	// Maybe the request failed because the size was very large? If
	// so, create a "private" allocation for the request. The limit is
	// set at 1/6th of the memory group size (chunk size), which is
	// really just an arbitrary number:
	//
	if ( size >= (MEMORY_GROUP_SIZE/6) )
	{
		void* ptr = op_malloc(size + header_size);
		if ( ptr == 0 )
			return 0;

		// Add this allocation to the chain of all allocations
		*(void**)ptr = all;
		all = ptr;
		return (void*)(((char*)ptr) + header_size);
	}

	//
	// The request was not large, so the two areas are either full, or
	// they have not been created yet, so create a new one:
	//
	void* fresh_area = op_malloc(MEMORY_GROUP_SIZE);
	if ( fresh_area == 0 )
		return 0;

	// Add this allocation to the chain of all allocations
	*(void**)fresh_area = all;
	all = fresh_area;

	//
	// Make sure the primary area is the one with the most bytes
	// free (since this is the one to be kept, and is the one to
	// be filled up first).
	//
	if ( primary_size < secondary_size )
	{
		// Swap primary and secondary

		char* tmp_ptr = primary_ptr;
		int tmp_size = primary_size;

		primary_ptr = secondary_ptr;
		primary_size = secondary_size;

		secondary_ptr = tmp_ptr;
		secondary_size = tmp_size;
	}

	void* ptr;

	if ( primary_size < MEMORY_GROUP_UNUSABLE_SIZE )
	{
		//
		// The primary, which is the one with the most free bytes, has
		// fewer free bytes than the threshold for what is considered
		// useful. Keep it nonetheless, just in case, but demote it to
		// the secondary so that allocations are served quickly from a
		// new, fresh primary.
		//
		// This strategy will gain us something when allocations are
		// generally small in size. Profiling may reveal that this
		// optimization is not necessary.
		//
		secondary_ptr = primary_ptr;
		secondary_size = primary_size;

		// Give the fresh chunk of memory to the primary:
		primary_ptr = ((char*)fresh_area) + header_size;
		primary_size = MEMORY_GROUP_SIZE - header_size;

		// Satisfy the allocation from the fresh primary (this will
		// always work; the size is small)
		ptr = (void*)primary_ptr;
		primary_ptr += size;
		primary_size -= size;
	}
	else
	{
		//
		// The primary still has some useful free bytes, so keep it as
		// primary to increase its chance of contributing when a small
		// allocation comes along.
		//
		secondary_ptr = ((char*)fresh_area) + header_size;
		secondary_size = MEMORY_GROUP_SIZE - header_size;

		// Satisfy the allocation from the fresh secondary memory area
		// (this will always work; the size is small)
		ptr = (void*)secondary_ptr;
		secondary_ptr += size;
		secondary_size -= size;
	}

	return ptr;
}
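A hypothetical usage sketch of NewGRO(): allocations are bump-pointer served from the primary/secondary areas and there is no per-allocation free, so memory is reclaimed only when the group's chunks are released. The caller shown here, and how the OpMemGroup instance reaches it, are assumptions.

// Hypothetical caller: group-allocate a few scratch buffers.
static void use_scratch_buffers(OpMemGroup& group)
{
	char* a = static_cast<char*>(group.NewGRO(32));   // served from the primary area
	char* b = static_cast<char*>(group.NewGRO(4096)); // may pull in a fresh chunk
	if ( a != 0 && b != 0 )
	{
		a[0] = 'x';   // the returned blocks are raw, suitably aligned memory
		b[0] = 'y';
	}
	// There is no per-allocation release; the memory lives until the
	// group's chunks (the 'all' chain) are freed.
}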
Example 9
int main(int argc, char **argv)
{
  // OP initialisation
  op_init(argc,argv,2);

  int    *becell, *ecell,  *bound, *bedge, *edge, *cell;
  double  *x, *q, *qold, *adt, *res;

  int    nnode,ncell,nedge,nbedge,niter;
  double  rms;

  //timer
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  // read in grid

  op_printf("reading in grid \n");

  FILE *fp;
  if ( (fp = fopen("./new_grid.dat","r")) == NULL) {
    op_printf("can't open file new_grid.dat\n"); exit(-1);
  }

  if (fscanf(fp,"%d %d %d %d \n",&nnode, &ncell, &nedge, &nbedge) != 4) {
    op_printf("error reading from new_grid.dat\n"); exit(-1);
  }

  cell   = (int *) malloc(4*ncell*sizeof(int));
  edge   = (int *) malloc(2*nedge*sizeof(int));
  ecell  = (int *) malloc(2*nedge*sizeof(int));
  bedge  = (int *) malloc(2*nbedge*sizeof(int));
  becell = (int *) malloc(  nbedge*sizeof(int));
  bound  = (int *) malloc(  nbedge*sizeof(int));

  x      = (double *) malloc(2*nnode*sizeof(double));
  q      = (double *) malloc(4*ncell*sizeof(double));
  qold   = (double *) malloc(4*ncell*sizeof(double));
  res    = (double *) malloc(4*ncell*sizeof(double));
  adt    = (double *) malloc(  ncell*sizeof(double));

  for (int n=0; n<nnode; n++) {
    if (fscanf(fp,"%lf %lf \n",&x[2*n], &x[2*n+1]) != 2) {
      op_printf("error reading from new_grid.dat\n"); exit(-1);
    }
  }

  for (int n=0; n<ncell; n++) {
    if (fscanf(fp,"%d %d %d %d \n",&cell[4*n  ], &cell[4*n+1],
                                   &cell[4*n+2], &cell[4*n+3]) != 4) {
      op_printf("error reading from new_grid.dat\n"); exit(-1);
    }
  }

  for (int n=0; n<nedge; n++) {
    if (fscanf(fp,"%d %d %d %d \n",&edge[2*n], &edge[2*n+1],
                                   &ecell[2*n],&ecell[2*n+1]) != 4) {
      op_printf("error reading from new_grid.dat\n"); exit(-1);
    }
  }

  for (int n=0; n<nbedge; n++) {
    if (fscanf(fp,"%d %d %d %d \n",&bedge[2*n],&bedge[2*n+1],
                                   &becell[n], &bound[n]) != 4) {
      op_printf("error reading from new_grid.dat\n"); exit(-1);
    }
  }

  fclose(fp);

  // set constants and initialise flow field and residual

  op_printf("initialising flow field \n");

  gam = 1.4f;
  gm1 = gam - 1.0f;
  cfl = 0.9f;
  eps = 0.05f;

  double mach  = 0.4f;
  double alpha = 3.0f*atan(1.0f)/45.0f;
  double p     = 1.0f;
  double r     = 1.0f;
  double u     = sqrt(gam*p/r)*mach;
  double e     = p/(r*gm1) + 0.5f*u*u;

  qinf[0] = r;
  qinf[1] = r*u;
  qinf[2] = 0.0f;
  qinf[3] = r*e;

  for (int n=0; n<ncell; n++) {
    for (int m=0; m<4; m++) {
        q[4*n+m] = qinf[m];
      res[4*n+m] = 0.0f;
    }
  }

  // declare sets, pointers, datasets and global constants

  op_set nodes  = op_decl_set(nnode,  "nodes");
  op_set edges  = op_decl_set(nedge,  "edges");
  op_set bedges = op_decl_set(nbedge, "bedges");
  op_set cells  = op_decl_set(ncell,  "cells");

  op_map pedge   = op_decl_map(edges, nodes,2,edge,  "pedge");
  op_map pecell  = op_decl_map(edges, cells,2,ecell, "pecell");
  op_map pbedge  = op_decl_map(bedges,nodes,2,bedge, "pbedge");
  op_map pbecell = op_decl_map(bedges,cells,1,becell,"pbecell");
  op_map pcell   = op_decl_map(cells, nodes,4,cell,  "pcell");

  op_dat p_bound = op_decl_dat(bedges,1,"int"  ,bound,"p_bound");
  op_dat p_x     = op_decl_dat(nodes ,2,"double",x    ,"p_x");
  op_dat p_q     = op_decl_dat(cells ,4,"double",q    ,"p_q");
  op_dat p_qold  = op_decl_dat(cells ,4,"double",qold ,"p_qold");
  op_dat p_adt   = op_decl_dat(cells ,1,"double",adt  ,"p_adt");
  op_dat p_res   = op_decl_dat(cells ,4,"double",res  ,"p_res");

  op_decl_const(1,"double",&gam  );
  op_decl_const(1,"double",&gm1  );
  op_decl_const(1,"double",&cfl  );
  op_decl_const(1,"double",&eps  );
  op_decl_const(1,"double",&mach );
  op_decl_const(1,"double",&alpha);
  op_decl_const(4,"double",qinf  );

  op_diagnostic_output();

  //initialise timers for total execution wall time
  op_timers(&cpu_t1, &wall_t1);

  // main time-marching loop
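  // In the op_par_loop calls below, each op_arg_dat takes (dat, map index,
  // map, dim, type, access mode); an index of -1 with OP_ID means direct,
  // un-mapped access to the set the loop iterates over.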

  niter = 1000;

  for(int iter=1; iter<=niter; iter++) {

    // save old flow solution

    op_par_loop(save_soln,"save_soln", cells,
      op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_READ ),
      op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_WRITE));

    // predictor/corrector update loop

    for(int k=0; k<2; k++) {

      // calculate area/timestep

      op_par_loop(adt_calc,"adt_calc",cells,
          op_arg_dat(p_x,   0,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,   1,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,   2,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_x,   3,pcell, 2,"double",OP_READ ),
          op_arg_dat(p_q,  -1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_adt,-1,OP_ID, 1,"double",OP_WRITE));

      // calculate flux residual

      op_par_loop(res_calc,"res_calc",edges,
          op_arg_dat(p_x,    0,pedge, 2,"double",OP_READ),
          op_arg_dat(p_x,    1,pedge, 2,"double",OP_READ),
          op_arg_dat(p_q,    0,pecell,4,"double",OP_READ),
          op_arg_dat(p_q,    1,pecell,4,"double",OP_READ),
          op_arg_dat(p_adt,  0,pecell,1,"double",OP_READ),
          op_arg_dat(p_adt,  1,pecell,1,"double",OP_READ),
          op_arg_dat(p_res,  0,pecell,4,"double",OP_INC ),
          op_arg_dat(p_res,  1,pecell,4,"double",OP_INC ));

      op_par_loop(bres_calc,"bres_calc",bedges,
          op_arg_dat(p_x,     0,pbedge, 2,"double",OP_READ),
          op_arg_dat(p_x,     1,pbedge, 2,"double",OP_READ),
          op_arg_dat(p_q,     0,pbecell,4,"double",OP_READ),
          op_arg_dat(p_adt,   0,pbecell,1,"double",OP_READ),
          op_arg_dat(p_res,   0,pbecell,4,"double",OP_INC ),
          op_arg_dat(p_bound,-1,OP_ID  ,1,"int",  OP_READ));

      // update flow field

      rms = 0.0;
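      // rms is passed via op_arg_gbl(..., OP_INC), i.e. as a global reduction:
      // the per-cell contributions are summed before control returns here
      // (and combined across processes in the MPI builds).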

      op_par_loop(update,"update",cells,
          op_arg_dat(p_qold,-1,OP_ID, 4,"double",OP_READ ),
          op_arg_dat(p_q,   -1,OP_ID, 4,"double",OP_WRITE),
          op_arg_dat(p_res, -1,OP_ID, 4,"double",OP_RW   ),
          op_arg_dat(p_adt, -1,OP_ID, 1,"double",OP_READ ),
          op_arg_gbl(&rms,1,"double",OP_INC));
    }

    // print iteration history
    rms = sqrt(rms/(double) op_get_size(cells));
    if (iter%100 == 0)
      op_printf(" %d  %10.5e \n",iter,rms);
    if (iter%1000 == 0 && ncell == 720000){ // default mesh -- for validation testing
      //op_printf(" %d  %3.16lf \n",iter,rms);
      float diff=fabs((100.0*(rms/0.0001060114637578))-100.0);
      op_printf("\n\nTest problem with %d cells is within %3.15E %% of the expected solution\n",720000, diff);
      if(diff < 0.00001) {
        op_printf("This test is considered PASSED\n");
      }
      else {
        op_printf("This test is considered FAILED\n");
      }
    }
  }

  op_timers(&cpu_t2, &wall_t2);

  //output the result dat array to files
  op_print_dat_to_txtfile(p_q, "out_grid_seq.dat"); // ASCII
  op_print_dat_to_binfile(p_q, "out_grid_seq.bin"); //Binary

  //write the indicated segment of the given op_dat's data to a memory block in the
  //order it was originally arranged (i.e. before partitioning and reordering)
  double* q_part = (double *)op_malloc(sizeof(double)*op_get_size(cells)*4);
  op_fetch_data_idx(p_q, q_part, 0, op_get_size(cells)-1);
  free(q_part);

  op_timing_output();
  op_printf("Max total runtime = %f\n",wall_t2-wall_t1);

  op_exit();

  free(cell);
  free(edge);
  free(ecell);
  free(bedge);
  free(becell);
  free(bound);
  free(x);
  free(q);
  free(qold);
  free(res);
  free(adt);
}
Ejemplo n.º 10
int main(int argc, char **argv) {
  // OP initialisation
  op_init(argc, argv, 2);

  // MPI for user I/O
  int my_rank;
  int comm_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

  int *becell, *ecell, *bound, *bedge, *edge, *cell;
  double *x, *q, *qold, *adt, *res;

  int nnode, ncell, nedge, nbedge;

  // set constants

  op_printf("initialising flow field\n");
  gam = 1.4f;
  gm1 = gam - 1.0f;
  cfl = 0.9f;
  eps = 0.05f;

  double mach = 0.4f;
  double alpha = 3.0f * atan(1.0f) / 45.0f;
  double p = 1.0f;
  double r = 1.0f;
  double u = sqrt(gam * p / r) * mach;
  double e = p / (r * gm1) + 0.5f * u * u;

  qinf[0] = r;
  qinf[1] = r * u;
  qinf[2] = 0.0f;
  qinf[3] = r * e;

  /**------------------------BEGIN  I/O -------------------**/

  char file[] = "new_grid.dat";
  char file_out[] = "new_grid_out.h5";

  /* read in grid from disk on root processor */
  FILE *fp;

  if ((fp = fopen(file, "r")) == NULL) {
    op_printf("can't open file %s\n", file);
    exit(-1);
  }

  int g_nnode, g_ncell, g_nedge, g_nbedge;

  check_scan(
      fscanf(fp, "%d %d %d %d \n", &g_nnode, &g_ncell, &g_nedge, &g_nbedge), 4);

  int *g_becell = 0, *g_ecell = 0, *g_bound = 0, *g_bedge = 0, *g_edge = 0,
      *g_cell = 0;
  double *g_x = 0, *g_q = 0, *g_qold = 0, *g_adt = 0, *g_res = 0;

  op_printf("reading in grid \n");
  op_printf("Global number of nodes, cells, edges, bedges = %d, %d, %d, %d\n",
            g_nnode, g_ncell, g_nedge, g_nbedge);

  if (my_rank == MPI_ROOT) {
    g_cell = (int *)op_malloc(4 * g_ncell * sizeof(int));
    g_edge = (int *)op_malloc(2 * g_nedge * sizeof(int));
    g_ecell = (int *)op_malloc(2 * g_nedge * sizeof(int));
    g_bedge = (int *)op_malloc(2 * g_nbedge * sizeof(int));
    g_becell = (int *)op_malloc(g_nbedge * sizeof(int));
    g_bound = (int *)op_malloc(g_nbedge * sizeof(int));

    g_x = (double *)op_malloc(2 * g_nnode * sizeof(double));
    g_q = (double *)op_malloc(4 * g_ncell * sizeof(double));
    g_qold = (double *)op_malloc(4 * g_ncell * sizeof(double));
    g_res = (double *)op_malloc(4 * g_ncell * sizeof(double));
    g_adt = (double *)op_malloc(g_ncell * sizeof(double));

    for (int n = 0; n < g_nnode; n++) {
      check_scan(fscanf(fp, "%lf %lf \n", &g_x[2 * n], &g_x[2 * n + 1]), 2);
    }

    for (int n = 0; n < g_ncell; n++) {
      check_scan(fscanf(fp, "%d %d %d %d \n", &g_cell[4 * n],
                        &g_cell[4 * n + 1], &g_cell[4 * n + 2],
                        &g_cell[4 * n + 3]),
                 4);
    }

    for (int n = 0; n < g_nedge; n++) {
      check_scan(fscanf(fp, "%d %d %d %d \n", &g_edge[2 * n],
                        &g_edge[2 * n + 1], &g_ecell[2 * n],
                        &g_ecell[2 * n + 1]),
                 4);
    }

    for (int n = 0; n < g_nbedge; n++) {
      check_scan(fscanf(fp, "%d %d %d %d \n", &g_bedge[2 * n],
                        &g_bedge[2 * n + 1], &g_becell[n], &g_bound[n]),
                 4);
    }

    // initialise flow field and residual

    for (int n = 0; n < g_ncell; n++) {
      for (int m = 0; m < 4; m++) {
        g_q[4 * n + m] = qinf[m];
        g_res[4 * n + m] = 0.0f;
      }
    }
  }

  fclose(fp);

  nnode = compute_local_size(g_nnode, comm_size, my_rank);
  ncell = compute_local_size(g_ncell, comm_size, my_rank);
  nedge = compute_local_size(g_nedge, comm_size, my_rank);
  nbedge = compute_local_size(g_nbedge, comm_size, my_rank);
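  /* compute_local_size is assumed to block-partition each global set over the
     MPI ranks: roughly g_size/comm_size elements per rank, with the first
     g_size%comm_size ranks taking one extra element, e.g.

       int compute_local_size(int g_size, int comm_size, int my_rank) {
         int l_size = g_size / comm_size;
         if (my_rank < g_size % comm_size) l_size++;
         return l_size;
       }
  */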

  op_printf(
      "Number of nodes, cells, edges, bedges on process %d = %d, %d, %d, %d\n",
      my_rank, nnode, ncell, nedge, nbedge);

  /*Allocate memory to hold local sets, mapping tables and data*/
  cell = (int *)op_malloc(4 * ncell * sizeof(int));
  edge = (int *)op_malloc(2 * nedge * sizeof(int));
  ecell = (int *)op_malloc(2 * nedge * sizeof(int));
  bedge = (int *)op_malloc(2 * nbedge * sizeof(int));
  becell = (int *)op_malloc(nbedge * sizeof(int));
  bound = (int *)op_malloc(nbedge * sizeof(int));

  x = (double *)op_malloc(2 * nnode * sizeof(double));
  q = (double *)op_malloc(4 * ncell * sizeof(double));
  qold = (double *)op_malloc(4 * ncell * sizeof(double));
  res = (double *)op_malloc(4 * ncell * sizeof(double));
  adt = (double *)op_malloc(ncell * sizeof(double));

  /* scatter sets, mappings and data on sets*/
  scatter_int_array(g_cell, cell, comm_size, g_ncell, ncell, 4);
  scatter_int_array(g_edge, edge, comm_size, g_nedge, nedge, 2);
  scatter_int_array(g_ecell, ecell, comm_size, g_nedge, nedge, 2);
  scatter_int_array(g_bedge, bedge, comm_size, g_nbedge, nbedge, 2);
  scatter_int_array(g_becell, becell, comm_size, g_nbedge, nbedge, 1);
  scatter_int_array(g_bound, bound, comm_size, g_nbedge, nbedge, 1);

  scatter_double_array(g_x, x, comm_size, g_nnode, nnode, 2);
  scatter_double_array(g_q, q, comm_size, g_ncell, ncell, 4);
  scatter_double_array(g_qold, qold, comm_size, g_ncell, ncell, 4);
  scatter_double_array(g_res, res, comm_size, g_ncell, ncell, 4);
  scatter_double_array(g_adt, adt, comm_size, g_ncell, ncell, 1);

  /* Freeing memory allocated to global arrays on rank 0
     after scattering to all processes */
  if (my_rank == MPI_ROOT) {
    free(g_cell);
    free(g_edge);
    free(g_ecell);
    free(g_bedge);
    free(g_becell);
    free(g_bound);
    free(g_x);
    free(g_q);
    free(g_qold);
    free(g_adt);
    free(g_res);
  }

  /**------------------------END I/O  -----------------------**/

  /* FIXME: It's not clear to the compiler that something is going on behind
     the scenes here, hence these variables are reported as unused */

  op_set nodes = op_decl_set(nnode, "nodes");
  op_set edges = op_decl_set(nedge, "edges");
  op_set bedges = op_decl_set(nbedge, "bedges");
  op_set cells = op_decl_set(ncell, "cells");

  op_map pedge = op_decl_map(edges, nodes, 2, edge, "pedge");
  op_map pecell = op_decl_map(edges, cells, 2, ecell, "pecell");
  op_map pbedge = op_decl_map(bedges, nodes, 2, bedge, "pbedge");
  op_map pbecell = op_decl_map(bedges, cells, 1, becell, "pbecell");
  op_map pcell = op_decl_map(cells, nodes, 4, cell, "pcell");

  op_dat p_bound = op_decl_dat(bedges, 1, "int", bound, "p_bound");
  op_dat p_x = op_decl_dat(nodes, 2, "double", x, "p_x");
  op_dat p_q = op_decl_dat(cells, 4, "double", q, "p_q");
  op_dat p_qold = op_decl_dat(cells, 4, "double", qold, "p_qold");
  op_dat p_adt = op_decl_dat(cells, 1, "double", adt, "p_adt");
  op_dat p_res = op_decl_dat(cells, 4, "double", res, "p_res");

  /* Test out creating a dataset within a nested path in an HDF5 file
     -- remove when the correct Airfoil mesh needs to be created */
  op_dat p_x_test =
      op_decl_dat(nodes, 2, "double", x, "/group3/group2/group1/p_x_test");
  op_map pedge_test = op_decl_map(edges, nodes, 2, edge, "/group3/pedge_test");

  op_decl_const(1, "double", &gam);
  op_decl_const(1, "double", &gm1);
  op_decl_const(1, "double", &cfl);
  op_decl_const(1, "double", &eps);
  op_decl_const(1, "double", &mach);
  op_decl_const(1, "double", &alpha);
  op_decl_const(4, "double", qinf);
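
  /* op_partition(lib_name, lib_routine, prime_set, prime_map, coords) selects
     the distributed partitioning; here PT-Scotch k-way partitioning is driven
     by the edges set, the edges->cells map and the node coordinates. */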

  op_partition("PTSCOTCH", "KWAY", edges, pecell, p_x);

  /* Test functionality of fetching data of an op_dat to an HDF5 file*/
  op_fetch_data_hdf5_file(p_x_test, "test.h5");

  char name[128];
  int time = 0;
  sprintf(name, "states_%07i", (int)(time * 100.0));
  op_fetch_data_hdf5_file_path(p_x_test, "test.h5", name);
  sprintf(name, "/results/states_%07i", (int)(time * 100.0));
  op_fetch_data_hdf5_file_path(p_x_test, "test.h5", name);

  /* Test functionality of dumping all the sets,maps and dats to an HDF5 file*/
  op_dump_to_hdf5(file_out);
  op_write_const_hdf5("gam", 1, "double", (char *)&gam, "new_grid_out.h5");
  op_write_const_hdf5("gm1", 1, "double", (char *)&gm1, "new_grid_out.h5");
  op_write_const_hdf5("cfl", 1, "double", (char *)&cfl, "new_grid_out.h5");
  op_write_const_hdf5("eps", 1, "double", (char *)&eps, "new_grid_out.h5");
  op_write_const_hdf5("mach", 1, "double", (char *)&mach, "new_grid_out.h5");
  op_write_const_hdf5("alpha", 1, "double", (char *)&alpha, "new_grid_out.h5");
  op_write_const_hdf5("qinf", 4, "double", (char *)qinf, "new_grid_out.h5");

  // create halos - for sanity check
  op_halo_create();

  op_exit();
}
Ejemplo n.º 11
void* WindowsOpThreadTools::Allocate(size_t size)
{
    return op_malloc(size);
}
Ejemplo n.º 12
RepList::RepList(int n) {
    dat = (replentry **) op_malloc(sizeof(replentry *) * n);
    if (dat == 0) size = 0; else size = n;
    pos = 0;
}