// host stub function
void op_par_loop_bres_calc_cpu(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(3);
  op_timers_core(&cpu_t1, &wall_t1);

  int  ninds   = 4;
  int  inds[6] = {0,0,1,2,3,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_3
    int part_size = OP_PART_SIZE_3;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){
        int blockId  = Plan->blkmap[blockIdx + block_offset];
        int nelem    = Plan->nelems[blockId];
        int offset_b = Plan->offset[blockId];
        for ( int n=offset_b; n<offset_b+nelem; n++ ){
          int map0idx = arg0.map_data[n * arg0.map->dim + 0];
          int map1idx = arg0.map_data[n * arg0.map->dim + 1];
          int map2idx = arg2.map_data[n * arg2.map->dim + 0];

          bres_calc(
            &((double*)arg0.data)[2 * map0idx],
            &((double*)arg0.data)[2 * map1idx],
            &((double*)arg2.data)[4 * map2idx],
            &((double*)arg3.data)[1 * map2idx],
            &((double*)arg4.data)[4 * map2idx],
            &((int*)arg5.data)[1 * n]);
        }
      }

      block_offset += nblocks;
    }
    OP_kernels[3].transfer  += Plan->transfer;
    OP_kernels[3].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].name      = name;
  OP_kernels[3].count    += 1;
  OP_kernels[3].time     += wall_t2 - wall_t1;
}
void op_par_loop_adt_calc(char const *name, op_set set,                 
  op_arg arg0,                                                          
  op_arg arg1,                                                          
  op_arg arg2,                                                          
  op_arg arg3,                                                          
  op_arg arg4,                                                          
  op_arg arg5 ){                                                        
                                                                       
  int nargs   = 6;                                                   
  op_arg args[6] = {arg0,arg1,arg2,arg3,arg4,arg5};                     
                                                                        
  int    ninds   = 1;                                                   
  int    inds[6] = {0,0,0,0,-1,-1};   
  
  int sent[6] = {0,0,0,0,0,0}; 
               
  if(ninds > 0) //indirect loop
  {
      for(int i = 0; i<nargs; i++)
      {
      	  if(args[i].argtype == OP_ARG_DAT)
      	  {
      	      if (OP_diags==1) reset_halo(args[i]);
      	      sent[0] = exchange_halo(args[i]); 
      	      if(sent[0] == 1)wait_all(args[i]);
      	  }
      }
  }
  
  if (OP_diags>2) {                                                     
    printf(" kernel routine with indirection: adt_calc \n");            
  }                                                                     
                                                                        
  // get plan                                                           
                                                                        
  #ifdef OP_PART_SIZE_1                                                 
    int part_size = OP_PART_SIZE_1;                                     
  #else                                                                 
    int part_size = OP_part_size;                                       
  #endif                                                                
                 
  
  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
                                                                        
  // initialise timers                                                  
                                                                        
  double cpu_t1, cpu_t2, wall_t1, wall_t2;                              
  op_timers(&cpu_t1, &wall_t1);                                         
                                                                        
  // set number of threads                                              
                                                                        
#ifdef _OPENMP                                                          
  int nthreads = omp_get_max_threads( );                                
#else                                                                   
  int nthreads = 1;                                                     
#endif                                                                  
                                                                        
  // execute plan                                                       
                                                                        
  int block_offset = 0;                                                 
                                                                        
  for (int col=0; col < Plan->ncolors; col++) {                         
    int nblocks = Plan->ncolblk[col];                                   
                                                                        
#pragma omp parallel for                                                
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)                  
     op_x86_adt_calc( blockIdx,                                         
       (double *)arg0.data, Plan->ind_maps[0],                           
       Plan->loc_maps[0],                                               
       Plan->loc_maps[1],                                               
       Plan->loc_maps[2],                                               
       Plan->loc_maps[3],                                               
       (double *)arg4.data,                                              
       (double *)arg5.data,                                              
       Plan->ind_sizes,                                                 
       Plan->ind_offs,                                                  
       block_offset,                                                    
       Plan->blkmap,                                                    
       Plan->offset,                                                    
       Plan->nelems,                                                    
       Plan->nthrcol,                                                   
       Plan->thrcol);                                                   
                                                                        
    block_offset += nblocks;                                            
  }             
  
  
  //set dirty bit on direct/indirect datasets with access OP_INC,OP_WRITE, OP_RW
  for(int i = 0; i<nargs; i++)
      if(args[i].argtype == OP_ARG_DAT)
      	set_dirtybit(args[i]);
  
  //performe any global operations
  // - NONE
  
                                                                        
  // update kernel record                                               
                                                                        
  op_timers(&cpu_t2, &wall_t2);                                         
  op_timing_realloc(1);                                                 
  OP_kernels[1].name      = name;                                       
  OP_kernels[1].count    += 1;                                          
  OP_kernels[1].time     += wall_t2 - wall_t1;                          
  OP_kernels[1].transfer  += Plan->transfer;                            
  OP_kernels[1].transfer2 += Plan->transfer2;                           
}                                                                       
Example #3
0
void op_par_loop_res_calc(char const *name, op_set set,                 
  op_arg arg0,                                                          
  op_arg arg1,                                                          
  op_arg arg2,                                                          
  op_arg arg3,                                                          
  op_arg arg4,                                                          
  op_arg arg5,                                                          
  op_arg arg6,                                                          
  op_arg arg7 ){                                                        
                                                                        
                                                                        
  int    nargs   = 8;                                                   
  op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7};           
                                                                        
  int    ninds   = 4;                                                   
  int    inds[8] = {0,0,1,1,2,2,3,3};                                   
                                                                        
  if (OP_diags>2) {                                                     
    printf(" kernel routine with indirection: res_calc \n");            
  }                                                                     
                                                                        
  // get plan                                                           
                                                                        
  #ifdef OP_PART_SIZE_2                                                 
    int part_size = OP_PART_SIZE_2;                                     
  #else                                                                 
    int part_size = OP_part_size;                                       
  #endif                                                                
                                                                        
  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
                                                                        
  // initialise timers                                                  
                                                                        
  double cpu_t1, cpu_t2, wall_t1, wall_t2;                              
  op_timers_core(&cpu_t1, &wall_t1);                                         
                                                                        
  // set number of threads                                              
                                                                        
#ifdef _OPENMP                                                          
  int nthreads = omp_get_max_threads( );                                
#else                                                                   
  int nthreads = 1;                                                     
#endif                                                                  
                                                                        
  // execute plan                                                       
                                                                        
  int block_offset = 0;                                                 
                                                                        
  for (int col=0; col < Plan->ncolors; col++) {                         
    int nblocks = Plan->ncolblk[col];                                   
                                                                        
#pragma omp parallel for                                                
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)                  
     op_x86_res_calc( blockIdx,                                         
       (double *)arg0.data, Plan->ind_maps[0],                          
       (double *)arg2.data, Plan->ind_maps[1],                          
       (double *)arg4.data, Plan->ind_maps[2],                          
       (double *)arg6.data, Plan->ind_maps[3],                          
       Plan->loc_maps[0],                                               
       Plan->loc_maps[1],                                               
       Plan->loc_maps[2],                                               
       Plan->loc_maps[3],                                               
       Plan->loc_maps[4],                                               
       Plan->loc_maps[5],                                               
       Plan->loc_maps[6],                                               
       Plan->loc_maps[7],                                               
       Plan->ind_sizes,                                                 
       Plan->ind_offs,                                                  
       block_offset,                                                    
       Plan->blkmap,                                                    
       Plan->offset,                                                    
       Plan->nelems,                                                    
       Plan->nthrcol,                                                   
       Plan->thrcol);                                                   
                                                                        
    block_offset += nblocks;                                            
  }                                                                     
                                                                        
  // combine reduction data                                             
                                                                        
  // update kernel record                                               
                                                                        
  op_timers_core(&cpu_t2, &wall_t2);                                         
  op_timing_realloc(2);                                                 
  OP_kernels[2].name      = name;                                       
  OP_kernels[2].count    += 1;                                          
  OP_kernels[2].time     += wall_t2 - wall_t1;                          
  OP_kernels[2].transfer  += Plan->transfer;                            
  OP_kernels[2].transfer2 += Plan->transfer2;                           
}                                                                       
Example #4
0
void op_par_loop_res_calc(char const *name, op_set set,
                          op_arg arg0,
                          op_arg arg1,
                          op_arg arg2,
                          op_arg arg3,
                          op_arg arg4,
                          op_arg arg5,
                          op_arg arg6,
                          op_arg arg7 ) {


    int    nargs   = 8;
    op_arg args[8];

    args[0] = arg0;
    args[1] = arg1;
    args[2] = arg2;
    args[3] = arg3;
    args[4] = arg4;
    args[5] = arg5;
    args[6] = arg6;
    args[7] = arg7;

    int    ninds   = 4;
    int    inds[8] = {0,0,1,1,2,2,3,3};

    if (OP_diags>2) {
        printf(" kernel routine with indirection: res_calc\n");
    }

    // get plan

#ifdef OP_PART_SIZE_2
    int part_size = OP_PART_SIZE_2;
#else
    int part_size = OP_part_size;
#endif

    int set_size = op_mpi_halo_exchanges(set, nargs, args);

    // initialise timers

    double cpu_t1, cpu_t2, wall_t1, wall_t2;
    op_timers_core(&cpu_t1, &wall_t1);

    if (set->size >0) {


        op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
        // execute plan

        int block_offset = 0;

        for (int col=0; col < Plan->ncolors; col++) {
            if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

            int nblocks = Plan->ncolblk[col];

            #pragma omp parallel for
            for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
                op_x86_res_calc( blockIdx,
                                 (double *)arg0.data,
                                 (double *)arg2.data,
                                 (double *)arg4.data,
                                 (double *)arg6.data,
                                 Plan->ind_map,
                                 Plan->loc_map,
                                 Plan->ind_sizes,
                                 Plan->ind_offs,
                                 block_offset,
                                 Plan->blkmap,
                                 Plan->offset,
                                 Plan->nelems,
                                 Plan->nthrcol,
                                 Plan->thrcol,
                                 set_size);

            block_offset += nblocks;
        }

        op_timing_realloc(2);
        OP_kernels[2].transfer  += Plan->transfer;
        OP_kernels[2].transfer2 += Plan->transfer2;

    }


    // combine reduction data

    op_mpi_set_dirtybit(nargs, args);

    // update kernel record

    op_timers_core(&cpu_t2, &wall_t2);
    op_timing_realloc(2);
    OP_kernels[2].name      = name;
    OP_kernels[2].count    += 1;
    OP_kernels[2].time     += wall_t2 - wall_t1;
}
void op_par_loop_SpaceDiscretization(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6 ){


  int    nargs   = 8;
  op_arg args[8];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;
  arg6.idx = 0;
  args[6] = arg6;
  for (int v = 1; v < 2; v++) {
    args[6 + v] = op_arg_dat(arg6.dat, v, arg6.map, 1, "float", OP_READ);
  }

  int    ninds   = 2;
  int    inds[8] = {0,0,-1,-1,-1,-1,1,1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: SpaceDiscretization\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_18
    int part_size = OP_PART_SIZE_18;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(18);
  OP_kernels[18].name      = name;
  OP_kernels[18].count    += 1;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_SpaceDiscretization( blockIdx,
         (float *)arg0.data,
         (float *)arg6.data,
         Plan->ind_map,
         Plan->loc_map,
         (float *)arg2.data,
         (float *)arg3.data,
         (float *)arg4.data,
         (int *)arg5.data,
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);

      block_offset += nblocks;
    }

  op_timing_realloc(18);
  OP_kernels[18].transfer  += Plan->transfer;
  OP_kernels[18].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[18].time     += wall_t2 - wall_t1;
}
Example #6
0
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9 ){


  int    nargs   = 13;
  op_arg args[13];

  arg0.idx = 0;
  args[0] = arg0;
  for (int v = 1; v < 4; v++) {
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }
  arg4.idx = 0;
  args[4] = arg4;
  for (int v = 1; v < 4; v++) {
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }
  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for (int v = 1; v < 4; v++) {
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  int    ninds   = 3;
  int    inds[13] = {0,0,0,0,1,1,1,1,-1,2,2,2,2};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {


    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res_calc( blockIdx,
         (double *)arg0.data,
         (double *)arg4.data,
         (double *)arg9.data,
         Plan->ind_map,
         Plan->loc_map,
         (double *)arg8.data,
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);

      block_offset += nblocks;
    }

  op_timing_realloc(0);
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
Example #7
0
void op_par_loop_res(char const *name, op_set set,                      
  op_arg arg0,                                                          
  op_arg arg1,                                                          
  op_arg arg2,                                                          
  op_arg arg3 ){                                                        
                                                                        
  float *arg3h = (float *)arg3.data;                                    
                                                                        
  int    nargs   = 4;                                                   
  op_arg args[4] = {arg0,arg1,arg2,arg3};                               
                                                                        
  int    ninds   = 2;                                                   
  int    inds[4] = {-1,0,1,-1};                                         
                                                                        
  if (OP_diags>2) {                                                     
    printf(" kernel routine with indirection: res \n");                 
  }                                                                     
                                                                        
  // get plan                                                           
                                                                        
  #ifdef OP_PART_SIZE_0                                                 
    int part_size = OP_PART_SIZE_0;                                     
  #else                                                                 
    int part_size = OP_part_size;                                       
  #endif                                                                
                                                                        
  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
                                                                        
  // initialise timers                                                  
                                                                        
  double cpu_t1, cpu_t2, wall_t1, wall_t2;                              
  op_timers(&cpu_t1, &wall_t1);                                         
                                                                        
  // set number of threads                                              
                                                                        
#ifdef _OPENMP                                                          
  int nthreads = omp_get_max_threads( );                                
#else                                                                   
  int nthreads = 1;                                                     
#endif                                                                  
                                                                        
  // execute plan                                                       
                                                                        
  int block_offset = 0;                                                 
                                                                        
  for (int col=0; col < Plan->ncolors; col++) {                         
    int nblocks = Plan->ncolblk[col];                                   
                                                                        
#pragma omp parallel for                                                
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)                  
     op_x86_res( blockIdx,                                              
       (float *)arg1.data, Plan->ind_maps[0],                           
       (float *)arg2.data, Plan->ind_maps[1],                           
       (float *)arg0.data,                                              
       Plan->loc_maps[1],                                               
       Plan->loc_maps[2],                                               
       (float *)arg3.data,                                              
       Plan->ind_sizes,                                                 
       Plan->ind_offs,                                                  
       block_offset,                                                    
       Plan->blkmap,                                                    
       Plan->offset,                                                    
       Plan->nelems,                                                    
       Plan->nthrcol,                                                   
       Plan->thrcol);                                                   
                                                                        
    block_offset += nblocks;                                            
  }                                                                     
                                                                        
  // update kernel record                                               
                                                                        
  op_timers(&cpu_t2, &wall_t2);                                         
  op_timing_realloc(0);                                                 
  OP_kernels[0].name      = name;                                       
  OP_kernels[0].count    += 1;                                          
  OP_kernels[0].time     += wall_t2 - wall_t1;                          
  OP_kernels[0].transfer  += Plan->transfer;                            
  OP_kernels[0].transfer2 += Plan->transfer2;                           
}                                                                       
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  int *arg1h = (int *)arg1.data;

  int    nargs   = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  int    ninds   = 1;
  int    inds[2] = {0,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // allocate and initialise arrays for global reduction

  int arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++) arg1_l[d+thr*64]=ZERO_int;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan

    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res_calc( blockIdx,
         (double *)arg0.data,
         Plan->ind_map,
         Plan->loc_map,
         &arg1_l[64*omp_get_thread_num()],
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);


  // combine reduction data
    if (col == Plan->ncolors_owned-1) {
      for (int thr=0; thr<nthreads; thr++)
        for(int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64];
    }

      block_offset += nblocks;
    }

  op_timing_realloc(0);
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_reduce(&arg1,arg1h);

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
Example #9
0
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  float *arg3h = (float *)arg3.data;

  int    nargs   = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  int    ninds   = 2;
  int    inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {


  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
  // execute plan

  int block_offset = 0;

  for (int col=0; col < Plan->ncolors; col++) {
    if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

    int nblocks = Plan->ncolblk[col];

#pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res( blockIdx,
         (float *)arg1.data,
         (float *)arg2.data,
         Plan->ind_map,
         Plan->loc_map,
         (float *)arg0.data,
         (float *)arg3.data,
         Plan->ind_sizes,
         Plan->ind_offs,
         block_offset,
         Plan->blkmap,
         Plan->offset,
         Plan->nelems,
         Plan->nthrcol,
         Plan->thrcol,
         set_size);

    block_offset += nblocks;
  }

  op_timing_realloc(0);
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;

  }


  // combine reduction data

  op_mpi_set_dirtybit(nargs, args);

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
}
void bres_calc_host(const char *userSubroutine,op_set set,op_arg opDat1,op_arg opDat2,op_arg opDat3,op_arg opDat4,op_arg opDat5,op_arg opDat6)
{
  size_t blocksPerGrid;
  size_t threadsPerBlock;
  size_t totalThreadNumber;
  size_t dynamicSharedMemorySize;
  cl_int errorCode;
  cl_event event;
  cl_kernel kernelPointer;
  int i3;
  op_arg opDatArray[6];
  int indirectionDescriptorArray[6];
  op_plan *planRet;
  int blockOffset;
  opDatArray[0] = opDat1;
  opDatArray[1] = opDat2;
  opDatArray[2] = opDat3;
  opDatArray[3] = opDat4;
  opDatArray[4] = opDat5;
  opDatArray[5] = opDat6;
  indirectionDescriptorArray[0] = 0;
  indirectionDescriptorArray[1] = 0;
  indirectionDescriptorArray[2] = 1;
  indirectionDescriptorArray[3] = 2;
  indirectionDescriptorArray[4] = 3;
  indirectionDescriptorArray[5] = -1;
  planRet = op_plan_get(userSubroutine,set,setPartitionSize_bres_calc,6,opDatArray,4,indirectionDescriptorArray);
  cl_mem gm1_d;
  gm1_d = op_allocate_constant(&gm1,sizeof(float ));
  cl_mem qinf_d;
  qinf_d = op_allocate_constant(&qinf,4 * sizeof(float));
  cl_mem eps_d;
  eps_d = op_allocate_constant(&eps,sizeof(float ));
  blockOffset = 0;
  double cpu_t1;
  double cpu_t2;
  double wall_t1;
op_timers(&cpu_t1, &wall_t1);
  double wall_t2;
  for (i3 = 0; i3 < planRet -> ncolors; ++i3) {
    blocksPerGrid = planRet -> ncolblk[i3];
    dynamicSharedMemorySize = planRet -> nshared;
    threadsPerBlock = threadsPerBlockSize_bres_calc;
    totalThreadNumber = threadsPerBlock * blocksPerGrid;
    kernelPointer = getKernel("bres_calc_kernel");
    errorCode = clSetKernelArg(kernelPointer,0,sizeof(cl_mem ),&opDat1.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,1,sizeof(cl_mem ),&opDat3.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,2,sizeof(cl_mem ),&opDat4.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,3,sizeof(cl_mem ),&opDat5.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,4,sizeof(cl_mem ),&opDat6.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,5,sizeof(cl_mem ),&planRet -> ind_maps[0]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,6,sizeof(cl_mem ),&planRet -> ind_maps[1]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,7,sizeof(cl_mem ),&planRet -> ind_maps[2]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,8,sizeof(cl_mem ),&planRet -> ind_maps[3]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,9,sizeof(cl_mem ),&planRet -> loc_maps[0]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,10,sizeof(cl_mem ),&planRet -> loc_maps[1]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,11,sizeof(cl_mem ),&planRet -> loc_maps[2]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,12,sizeof(cl_mem ),&planRet -> loc_maps[3]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,13,sizeof(cl_mem ),&planRet -> loc_maps[4]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,14,sizeof(cl_mem ),&planRet -> ind_sizes);
    errorCode = errorCode | clSetKernelArg(kernelPointer,15,sizeof(cl_mem ),&planRet -> ind_offs);
    errorCode = errorCode | clSetKernelArg(kernelPointer,16,sizeof(cl_mem ),&planRet -> blkmap);
    errorCode = errorCode | clSetKernelArg(kernelPointer,17,sizeof(cl_mem ),&planRet -> offset);
    errorCode = errorCode | clSetKernelArg(kernelPointer,18,sizeof(cl_mem ),&planRet -> nelems);
    errorCode = errorCode | clSetKernelArg(kernelPointer,19,sizeof(cl_mem ),&planRet -> nthrcol);
    errorCode = errorCode | clSetKernelArg(kernelPointer,20,sizeof(cl_mem ),&planRet -> thrcol);
    errorCode = errorCode | clSetKernelArg(kernelPointer,21,sizeof(int ),&blockOffset);
    errorCode = errorCode | clSetKernelArg(kernelPointer,22,dynamicSharedMemorySize,NULL);
    errorCode = errorCode | clSetKernelArg(kernelPointer,23,sizeof(cl_mem ),&gm1_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,24,sizeof(cl_mem ),&qinf_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,25,sizeof(cl_mem ),&eps_d);
    assert_m(errorCode == CL_SUCCESS,"Error setting OpenCL kernel arguments");
    errorCode = clEnqueueNDRangeKernel(cqCommandQueue,kernelPointer,1,NULL,&totalThreadNumber,&threadsPerBlock,0,NULL,&event);
    assert_m(errorCode == CL_SUCCESS,"Error executing OpenCL kernel");
    errorCode = clFinish(cqCommandQueue);
    assert_m(errorCode == CL_SUCCESS,"Error completing device command queue");
    blockOffset += blocksPerGrid;
  }
op_timers(&cpu_t2, &wall_t2);
op_timing_realloc(0);
  OP_kernels[1].name = userSubroutine;
  OP_kernels[1].count = OP_kernels[1].count + 1;
}