/*
 * Host stub for the indirect parallel loop "bres_calc" (slot 3 of OP_kernels).
 *
 * name  - kernel name; stored in the timing record and passed to op_plan_get
 * set   - iteration set; the plan is built and executed only if set->size > 0
 * arg0  - first argument; it is expanded below into two entries of args[]
 *         (map indices 0 and 1), which is why five op_arg parameters produce
 *         nargs == 6
 * arg2..arg5 - remaining arguments, copied into args[2..5] unchanged
 *
 * Sequence: call op_mpi_halo_exchanges, fetch the plan, run the blocks of each
 * colour inside an OpenMP parallel-for, call op_mpi_set_dirtybit, then add the
 * elapsed wall time and the plan's transfer counters to OP_kernels[3].
 */
void op_par_loop_bres_calc(char const *name, op_set set,
                           op_arg arg0,
                           op_arg arg2,
                           op_arg arg3,
                           op_arg arg4,
                           op_arg arg5 ){

  int    nargs = 6;
  op_arg args[6];

  // expand arg0 into one args[] entry per map index (0 and 1)
  arg0.idx = 0;
  args[0] = arg0;
  for (int v = 1; v < 2; v++) {
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "float", OP_READ);
  }
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // inds[i] is the indirection slot used by args[i]; -1 marks args[5] as
  // having none (both entries derived from arg0 share slot 0)
  int    ninds   = 4;
  int    inds[6] = {0,0,1,2,3,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_3
    int part_size = OP_PART_SIZE_3;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(3);
  OP_kernels[3].name      = name;
  OP_kernels[3].count    += 1;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // the clock starts after the plan has been obtained
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    int block_offset = 0;

    for (int col=0; col < Plan->ncolors; col++) {
      // NOTE(review): op_mpi_wait_all is reached only when some colour index
      // equals Plan->ncolors_core; for an empty set, or if that index is never
      // hit, it is not called at all - confirm nothing can still be pending.
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }

      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
        op_x86_bres_calc( blockIdx,
                          (float *)arg0.data,
                          (float *)arg2.data,
                          (float *)arg3.data,
                          (float *)arg4.data,
                          Plan->ind_map,
                          Plan->loc_map,
                          (int *)arg5.data,
                          Plan->ind_sizes,
                          Plan->ind_offs,
                          block_offset,
                          Plan->blkmap,
                          Plan->offset,
                          Plan->nelems,
                          Plan->nthrcol,
                          Plan->thrcol,
                          set_size);
      }

      // blocks of the next colour follow the ones just executed
      block_offset += nblocks;
    }

    op_timing_realloc(3);
    OP_kernels[3].transfer  += Plan->transfer;
    OP_kernels[3].transfer2 += Plan->transfer2;
  } else {
    // Nothing was executed. Start the clock here so the record below receives
    // a (near-zero) elapsed time; previously wall_t1 stayed 0 and the raw
    // wall_t2 reading was added to the kernel time.
    op_timers_core(&cpu_t1, &wall_t1);
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].time     += wall_t2 - wall_t1;
}
/*
 * Host stub for the indirect parallel loop "bres_calc" (slot 3 of OP_kernels).
 *
 * name       - kernel name; passed to op_plan_get and stored in the timing
 *              record
 * set        - iteration set the plan is built for
 * arg0..arg5 - the six loop arguments, collected into args[] in order
 *
 * Sequence: fetch the plan, run the blocks of each colour inside an OpenMP
 * parallel-for, then update OP_kernels[3] with the call count, elapsed wall
 * time and the plan's transfer counters. This variant makes no halo-exchange
 * calls.
 */
void op_par_loop_bres_calc(char const *name, op_set set,
                           op_arg arg0,
                           op_arg arg1,
                           op_arg arg2,
                           op_arg arg3,
                           op_arg arg4,
                           op_arg arg5 ){

  int    nargs   = 6;
  op_arg args[6] = {arg0,arg1,arg2,arg3,arg4,arg5};

  // inds[i] is the indirection slot used by args[i]; -1 marks args[5] as
  // having none (args[0] and args[1] share slot 0)
  int    ninds   = 4;
  int    inds[6] = {0,0,1,2,3,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc \n");
  }

  // get plan
  #ifdef OP_PART_SIZE_3
    int part_size = OP_PART_SIZE_3;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers; the clock starts after the plan has been obtained
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // (the unused local thread-count query that used to sit here was removed;
  // its value was never read)

  // execute plan
  int block_offset = 0;

  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];

    #pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++) {
      op_x86_bres_calc( blockIdx,
                        (float *)arg0.data, Plan->ind_maps[0],
                        (float *)arg2.data, Plan->ind_maps[1],
                        (float *)arg3.data, Plan->ind_maps[2],
                        (float *)arg4.data, Plan->ind_maps[3],
                        Plan->loc_maps[0],
                        Plan->loc_maps[1],
                        Plan->loc_maps[2],
                        Plan->loc_maps[3],
                        Plan->loc_maps[4],
                        (int *)arg5.data,
                        Plan->ind_sizes,
                        Plan->ind_offs,
                        block_offset,
                        Plan->blkmap,
                        Plan->offset,
                        Plan->nelems,
                        Plan->nthrcol,
                        Plan->thrcol);
    }

    // blocks of the next colour follow the ones just executed
    block_offset += nblocks;
  }

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(3);
  OP_kernels[3].name      = name;
  OP_kernels[3].count    += 1;
  OP_kernels[3].time     += wall_t2 - wall_t1;
  OP_kernels[3].transfer  += Plan->transfer;
  OP_kernels[3].transfer2 += Plan->transfer2;
}