// host stub function void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7){ int nargs = 8; op_arg args[8]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; args[5] = arg5; args[6] = arg6; args[7] = arg7; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(2); op_timers_core(&cpu_t1, &wall_t1); int ninds = 4; int inds[8] = {0,0,1,1,2,2,3,3}; if (OP_diags>2) { printf(" kernel routine with indirection: res_calc\n"); } // get plan #ifdef OP_PART_SIZE_2 int part_size = OP_PART_SIZE_2; #else int part_size = OP_part_size; #endif int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); // execute plan int block_offset = 0; for ( int col=0; col<Plan->ncolors; col++ ){ if (col==Plan->ncolors_core) { op_mpi_wait_all(nargs, args); } int nblocks = Plan->ncolblk[col]; #pragma omp parallel for for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){ int blockId = Plan->blkmap[blockIdx + block_offset]; int nelem = Plan->nelems[blockId]; int offset_b = Plan->offset[blockId]; for ( int n=offset_b; n<offset_b+nelem; n++ ){ int map0idx = arg0.map_data[n * arg0.map->dim + 0]; int map1idx = arg0.map_data[n * arg0.map->dim + 1]; int map2idx = arg2.map_data[n * arg2.map->dim + 0]; int map3idx = arg2.map_data[n * arg2.map->dim + 1]; res_calc( &((double*)arg0.data)[2 * map0idx], &((double*)arg0.data)[2 * map1idx], &((double*)arg2.data)[4 * map2idx], &((double*)arg2.data)[4 * map3idx], &((double*)arg4.data)[1 * map2idx], &((double*)arg4.data)[1 * map3idx], &((double*)arg6.data)[4 * map2idx], &((double*)arg6.data)[4 * map3idx]); } } block_offset += nblocks; } OP_kernels[2].transfer += Plan->transfer; OP_kernels[2].transfer2 += Plan->transfer2; } if (set_size == 0 || set_size == set->core_size) { op_mpi_wait_all(nargs, args); } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[2].name = name; OP_kernels[2].count += 1; OP_kernels[2].time += wall_t2 - wall_t1; }
void op_x86_res_calc( int blockIdx, double *ind_arg0, int *ind_arg0_maps, double *ind_arg1, int *ind_arg1_maps, double *ind_arg2, int *ind_arg2_maps, double *ind_arg3, int *ind_arg3_maps, short *arg0_maps, short *arg1_maps, short *arg2_maps, short *arg3_maps, short *arg4_maps, short *arg5_maps, short *arg6_maps, short *arg7_maps, int *ind_arg_sizes, int *ind_arg_offs, int block_offset, int *blkmap, int *offset, int *nelems, int *ncolors, int *colors) { double arg6_l[4]; double arg7_l[4]; double *arg0_vec[2]; double *arg1_vec[2]; double *arg2_vec[2]; double *arg3_vec[2] = { arg6_l, arg7_l }; int *ind_arg0_map, ind_arg0_size; int *ind_arg1_map, ind_arg1_size; int *ind_arg2_map, ind_arg2_size; int *ind_arg3_map, ind_arg3_size; double *ind_arg0_s; double *ind_arg1_s; double *ind_arg2_s; double *ind_arg3_s; int nelem, offset_b; char shared[128000];// 64000]; //this size should not be staticly fixed if (0==0) { // get sizes and shift pointers and direct-mapped data int blockId = blkmap[blockIdx + block_offset]; nelem = nelems[blockId]; offset_b = offset[blockId]; ind_arg0_size = ind_arg_sizes[0+blockId*4]; ind_arg1_size = ind_arg_sizes[1+blockId*4]; ind_arg2_size = ind_arg_sizes[2+blockId*4]; ind_arg3_size = ind_arg_sizes[3+blockId*4]; ind_arg0_map = ind_arg0_maps + ind_arg_offs[0+blockId*4]; ind_arg1_map = ind_arg1_maps + ind_arg_offs[1+blockId*4]; ind_arg2_map = ind_arg2_maps + ind_arg_offs[2+blockId*4]; ind_arg3_map = ind_arg3_maps + ind_arg_offs[3+blockId*4]; // set shared memory pointers int nbytes = 0; ind_arg0_s = (double *) &shared[nbytes]; nbytes += ROUND_UP(ind_arg0_size*sizeof(double)*2); ind_arg1_s = (double *) &shared[nbytes]; nbytes += ROUND_UP(ind_arg1_size*sizeof(double)*4); ind_arg2_s = (double *) &shared[nbytes]; nbytes += ROUND_UP(ind_arg2_size*sizeof(double)*1); ind_arg3_s = (double *) &shared[nbytes]; } // copy indirect datasets into shared memory or zero increment for (int n=0; n<ind_arg0_size; n++) for (int d=0; d<2; d++) ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2]; for (int n=0; n<ind_arg1_size; n++) for (int d=0; d<4; d++) ind_arg1_s[d+n*4] = ind_arg1[d+ind_arg1_map[n]*4]; for (int n=0; n<ind_arg2_size; n++) for (int d=0; d<1; d++) ind_arg2_s[d+n*1] = ind_arg2[d+ind_arg2_map[n]*1]; for (int n=0; n<ind_arg3_size; n++) for (int d=0; d<4; d++) ind_arg3_s[d+n*4] = ZERO_double; // process set elements for (int n=0; n<nelem; n++) { // initialise local variables for (int d=0; d<4; d++) arg6_l[d] = ZERO_double; for (int d=0; d<4; d++) arg7_l[d] = ZERO_double; arg0_vec[0] = ind_arg0_s+arg0_maps[n+offset_b]*2; arg0_vec[1] = ind_arg0_s+arg1_maps[n+offset_b]*2; arg1_vec[0] = ind_arg1_s+arg2_maps[n+offset_b]*4; arg1_vec[1] = ind_arg1_s+arg3_maps[n+offset_b]*4; arg2_vec[0] = ind_arg2_s+arg4_maps[n+offset_b]*1; arg2_vec[1] = ind_arg2_s+arg5_maps[n+offset_b]*1; // user-supplied kernel call res_calc( arg0_vec, arg1_vec, arg2_vec, arg3_vec); // store local variables int arg6_map = arg6_maps[n+offset_b]; int arg7_map = arg7_maps[n+offset_b]; for (int d=0; d<4; d++) ind_arg3_s[d+arg6_map*4] += arg6_l[d]; for (int d=0; d<4; d++) ind_arg3_s[d+arg7_map*4] += arg7_l[d]; } // apply pointered write/increment for (int n=0; n<ind_arg3_size; n++) for (int d=0; d<4; d++) ind_arg3[d+ind_arg3_map[n]*4] += ind_arg3_s[d+n*4]; }
// host stub function void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0, op_arg arg4, op_arg arg8, op_arg arg9){ int nargs = 13; op_arg args[13]; arg0.idx = 0; args[0] = arg0; for ( int v=1; v<4; v++ ){ args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ); } arg4.idx = 0; args[4] = arg4; for ( int v=1; v<4; v++ ){ args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ); } args[8] = arg8; arg9.idx = 0; args[9] = arg9; for ( int v=1; v<4; v++ ){ args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC); } // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(0); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine with indirection: res_calc\n"); } int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { for ( int n=0; n<set_size; n++ ){ if (n==set->core_size) { op_mpi_wait_all(nargs, args); } int map0idx = arg0.map_data[n * arg0.map->dim + 0]; int map1idx = arg0.map_data[n * arg0.map->dim + 1]; int map2idx = arg0.map_data[n * arg0.map->dim + 2]; int map3idx = arg0.map_data[n * arg0.map->dim + 3]; const double* arg0_vec[] = { &((double*)arg0.data)[2 * map0idx], &((double*)arg0.data)[2 * map1idx], &((double*)arg0.data)[2 * map2idx], &((double*)arg0.data)[2 * map3idx]}; const double* arg4_vec[] = { &((double*)arg4.data)[1 * map0idx], &((double*)arg4.data)[1 * map1idx], &((double*)arg4.data)[1 * map2idx], &((double*)arg4.data)[1 * map3idx]}; double* arg9_vec[] = { &((double*)arg9.data)[1 * map0idx], &((double*)arg9.data)[1 * map1idx], &((double*)arg9.data)[1 * map2idx], &((double*)arg9.data)[1 * map3idx]}; res_calc( arg0_vec, arg4_vec, &((double*)arg8.data)[16 * n], arg9_vec); } } if (set_size == 0 || set_size == set->core_size) { op_mpi_wait_all(nargs, args); } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].name = name; OP_kernels[0].count += 1; OP_kernels[0].time += wall_t2 - wall_t1; OP_kernels[0].transfer += (float)set->size * arg0.size; OP_kernels[0].transfer += (float)set->size * arg4.size; OP_kernels[0].transfer += (float)set->size * arg9.size * 2.0f; OP_kernels[0].transfer += (float)set->size * arg8.size; OP_kernels[0].transfer += (float)set->size * arg0.map->dim * 4.0f; }
void op_x86_res_calc( int blockIdx, double *ind_arg0, double *ind_arg1, double *ind_arg2, double *ind_arg3, int *ind_map, short *arg_map, int *ind_arg_sizes, int *ind_arg_offs, int block_offset, int *blkmap, int *offset, int *nelems, int *ncolors, int *colors, int set_size) { double arg6_l[4]; double arg7_l[4]; int *ind_arg0_map, ind_arg0_size; int *ind_arg1_map, ind_arg1_size; int *ind_arg2_map, ind_arg2_size; int *ind_arg3_map, ind_arg3_size; double *ind_arg0_s; double *ind_arg1_s; double *ind_arg2_s; double *ind_arg3_s; int nelem, offset_b; char shared[128000]; if (0==0) { // get sizes and shift pointers and direct-mapped data int blockId = blkmap[blockIdx + block_offset]; nelem = nelems[blockId]; offset_b = offset[blockId]; ind_arg0_size = ind_arg_sizes[0+blockId*4]; ind_arg1_size = ind_arg_sizes[1+blockId*4]; ind_arg2_size = ind_arg_sizes[2+blockId*4]; ind_arg3_size = ind_arg_sizes[3+blockId*4]; ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*4]; ind_arg1_map = &ind_map[2*set_size] + ind_arg_offs[1+blockId*4]; ind_arg2_map = &ind_map[4*set_size] + ind_arg_offs[2+blockId*4]; ind_arg3_map = &ind_map[6*set_size] + ind_arg_offs[3+blockId*4]; // set shared memory pointers int nbytes = 0; ind_arg0_s = (double *) &shared[nbytes]; nbytes += ROUND_UP(ind_arg0_size*sizeof(double)*2); ind_arg1_s = (double *) &shared[nbytes]; nbytes += ROUND_UP(ind_arg1_size*sizeof(double)*4); ind_arg2_s = (double *) &shared[nbytes]; nbytes += ROUND_UP(ind_arg2_size*sizeof(double)*1); ind_arg3_s = (double *) &shared[nbytes]; } // copy indirect datasets into shared memory or zero increment for (int n=0; n<ind_arg0_size; n++) for (int d=0; d<2; d++) ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2]; for (int n=0; n<ind_arg1_size; n++) for (int d=0; d<4; d++) ind_arg1_s[d+n*4] = ind_arg1[d+ind_arg1_map[n]*4]; for (int n=0; n<ind_arg2_size; n++) for (int d=0; d<1; d++) ind_arg2_s[d+n*1] = ind_arg2[d+ind_arg2_map[n]*1]; for (int n=0; n<ind_arg3_size; n++) for (int d=0; d<4; d++) ind_arg3_s[d+n*4] = ZERO_double; // process set elements for (int n=0; n<nelem; n++) { // initialise local variables for (int d=0; d<4; d++) arg6_l[d] = ZERO_double; for (int d=0; d<4; d++) arg7_l[d] = ZERO_double; // user-supplied kernel call res_calc( ind_arg0_s+arg_map[0*set_size+n+offset_b]*2, ind_arg0_s+arg_map[1*set_size+n+offset_b]*2, ind_arg1_s+arg_map[2*set_size+n+offset_b]*4, ind_arg1_s+arg_map[3*set_size+n+offset_b]*4, ind_arg2_s+arg_map[4*set_size+n+offset_b]*1, ind_arg2_s+arg_map[5*set_size+n+offset_b]*1, arg6_l, arg7_l ); // store local variables int arg6_map = arg_map[6*set_size+n+offset_b]; int arg7_map = arg_map[7*set_size+n+offset_b]; for (int d=0; d<4; d++) ind_arg3_s[d+arg6_map*4] += arg6_l[d]; for (int d=0; d<4; d++) ind_arg3_s[d+arg7_map*4] += arg7_l[d]; } // apply pointered write/increment for (int n=0; n<ind_arg3_size; n++) for (int d=0; d<4; d++) ind_arg3[d+ind_arg3_map[n]*4] += ind_arg3_s[d+n*4]; }
// host stub function void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7){ int nargs = 8; op_arg args[8]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; args[5] = arg5; args[6] = arg6; args[7] = arg7; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(2); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine with indirection: res_calc\n"); } int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { for ( int n=0; n<set_size; n++ ){ if (n==set->core_size) { op_mpi_wait_all(nargs, args); } int map0idx = arg0.map_data[n * arg0.map->dim + 0]; int map1idx = arg0.map_data[n * arg0.map->dim + 1]; int map2idx = arg2.map_data[n * arg2.map->dim + 0]; int map3idx = arg2.map_data[n * arg2.map->dim + 1]; res_calc( &((float*)arg0.data)[2 * map0idx], &((float*)arg0.data)[2 * map1idx], &((float*)arg2.data)[4 * map2idx], &((float*)arg2.data)[4 * map3idx], &((float*)arg4.data)[1 * map2idx], &((float*)arg4.data)[1 * map3idx], &((float*)arg6.data)[4 * map2idx], &((float*)arg6.data)[4 * map3idx]); } } if (set_size == 0 || set_size == set->core_size) { op_mpi_wait_all(nargs, args); } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[2].name = name; OP_kernels[2].count += 1; OP_kernels[2].time += wall_t2 - wall_t1; OP_kernels[2].transfer += (float)set->size * arg0.size; OP_kernels[2].transfer += (float)set->size * arg2.size; OP_kernels[2].transfer += (float)set->size * arg4.size; OP_kernels[2].transfer += (float)set->size * arg6.size * 2.0f; OP_kernels[2].transfer += (float)set->size * arg0.map->dim * 4.0f; OP_kernels[2].transfer += (float)set->size * arg2.map->dim * 4.0f; }
void op_x86_res_calc( int blockIdx, double *ind_arg0, int *ind_arg0_maps, double *ind_arg1, int *ind_arg1_maps, double *ind_arg2, int *ind_arg2_maps, double *ind_arg3, int *ind_arg3_maps, short *arg0_maps, short *arg1_maps, short *arg2_maps, short *arg3_maps, short *arg4_maps, short *arg5_maps, short *arg6_maps, short *arg7_maps, int *ind_arg_sizes, int *ind_arg_offs, int block_offset, int *blkmap, int *offset, int *nelems, int *ncolors, int *colors) { double arg6_l[4]; double arg7_l[4]; int *ind_arg0_map, ind_arg0_size; int *ind_arg1_map, ind_arg1_size; int *ind_arg2_map, ind_arg2_size; int *ind_arg3_map, ind_arg3_size; double *ind_arg0_s; double *ind_arg1_s; double *ind_arg2_s; double *ind_arg3_s; int nelems2, ncolor; int nelem, offset_b; char shared[64000]; if (0==0) { // get sizes and shift pointers and direct-mapped data int blockId = blkmap[blockIdx + block_offset]; nelem = nelems[blockId]; offset_b = offset[blockId]; nelems2 = nelem; ncolor = ncolors[blockId]; ind_arg0_size = ind_arg_sizes[0+blockId*4]; ind_arg1_size = ind_arg_sizes[1+blockId*4]; ind_arg2_size = ind_arg_sizes[2+blockId*4]; ind_arg3_size = ind_arg_sizes[3+blockId*4]; ind_arg0_map = ind_arg0_maps + ind_arg_offs[0+blockId*4]; ind_arg1_map = ind_arg1_maps + ind_arg_offs[1+blockId*4]; ind_arg2_map = ind_arg2_maps + ind_arg_offs[2+blockId*4]; ind_arg3_map = ind_arg3_maps + ind_arg_offs[3+blockId*4]; // set shared memory pointers int nbytes = 0; ind_arg0_s = (double *) &shared[nbytes]; nbytes += ROUND_UP(ind_arg0_size*sizeof(double)*2); ind_arg1_s = (double *) &shared[nbytes]; nbytes += ROUND_UP(ind_arg1_size*sizeof(double)*4); ind_arg2_s = (double *) &shared[nbytes]; nbytes += ROUND_UP(ind_arg2_size*sizeof(double)*1); ind_arg3_s = (double *) &shared[nbytes]; } __syncthreads(); // make sure all of above completed // copy indirect datasets into shared memory or zero increment for (int n=0; n<ind_arg0_size; n++) for (int d=0; d<2; d++) ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2]; for (int n=0; n<ind_arg1_size; n++) for (int d=0; d<4; d++) ind_arg1_s[d+n*4] = ind_arg1[d+ind_arg1_map[n]*4]; for (int n=0; n<ind_arg2_size; n++) for (int d=0; d<1; d++) ind_arg2_s[d+n*1] = ind_arg2[d+ind_arg2_map[n]*1]; for (int n=0; n<ind_arg3_size; n++) for (int d=0; d<4; d++) ind_arg3_s[d+n*4] = ZERO_double; __syncthreads(); // process set elements for (int n=0; n<nelems2; n++) { int col2 = -1; if (n<nelem) { // initialise local variables for (int d=0; d<4; d++) arg6_l[d] = ZERO_double; for (int d=0; d<4; d++) arg7_l[d] = ZERO_double; // user-supplied kernel call res_calc( ind_arg0_s+arg0_maps[n+offset_b]*2, ind_arg0_s+arg1_maps[n+offset_b]*2, ind_arg1_s+arg2_maps[n+offset_b]*4, ind_arg1_s+arg3_maps[n+offset_b]*4, ind_arg2_s+arg4_maps[n+offset_b]*1, ind_arg2_s+arg5_maps[n+offset_b]*1, arg6_l, arg7_l ); col2 = colors[n+offset_b]; } // store local variables int arg6_map = arg6_maps[n+offset_b]; int arg7_map = arg7_maps[n+offset_b]; for (int col=0; col<ncolor; col++) { if (col2==col) { for (int d=0; d<4; d++) ind_arg3_s[d+arg6_map*4] += arg6_l[d]; for (int d=0; d<4; d++) ind_arg3_s[d+arg7_map*4] += arg7_l[d]; } __syncthreads(); } } // apply pointered write/increment for (int n=0; n<ind_arg3_size; n++) for (int d=0; d<4; d++) ind_arg3[d+ind_arg3_map[n]*4] += ind_arg3_s[d+n*4]; }
void op_x86_res_calc( int blockIdx, double *ind_arg0, int *ind_map, short *arg_map, int *arg1, int *ind_arg_sizes, int *ind_arg_offs, int block_offset, int *blkmap, int *offset, int *nelems, int *ncolors, int *colors, int set_size) { double arg0_l[4]; int *ind_arg0_map, ind_arg0_size; double *ind_arg0_s; int nelem, offset_b; char shared[128000]; if (0==0) { // get sizes and shift pointers and direct-mapped data int blockId = blkmap[blockIdx + block_offset]; nelem = nelems[blockId]; offset_b = offset[blockId]; ind_arg0_size = ind_arg_sizes[0+blockId*1]; ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*1]; // set shared memory pointers int nbytes = 0; ind_arg0_s = (double *) &shared[nbytes]; } // copy indirect datasets into shared memory or zero increment for (int n=0; n<ind_arg0_size; n++) for (int d=0; d<4; d++) ind_arg0_s[d+n*4] = ZERO_double; // process set elements for (int n=0; n<nelem; n++) { // initialise local variables for (int d=0; d<4; d++) arg0_l[d] = ZERO_double; // user-supplied kernel call res_calc( arg0_l, arg1 ); // store local variables int arg0_map = arg_map[0*set_size+n+offset_b]; for (int d=0; d<4; d++) ind_arg0_s[d+arg0_map*4] += arg0_l[d]; } // apply pointered write/increment for (int n=0; n<ind_arg0_size; n++) for (int d=0; d<4; d++) ind_arg0[d+ind_arg0_map[n]*4] += ind_arg0_s[d+n*4]; }
// host stub function void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7){ int nargs = 8; op_arg args[8]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; args[5] = arg5; args[6] = arg6; args[7] = arg7; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(2); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[2].name = name; OP_kernels[2].count += 1; int ninds = 4; int inds[8] = {0,0,1,1,2,2,3,3}; if (OP_diags>2) { printf(" kernel routine with indirection: res_calc\n"); } // get plan #ifdef OP_PART_SIZE_2 int part_size = OP_PART_SIZE_2; #else int part_size = OP_part_size; #endif int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); int ncolors = 0; if (set->size >0) { //Set up typed device pointers for OpenACC int *map0 = arg0.map_data_d; int *map2 = arg2.map_data_d; double *data0 = (double *)arg0.data_d; double *data2 = (double *)arg2.data_d; double *data4 = (double *)arg4.data_d; double *data6 = (double *)arg6.data_d; op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2); ncolors = Plan->ncolors; int *col_reord = Plan->col_reord; int set_size1 = set->size + set->exec_size; // execute plan for ( int col=0; col<Plan->ncolors; col++ ){ if (col==1) { op_mpi_wait_all_cuda(nargs, args); } int start = Plan->col_offsets[0][col]; int end = Plan->col_offsets[0][col+1]; #pragma acc parallel loop independent deviceptr(col_reord,map0,map2,data0,data2,data4,data6) for ( int e=start; e<end; e++ ){ int n = col_reord[e]; int map0idx = map0[n + set_size1 * 0]; int map1idx = map0[n + set_size1 * 1]; int map2idx = map2[n + set_size1 * 0]; int map3idx = map2[n + set_size1 * 1]; res_calc(&data0[2 * map0idx], &data0[2 * map1idx], &data2[4 * map2idx], &data2[4 * map3idx], &data4[1 * map2idx], &data4[1 * map3idx], &data6[4 * map2idx], &data6[4 * map3idx]); } } OP_kernels[2].transfer += Plan->transfer; OP_kernels[2].transfer2 += Plan->transfer2; } if (set_size == 0 || set_size == set->core_size || ncolors == 1) { op_mpi_wait_all_cuda(nargs, args); } // combine reduction data op_mpi_set_dirtybit_cuda(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[2].time += wall_t2 - wall_t1; }
// host stub function void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0, op_arg arg4, op_arg arg8, op_arg arg9, op_arg arg13){ int nargs = 17; op_arg args[17]; arg0.idx = 0; args[0] = arg0; for ( int v=1; v<4; v++ ){ args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ); } arg4.idx = 0; args[4] = arg4; for ( int v=1; v<4; v++ ){ args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ); } args[8] = arg8; arg9.idx = 0; args[9] = arg9; for ( int v=1; v<4; v++ ){ args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW); } arg13.idx = 0; args[13] = arg13; for ( int v=1; v<4; v++ ){ args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC); } // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(0); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[0].name = name; OP_kernels[0].count += 1; int ninds = 4; int inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3}; if (OP_diags>2) { printf(" kernel routine with indirection: res_calc\n"); } // get plan #ifdef OP_PART_SIZE_0 int part_size = OP_PART_SIZE_0; #else int part_size = OP_part_size; #endif int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); int ncolors = 0; if (set->size >0) { if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) { opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0); opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST; } if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) { direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8); direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST; } //Set up typed device pointers for OpenACC int *map0 = arg0.map_data_d; double* data8 = (double*)arg8.data_d; double *data0 = (double *)arg0.data_d; double *data4 = (double *)arg4.data_d; double *data9 = (double *)arg9.data_d; double *data13 = (double *)arg13.data_d; op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2); ncolors = Plan->ncolors; int *col_reord = Plan->col_reord; int set_size1 = set->size + set->exec_size; // execute plan for ( int col=0; col<Plan->ncolors; col++ ){ if (col==1) { op_mpi_wait_all_cuda(nargs, args); } int start = Plan->col_offsets[0][col]; int end = Plan->col_offsets[0][col+1]; #pragma acc parallel loop independent deviceptr(col_reord,map0,data8,data0,data4,data9,data13) for ( int e=start; e<end; e++ ){ int n = col_reord[e]; int map0idx = map0[n + set_size1 * 0]; int map1idx = map0[n + set_size1 * 1]; int map2idx = map0[n + set_size1 * 2]; int map3idx = map0[n + set_size1 * 3]; const double* arg0_vec[] = { &data0[2 * map0idx], &data0[2 * map1idx], &data0[2 * map2idx], &data0[2 * map3idx]}; const double* arg4_vec[] = { &data4[1 * map0idx], &data4[1 * map1idx], &data4[1 * map2idx], &data4[1 * map3idx]}; double* arg9_vec[] = { &data9[1 * map0idx], &data9[1 * map1idx], &data9[1 * map2idx], &data9[1 * map3idx]}; double* arg13_vec[] = { &data13[2 * map0idx], &data13[2 * map1idx], &data13[2 * map2idx], &data13[2 * map3idx]}; res_calc( arg0_vec, arg4_vec, &data8[n], arg9_vec, arg13_vec); } } OP_kernels[0].transfer += Plan->transfer; OP_kernels[0].transfer2 += Plan->transfer2; } if (set_size == 0 || set_size == set->core_size || ncolors == 1) { op_mpi_wait_all_cuda(nargs, args); } // combine reduction data op_mpi_set_dirtybit_cuda(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].time += wall_t2 - wall_t1; }