// host stub function
void op_par_loop_bres_calc_cpu(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(3);
  op_timers_core(&cpu_t1, &wall_t1);

  int ninds = 4;
  int inds[6] = {0,0,1,2,3,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_3
    int part_size = OP_PART_SIZE_3;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

    // execute plan
    int block_offset = 0;
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){
        int blockId  = Plan->blkmap[blockIdx + block_offset];
        int nelem    = Plan->nelems[blockId];
        int offset_b = Plan->offset[blockId];
        for ( int n=offset_b; n<offset_b+nelem; n++ ){
          int map0idx = arg0.map_data[n * arg0.map->dim + 0];
          int map1idx = arg0.map_data[n * arg0.map->dim + 1];
          int map2idx = arg2.map_data[n * arg2.map->dim + 0];

          bres_calc(
            &((double*)arg0.data)[2 * map0idx],
            &((double*)arg0.data)[2 * map1idx],
            &((double*)arg2.data)[4 * map2idx],
            &((double*)arg3.data)[1 * map2idx],
            &((double*)arg4.data)[4 * map2idx],
            &((int*)arg5.data)[1 * n]);
        }
      }
      block_offset += nblocks;
    }
    OP_kernels[3].transfer  += Plan->transfer;
    OP_kernels[3].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].name   = name;
  OP_kernels[3].count += 1;
  OP_kernels[3].time  += wall_t2 - wall_t1;
}
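The host stub above invokes the user-supplied bres_calc kernel once per set element, passing two indirectly accessed node coordinates, three indirectly accessed cell quantities and one directly accessed edge flag. A minimal sketch of a kernel signature consistent with those pointers is shown below; the argument names and the omitted flux computation are illustrative assumptions, not the actual application kernel.

// sketch of a user kernel matching the call in the OpenMP stub above;
// argument names and the (omitted) body are illustrative assumptions
inline void bres_calc(const double *x1, const double *x2, const double *q1,
                      const double *adt1, double *res1, const int *bound) {
  // x1, x2 : coordinates of the two edge nodes   (dim 2, read)
  // q1     : flow variables of the adjacent cell (dim 4, read)
  // adt1   : area/timestep of the adjacent cell  (dim 1, read)
  // res1   : residual of the adjacent cell       (dim 4, increment)
  // bound  : per-edge boundary flag              (dim 1, read, direct)
  // ... application-specific boundary-flux computation incrementing res1[0..3] ...
}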
void op_x86_bres_calc(
  int    blockIdx,
  float *ind_arg0,
  float *ind_arg1,
  float *ind_arg2,
  float *ind_arg3,
  int   *ind_map,
  short *arg_map,
  int   *arg5,
  int   *ind_arg_sizes,
  int   *ind_arg_offs,
  int    block_offset,
  int   *blkmap,
  int   *offset,
  int   *nelems,
  int   *ncolors,
  int   *colors,
  int    set_size) {

  float  arg4_l[4];
  float *arg0_vec[2];

  int   *ind_arg0_map, ind_arg0_size;
  int   *ind_arg1_map, ind_arg1_size;
  int   *ind_arg2_map, ind_arg2_size;
  int   *ind_arg3_map, ind_arg3_size;
  float *ind_arg0_s;
  float *ind_arg1_s;
  float *ind_arg2_s;
  float *ind_arg3_s;
  int    nelem, offset_b;

  char shared[128000];

  if (0==0) {

    // get sizes and shift pointers and direct-mapped data
    int blockId = blkmap[blockIdx + block_offset];
    nelem    = nelems[blockId];
    offset_b = offset[blockId];

    ind_arg0_size = ind_arg_sizes[0+blockId*4];
    ind_arg1_size = ind_arg_sizes[1+blockId*4];
    ind_arg2_size = ind_arg_sizes[2+blockId*4];
    ind_arg3_size = ind_arg_sizes[3+blockId*4];

    ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*4];
    ind_arg1_map = &ind_map[2*set_size] + ind_arg_offs[1+blockId*4];
    ind_arg2_map = &ind_map[3*set_size] + ind_arg_offs[2+blockId*4];
    ind_arg3_map = &ind_map[4*set_size] + ind_arg_offs[3+blockId*4];

    // set shared memory pointers
    int nbytes = 0;
    ind_arg0_s = (float *) &shared[nbytes];
    nbytes    += ROUND_UP(ind_arg0_size*sizeof(float)*2);
    ind_arg1_s = (float *) &shared[nbytes];
    nbytes    += ROUND_UP(ind_arg1_size*sizeof(float)*4);
    ind_arg2_s = (float *) &shared[nbytes];
    nbytes    += ROUND_UP(ind_arg2_size*sizeof(float)*1);
    ind_arg3_s = (float *) &shared[nbytes];
  }

  // copy indirect datasets into shared memory or zero increment
  for (int n=0; n<ind_arg0_size; n++)
    for (int d=0; d<2; d++)
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];

  for (int n=0; n<ind_arg1_size; n++)
    for (int d=0; d<4; d++)
      ind_arg1_s[d+n*4] = ind_arg1[d+ind_arg1_map[n]*4];

  for (int n=0; n<ind_arg2_size; n++)
    for (int d=0; d<1; d++)
      ind_arg2_s[d+n*1] = ind_arg2[d+ind_arg2_map[n]*1];

  for (int n=0; n<ind_arg3_size; n++)
    for (int d=0; d<4; d++)
      ind_arg3_s[d+n*4] = ZERO_float;

  // process set elements
  for (int n=0; n<nelem; n++) {

    // initialise local variables
    for (int d=0; d<4; d++)
      arg4_l[d] = ZERO_float;

    arg0_vec[0] = ind_arg0_s+arg_map[0*set_size+n+offset_b]*2;
    arg0_vec[1] = ind_arg0_s+arg_map[1*set_size+n+offset_b]*2;

    // user-supplied kernel call
    bres_calc(
      arg0_vec,
      ind_arg1_s+arg_map[2*set_size+n+offset_b]*4,
      ind_arg2_s+arg_map[3*set_size+n+offset_b]*1,
      arg4_l,
      arg5+(n+offset_b)*1 );

    // store local variables
    int arg4_map = arg_map[4*set_size+n+offset_b];

    for (int d=0; d<4; d++)
      ind_arg3_s[d+arg4_map*4] += arg4_l[d];
  }

  // apply pointered write/increment
  for (int n=0; n<ind_arg3_size; n++)
    for (int d=0; d<4; d++)
      ind_arg3[d+ind_arg3_map[n]*4] += ind_arg3_s[d+n*4];
}
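The staged kernels above and below rely on the ROUND_UP and ZERO_float helpers from the OP2 runtime headers, whose definitions are not reproduced in these listings. A plausible stand-in, assuming 16-byte alignment of the staged-data offsets, would be:

// stand-in definitions for the helpers used by the staged kernels;
// the real OP2 headers may use a different alignment constant (assumption)
#ifndef ROUND_UP
#define ROUND_UP(bytes) (((bytes) + 15) & ~15)  // round shared-memory offsets up to a 16-byte boundary
#endif
#ifndef ZERO_float
#define ZERO_float 0.0f                         // zero constant used to initialise local increments
#endif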
void op_x86_bres_calc(
  int    blockIdx,
  float *ind_arg0,
  int   *ind_arg0_maps,
  float *ind_arg1,
  int   *ind_arg1_maps,
  float *ind_arg2,
  int   *ind_arg2_maps,
  float *ind_arg3,
  int   *ind_arg3_maps,
  short *arg0_maps,
  short *arg1_maps,
  short *arg2_maps,
  short *arg3_maps,
  short *arg4_maps,
  int   *arg5,
  int   *ind_arg_sizes,
  int   *ind_arg_offs,
  int    block_offset,
  int   *blkmap,
  int   *offset,
  int   *nelems,
  int   *ncolors,
  int   *colors) {

  float arg4_l[4];

  int   *ind_arg0_map, ind_arg0_size;
  int   *ind_arg1_map, ind_arg1_size;
  int   *ind_arg2_map, ind_arg2_size;
  int   *ind_arg3_map, ind_arg3_size;
  float *ind_arg0_s;
  float *ind_arg1_s;
  float *ind_arg2_s;
  float *ind_arg3_s;
  int    nelems2, ncolor;
  int    nelem, offset_b;

  char shared[64000];

  if (0==0) {

    // get sizes and shift pointers and direct-mapped data
    int blockId = blkmap[blockIdx + block_offset];
    nelem    = nelems[blockId];
    offset_b = offset[blockId];

    nelems2 = nelem;
    ncolor  = ncolors[blockId];

    ind_arg0_size = ind_arg_sizes[0+blockId*4];
    ind_arg1_size = ind_arg_sizes[1+blockId*4];
    ind_arg2_size = ind_arg_sizes[2+blockId*4];
    ind_arg3_size = ind_arg_sizes[3+blockId*4];

    ind_arg0_map = ind_arg0_maps + ind_arg_offs[0+blockId*4];
    ind_arg1_map = ind_arg1_maps + ind_arg_offs[1+blockId*4];
    ind_arg2_map = ind_arg2_maps + ind_arg_offs[2+blockId*4];
    ind_arg3_map = ind_arg3_maps + ind_arg_offs[3+blockId*4];

    // set shared memory pointers
    int nbytes = 0;
    ind_arg0_s = (float *) &shared[nbytes];
    nbytes    += ROUND_UP(ind_arg0_size*sizeof(float)*2);
    ind_arg1_s = (float *) &shared[nbytes];
    nbytes    += ROUND_UP(ind_arg1_size*sizeof(float)*4);
    ind_arg2_s = (float *) &shared[nbytes];
    nbytes    += ROUND_UP(ind_arg2_size*sizeof(float)*1);
    ind_arg3_s = (float *) &shared[nbytes];
  }

  __syncthreads(); // make sure all of above completed

  // copy indirect datasets into shared memory or zero increment
  for (int n=0; n<ind_arg0_size; n++)
    for (int d=0; d<2; d++)
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];

  for (int n=0; n<ind_arg1_size; n++)
    for (int d=0; d<4; d++)
      ind_arg1_s[d+n*4] = ind_arg1[d+ind_arg1_map[n]*4];

  for (int n=0; n<ind_arg2_size; n++)
    for (int d=0; d<1; d++)
      ind_arg2_s[d+n*1] = ind_arg2[d+ind_arg2_map[n]*1];

  for (int n=0; n<ind_arg3_size; n++)
    for (int d=0; d<4; d++)
      ind_arg3_s[d+n*4] = ZERO_float;

  __syncthreads();

  // process set elements
  for (int n=0; n<nelems2; n++) {
    int col2 = -1;

    if (n<nelem) {

      // initialise local variables
      for (int d=0; d<4; d++)
        arg4_l[d] = ZERO_float;

      // user-supplied kernel call
      bres_calc(
        ind_arg0_s+arg0_maps[n+offset_b]*2,
        ind_arg0_s+arg1_maps[n+offset_b]*2,
        ind_arg1_s+arg2_maps[n+offset_b]*4,
        ind_arg2_s+arg3_maps[n+offset_b]*1,
        arg4_l,
        arg5+(n+offset_b)*1 );

      col2 = colors[n+offset_b];
    }

    // store local variables
    int arg4_map = arg4_maps[n+offset_b];

    for (int col=0; col<ncolor; col++) {
      if (col2==col) {
        for (int d=0; d<4; d++)
          ind_arg3_s[d+arg4_map*4] += arg4_l[d];
      }
      __syncthreads();
    }
  }

  // apply pointered write/increment
  for (int n=0; n<ind_arg3_size; n++)
    for (int d=0; d<4; d++)
      ind_arg3[d+ind_arg3_map[n]*4] += ind_arg3_s[d+n*4];
}
// host stub function
void op_par_loop_bres_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(3);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg2.map_data[n * arg2.map->dim + 0];

      bres_calc(
        &((float*)arg0.data)[2 * map0idx],
        &((float*)arg0.data)[2 * map1idx],
        &((float*)arg2.data)[4 * map2idx],
        &((float*)arg3.data)[1 * map2idx],
        &((float*)arg4.data)[4 * map2idx],
        &((int*)arg5.data)[1 * n]);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].name      = name;
  OP_kernels[3].count    += 1;
  OP_kernels[3].time     += wall_t2 - wall_t1;
  OP_kernels[3].transfer += (float)set->size * arg0.size;
  OP_kernels[3].transfer += (float)set->size * arg2.size;
  OP_kernels[3].transfer += (float)set->size * arg3.size;
  OP_kernels[3].transfer += (float)set->size * arg4.size * 2.0f;
  OP_kernels[3].transfer += (float)set->size * arg5.size;
  OP_kernels[3].transfer += (float)set->size * arg0.map->dim * 4.0f;
  OP_kernels[3].transfer += (float)set->size * arg2.map->dim * 4.0f;
}
// host stub function
void op_par_loop_bres_calc(char const *name, op_set set,
                           op_arg arg0, op_arg arg1, op_arg arg2,
                           op_arg arg3, op_arg arg4, op_arg arg5) {

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(3);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[3].name   = name;
  OP_kernels[3].count += 1;

  int ninds = 4;
  int inds[6] = {0, 0, 1, 2, 3, -1};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_3
    int part_size = OP_PART_SIZE_3;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  int ncolors = 0;

  if (set->size > 0) {

    // Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;
    int *map2 = arg2.map_data_d;

    int    *data5 = (int *)arg5.data_d;
    double *data0 = (double *)arg0.data_d;
    double *data2 = (double *)arg2.data_d;
    double *data3 = (double *)arg3.data_d;
    double *data4 = (double *)arg4.data_d;

    op_plan *Plan = op_plan_get_stage(name, set, part_size, nargs, args,
                                      ninds, inds, OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;

    // execute plan
    for (int col = 0; col < Plan->ncolors; col++) {
      if (col == 1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end   = Plan->col_offsets[0][col + 1];

      #pragma acc parallel loop independent deviceptr(col_reord, map0, map2, data5, \
                                                      data0, data2, data3, data4)
      for (int e = start; e < end; e++) {
        int n = col_reord[e];
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map2[n + set_size1 * 0];

        bres_calc(&data0[2 * map0idx], &data0[2 * map1idx], &data2[4 * map2idx],
                  &data3[1 * map2idx], &data4[4 * map2idx], &data5[1 * n]);
      }
    }
    OP_kernels[3].transfer  += Plan->transfer;
    OP_kernels[3].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].time += wall_t2 - wall_t1;
}
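All of the host stubs above implement the same application-level loop call; a representative call site is sketched below. The op_dat and op_map handle names are assumptions for illustration, and the type string would be "float" rather than "double" for the single-precision variants of the listings.

// illustrative application-level call that the generated stubs above implement;
// dataset/map handles (p_x, p_q, p_adt, p_res, p_bound, pbedge, pbecell) are assumed names
op_par_loop_bres_calc("bres_calc", bedges,
    op_arg_dat(p_x,      0, pbedge,  2, "double", OP_READ),
    op_arg_dat(p_x,      1, pbedge,  2, "double", OP_READ),
    op_arg_dat(p_q,      0, pbecell, 4, "double", OP_READ),
    op_arg_dat(p_adt,    0, pbecell, 1, "double", OP_READ),
    op_arg_dat(p_res,    0, pbecell, 4, "double", OP_INC),
    op_arg_dat(p_bound, -1, OP_ID,   1, "int",    OP_READ));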