// host stub function
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    for ( int n=0; n<set_size; n++ ){
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg0.map_data[n * arg0.map->dim + 2];
      int map3idx = arg0.map_data[n * arg0.map->dim + 3];

      adt_calc(
        &((double*)arg0.data)[2 * map0idx],
        &((double*)arg0.data)[2 * map1idx],
        &((double*)arg0.data)[2 * map2idx],
        &((double*)arg0.data)[2 * map3idx],
        &((double*)arg4.data)[4 * n],
        &((double*)arg5.data)[1 * n]);
    }
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name   = name;
  OP_kernels[1].count += 1;
  OP_kernels[1].time  += wall_t2 - wall_t1;
}
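These host stubs are generated from a single user-level loop over the cells set. For reference, a sketch of that call as it appears in the standard Airfoil example is shown below; the set, dataset and map names (cells, p_x, p_q, p_adt, pcell) follow the Airfoil application and are assumptions here, and the "double" type strings match this first variant (the single-precision variants further below would use "float").

op_par_loop(adt_calc, "adt_calc", cells,
            op_arg_dat(p_x,    0, pcell, 2, "double", OP_READ),
            op_arg_dat(p_x,    1, pcell, 2, "double", OP_READ),
            op_arg_dat(p_x,    2, pcell, 2, "double", OP_READ),
            op_arg_dat(p_x,    3, pcell, 2, "double", OP_READ),
            op_arg_dat(p_q,   -1, OP_ID, 4, "double", OP_READ),
            op_arg_dat(p_adt, -1, OP_ID, 1, "double", OP_WRITE));

Each op_arg_dat corresponds to arg0..arg5 in the generated stubs: the four p_x arguments are accessed indirectly through the pcell mapping (hence inds = {0,0,0,0,-1,-1} in the planned variants), while p_q and p_adt are accessed directly on the cells set.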
void op_x86_adt_calc(
  int     blockIdx,
  double *ind_arg0,
  int    *ind_arg0_maps,
  short  *arg0_maps,
  short  *arg1_maps,
  short  *arg2_maps,
  short  *arg3_maps,
  double *arg4,
  double *arg5,
  int    *ind_arg_sizes,
  int    *ind_arg_offs,
  int     block_offset,
  int    *blkmap,
  int    *offset,
  int    *nelems,
  int    *ncolors,
  int    *colors) {

  int    *ind_arg0_map, ind_arg0_size;
  double *ind_arg0_s;
  int     nelem, offset_b;

  char shared[64000];  // per-block staging buffer, emulating CUDA shared memory

  if (0==0) {  // always true; retained to mirror the thread-0 guard of the CUDA version

    // get sizes and shift pointers and direct-mapped data

    int blockId = blkmap[blockIdx + block_offset];
    nelem    = nelems[blockId];
    offset_b = offset[blockId];

    ind_arg0_size = ind_arg_sizes[0+blockId*1];

    ind_arg0_map = ind_arg0_maps + ind_arg_offs[0+blockId*1];

    // set shared memory pointers

    int nbytes = 0;
    ind_arg0_s = (double *) &shared[nbytes];
  }

  // __syncthreads();  // not needed on the CPU: each block is processed by a single thread

  // copy indirect datasets into shared memory or zero increment

  for (int n=0; n<ind_arg0_size; n++)
    for (int d=0; d<2; d++)
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];

  // __syncthreads();

  // process set elements

  for (int n=0; n<nelem; n++) {

    // user-supplied kernel call

    adt_calc( ind_arg0_s+arg0_maps[n+offset_b]*2,
              ind_arg0_s+arg1_maps[n+offset_b]*2,
              ind_arg0_s+arg2_maps[n+offset_b]*2,
              ind_arg0_s+arg3_maps[n+offset_b]*2,
              arg4+(n+offset_b)*4,
              arg5+(n+offset_b)*1 );
  }
}
// host stub function
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);

  int ninds = 1;
  int inds[6] = {0,0,0,0,-1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){
        int blockId  = Plan->blkmap[blockIdx + block_offset];
        int nelem    = Plan->nelems[blockId];
        int offset_b = Plan->offset[blockId];
        for ( int n=offset_b; n<offset_b+nelem; n++ ){
          int map0idx = arg0.map_data[n * arg0.map->dim + 0];
          int map1idx = arg0.map_data[n * arg0.map->dim + 1];
          int map2idx = arg0.map_data[n * arg0.map->dim + 2];
          int map3idx = arg0.map_data[n * arg0.map->dim + 3];

          adt_calc(
            &((double*)arg0.data)[2 * map0idx],
            &((double*)arg0.data)[2 * map1idx],
            &((double*)arg0.data)[2 * map2idx],
            &((double*)arg0.data)[2 * map3idx],
            &((double*)arg4.data)[4 * n],
            &((double*)arg5.data)[1 * n]);
        }
      }
      block_offset += nblocks;
    }
    OP_kernels[1].transfer  += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].name   = name;
  OP_kernels[1].count += 1;
  OP_kernels[1].time  += wall_t2 - wall_t1;
}
void op_x86_adt_calc(
  int    blockIdx,
  float *ind_arg0,
  int   *ind_arg0_maps,
  short *arg0_maps,
  short *arg1_maps,
  short *arg2_maps,
  short *arg3_maps,
  float *arg4,
  float *arg5,
  int   *ind_arg_sizes,
  int   *ind_arg_offs,
  int    block_offset,
  int   *blkmap,
  int   *offset,
  int   *nelems,
  int   *ncolors,
  int   *colors) {

  float *arg0_vec[4];

  int   *ind_arg0_map, ind_arg0_size;
  float *ind_arg0_s;
  int    nelem, offset_b;

  char shared[64000];  // per-block staging buffer, emulating CUDA shared memory

  if (0==0) {

    // get sizes and shift pointers and direct-mapped data

    int blockId = blkmap[blockIdx + block_offset];
    nelem    = nelems[blockId];
    offset_b = offset[blockId];

    ind_arg0_size = ind_arg_sizes[0+blockId*1];

    ind_arg0_map = ind_arg0_maps + ind_arg_offs[0+blockId*1];

    // set shared memory pointers

    int nbytes = 0;
    ind_arg0_s = (float *) &shared[nbytes];
  }

  // copy indirect datasets into shared memory or zero increment

  for (int n=0; n<ind_arg0_size; n++)
    for (int d=0; d<2; d++)
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];

  // process set elements

  for (int n=0; n<nelem; n++) {

    arg0_vec[0] = ind_arg0_s+arg0_maps[n+offset_b]*2;
    arg0_vec[1] = ind_arg0_s+arg1_maps[n+offset_b]*2;
    arg0_vec[2] = ind_arg0_s+arg2_maps[n+offset_b]*2;
    arg0_vec[3] = ind_arg0_s+arg3_maps[n+offset_b]*2;

    // user-supplied kernel call

    adt_calc( arg0_vec,
              arg4+(n+offset_b)*4,
              arg5+(n+offset_b)*1 );
  }
}
// host stub function
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5) {

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[1].name   = name;
  OP_kernels[1].count += 1;

  int ninds = 1;
  int inds[6] = {0, 0, 0, 0, -1, -1};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  int ncolors = 0;

  if (set->size > 0) {

    // set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;

    float *data4 = (float *)arg4.data_d;
    float *data5 = (float *)arg5.data_d;
    float *data0 = (float *)arg0.data_d;

    op_plan *Plan = op_plan_get_stage(name, set, part_size, nargs, args,
                                      ninds, inds, OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;

    // execute plan
    for (int col = 0; col < Plan->ncolors; col++) {
      if (col == 1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end   = Plan->col_offsets[0][col + 1];

      #pragma acc parallel loop independent deviceptr(col_reord, map0, data4, data5, \
                                                      data0)
      for (int e = start; e < end; e++) {
        int n = col_reord[e];
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map0[n + set_size1 * 2];
        int map3idx = map0[n + set_size1 * 3];

        adt_calc(&data0[2 * map0idx],
                 &data0[2 * map1idx],
                 &data0[2 * map2idx],
                 &data0[2 * map3idx],
                 &data4[4 * n],
                 &data5[1 * n]);
      }
    }
    OP_kernels[1].transfer  += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].time += wall_t2 - wall_t1;
}
void op_x86_adt_calc(
  int     blockIdx,
  double *ind_arg0,
  int    *ind_map,
  short  *arg_map,
  double *arg4,
  double *arg5,
  int    *ind_arg_sizes,
  int    *ind_arg_offs,
  int     block_offset,
  int    *blkmap,
  int    *offset,
  int    *nelems,
  int    *ncolors,
  int    *colors,
  int     set_size) {

  int    *ind_arg0_map, ind_arg0_size;
  double *ind_arg0_s;
  int     nelem, offset_b;

  char shared[128000];  // per-block staging buffer, emulating CUDA shared memory

  if (0==0) {

    // get sizes and shift pointers and direct-mapped data

    int blockId = blkmap[blockIdx + block_offset];
    nelem    = nelems[blockId];
    offset_b = offset[blockId];

    ind_arg0_size = ind_arg_sizes[0+blockId*1];

    ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*1];

    // set shared memory pointers

    int nbytes = 0;
    ind_arg0_s = (double *) &shared[nbytes];
  }

  // copy indirect datasets into shared memory or zero increment

  for (int n=0; n<ind_arg0_size; n++)
    for (int d=0; d<2; d++)
      ind_arg0_s[d+n*2] = ind_arg0[d+ind_arg0_map[n]*2];

  // process set elements

  for (int n=0; n<nelem; n++) {

    // user-supplied kernel call

    adt_calc( ind_arg0_s+arg_map[0*set_size+n+offset_b]*2,
              ind_arg0_s+arg_map[1*set_size+n+offset_b]*2,
              ind_arg0_s+arg_map[2*set_size+n+offset_b]*2,
              ind_arg0_s+arg_map[3*set_size+n+offset_b]*2,
              arg4+(n+offset_b)*4,
              arg5+(n+offset_b)*1 );
  }
}
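All of the generated variants above ultimately call the same user-supplied kernel. For reference, a minimal sketch of adt_calc as it appears in the double-precision Airfoil example follows; the global constants gam, gm1 and cfl are assumed to be declared in the application (and passed to OP2 via op_decl_const), and the exact arithmetic is reproduced from the standard example rather than from this document.

#include <math.h>

// assumed application-level constants (declared to OP2 via op_decl_const)
extern double gam, gm1, cfl;

inline void adt_calc(const double *x1, const double *x2, const double *x3,
                     const double *x4, const double *q, double *adt) {
  double dx, dy, ri, u, v, c;

  ri = 1.0 / q[0];                                         // reciprocal of density
  u  = ri * q[1];                                          // x-velocity
  v  = ri * q[2];                                          // y-velocity
  c  = sqrt(gam * gm1 * (ri * q[3] - 0.5 * (u*u + v*v)));  // speed of sound

  // accumulate |u x edge| + c*|edge| over the four edges of the cell
  dx = x2[0] - x1[0];  dy = x2[1] - x1[1];
  *adt  = fabs(u*dy - v*dx) + c * sqrt(dx*dx + dy*dy);

  dx = x3[0] - x2[0];  dy = x3[1] - x2[1];
  *adt += fabs(u*dy - v*dx) + c * sqrt(dx*dx + dy*dy);

  dx = x4[0] - x3[0];  dy = x4[1] - x3[1];
  *adt += fabs(u*dy - v*dx) + c * sqrt(dx*dx + dy*dy);

  dx = x1[0] - x4[0];  dy = x1[1] - x4[1];
  *adt += fabs(u*dy - v*dx) + c * sqrt(dx*dx + dy*dy);

  *adt = (*adt) / cfl;                                     // local time-step bound
}

The single-precision variants use the same kernel with float in place of double, and the vector-argument variant gathers the four x pointers into arg0_vec and passes them to adt_calc as a single pointer-array argument.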