// host stub function void op_par_loop_adt_calc(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5){ int nargs = 6; op_arg args[6]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; args[5] = arg5; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[1].name = name; OP_kernels[1].count += 1; int ninds = 1; int inds[6] = {0,0,0,0,-1,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: adt_calc\n"); } // get plan int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); #ifdef OP_PART_SIZE_1 int part_size = OP_PART_SIZE_1; #else int part_size = OP_part_size; #endif #ifdef OP_BLOCK_SIZE_1 int nthread = OP_BLOCK_SIZE_1; #else int nthread = OP_block_size; #endif int ncolors = 0; int set_size1 = set->size + set->exec_size; if (set->size >0) { //Set up typed device pointers for OpenMP int *map0 = arg0.map_data_d; int map0size = arg0.map->dim * set_size1; float* data4 = (float*)arg4.data_d; int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim; float* data5 = (float*)arg5.data_d; int dat5size = getSetSizeFromOpArg(&arg5) * arg5.dat->dim; float *data0 = (float *)arg0.data_d; int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim; op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2); ncolors = Plan->ncolors; int *col_reord = Plan->col_reord; // execute plan for ( int col=0; col<Plan->ncolors; col++ ){ if (col==1) { op_mpi_wait_all_cuda(nargs, args); } int start = Plan->col_offsets[0][col]; int end = Plan->col_offsets[0][col+1]; adt_calc_omp4_kernel(map0, map0size, data4, dat4size, data5, dat5size, data0, dat0size, col_reord, set_size1, start, end, part_size != 0 ? 
(end - start - 1) / part_size + 1 : (end - start - 1) / nthread, nthread); } OP_kernels[1].transfer += Plan->transfer; OP_kernels[1].transfer2 += Plan->transfer2; } if (set_size == 0 || set_size == set->core_size || ncolors == 1) { op_mpi_wait_all_cuda(nargs, args); } // combine reduction data op_mpi_set_dirtybit_cuda(nargs, args); if (OP_diags>1) deviceSync(); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].time += wall_t2 - wall_t1; }
// host stub function
//
// Host stub for the res_calc parallel loop (OpenMP4 offload variant):
// expands the vector op_args into a flat 17-entry argument list, performs
// halo exchanges, refreshes the SoA stride globals, obtains a coloured
// execution plan and launches res_calc_omp4_kernel once per colour.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9,
  op_arg arg13){

  // Four vector arguments (arg0, arg4, arg9, arg13) expand to 4 op_args
  // each; arg8 stays a single entry -> 17 args in total.
  int nargs = 17;
  op_arg args[17];

  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;

  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    // arg9/arg13 are optional dats; op_opt_arg_dat carries the opt flag
    args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
  }

  arg13.idx = 0;
  args[13] = arg13;
  for ( int v=1; v<4; v++ ){
    args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;

  int ninds = 4;
  // Per-argument indirection index; the -1 at position 8 marks arg8 as the
  // only argument not accessed through a map here.
  int inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan: start the halo exchange, returning the iteration size
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif
  #ifdef OP_BLOCK_SIZE_0
    int nthread = OP_BLOCK_SIZE_0;
  #else
    int nthread = OP_block_size;
  #endif

  int ncolors = 0;
  int set_size1 = set->size + set->exec_size;

  if (set->size >0) {

    // Refresh the SoA stride globals on the first invocation or whenever
    // the relevant set size changed; the *_OP2CONSTANT copies are what is
    // actually passed to the device kernel below.
    if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
      opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
      opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
    }
    if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
      direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
      direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
    }

    //Set up typed device pointers for OpenMP
    int *map0 = arg0.map_data_d;
    int map0size = arg0.map->dim * set_size1;

    // Optional dats (arg8, arg9, arg13) contribute size 0 when disabled
    // (opt == 0) so the kernel's mapped region degenerates gracefully.
    double* data8 = (double*)arg8.data_d;
    int dat8size = (arg8.opt?1:0) * getSetSizeFromOpArg(&arg8) * arg8.dat->dim;
    double *data0 = (double *)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;
    double *data4 = (double *)arg4.data_d;
    int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
    double *data9 = (double *)arg9.data_d;
    int dat9size = (arg9.opt ? 1 : 0) * getSetSizeFromOpArg(&arg9) * arg9.dat->dim;
    double *data13 = (double *)arg13.data_d;
    int dat13size = (arg13.opt ? 1 : 0) * getSetSizeFromOpArg(&arg13) * arg13.dat->dim;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;

    // execute plan, one colour at a time
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==1) {
        // colour 0 covered the core part; halo data must have arrived
        // before later colours touch imported elements
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];
      res_calc_omp4_kernel(
        map0,
        map0size,
        data8,
        dat8size,
        data0,
        dat0size,
        data4,
        dat4size,
        data9,
        dat9size,
        data13,
        dat13size,
        col_reord,
        set_size1,
        start,
        end,
        // team count; NOTE(review): the part_size==0 branch has no "+1"
        // (kept as generated — confirm against the code generator)
        part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread,
        nthread,
        opDat0_res_calc_stride_OP2CONSTANT,
        direct_res_calc_stride_OP2CONSTANT);
    }
    OP_kernels[0].transfer += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // Nothing launched past the core part (or a single colour): still
  // complete the outstanding halo exchange.
  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);
  if (OP_diags>1) deviceSync();

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}
// host stub function void op_par_loop_adt_calc(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5) { int nargs = 6; op_arg args[6]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; args[5] = arg5; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[1].name = name; OP_kernels[1].count += 1; int ninds = 1; int inds[6] = {0, 0, 0, 0, -1, -1}; if (OP_diags > 2) { printf(" kernel routine with indirection: adt_calc\n"); } // get plan #ifdef OP_PART_SIZE_1 int part_size = OP_PART_SIZE_1; #else int part_size = OP_part_size; #endif int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); int ncolors = 0; if (set->size > 0) { // Set up typed device pointers for OpenACC int *map0 = arg0.map_data_d; float *data4 = (float *)arg4.data_d; float *data5 = (float *)arg5.data_d; float *data0 = (float *)arg0.data_d; op_plan *Plan = op_plan_get_stage(name, set, part_size, nargs, args, ninds, inds, OP_COLOR2); ncolors = Plan->ncolors; int *col_reord = Plan->col_reord; int set_size1 = set->size + set->exec_size; // execute plan for (int col = 0; col < Plan->ncolors; col++) { if (col == 1) { op_mpi_wait_all_cuda(nargs, args); } int start = Plan->col_offsets[0][col]; int end = Plan->col_offsets[0][col + 1]; #pragma acc parallel loop independent deviceptr(col_reord, map0, data4, data5, \ data0) for (int e = start; e < end; e++) { int n = col_reord[e]; int map0idx = map0[n + set_size1 * 0]; int map1idx = map0[n + set_size1 * 1]; int map2idx = map0[n + set_size1 * 2]; int map3idx = map0[n + set_size1 * 3]; adt_calc(&data0[2 * map0idx], &data0[2 * map1idx], &data0[2 * map2idx], &data0[2 * map3idx], &data4[4 * n], &data5[1 * n]); } } OP_kernels[1].transfer += Plan->transfer; OP_kernels[1].transfer2 += Plan->transfer2; } if (set_size == 0 || set_size == set->core_size || ncolors == 1) { op_mpi_wait_all_cuda(nargs, args); } // 
combine reduction data op_mpi_set_dirtybit_cuda(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].time += wall_t2 - wall_t1; }
// host stub function
//
// Host stub for the res_calc parallel loop (OpenACC variant): expands the
// vector op_args into a flat 17-entry argument list, performs halo
// exchanges, refreshes the SoA stride globals, obtains a coloured plan and
// runs the user kernel res_calc over each colour in an OpenACC loop.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9,
  op_arg arg13){

  // Four vector arguments (arg0, arg4, arg9, arg13) expand to 4 op_args
  // each; arg8 stays a single entry -> 17 args in total.
  int nargs = 17;
  op_arg args[17];

  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;

  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    // arg9/arg13 are optional dats; op_opt_arg_dat carries the opt flag
    args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
  }

  arg13.idx = 0;
  args[13] = arg13;
  for ( int v=1; v<4; v++ ){
    args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;

  int ninds = 4;
  // Per-argument indirection index; the -1 at position 8 marks arg8 as the
  // only argument not accessed through a map here.
  int inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  // start the halo exchange, returning the iteration size
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);
  int ncolors = 0;

  if (set->size >0) {

    // Refresh the SoA stride globals on the first invocation or whenever
    // the relevant set size changed; the *_OP2CONSTANT copies are
    // presumably consumed by the device-side res_calc — defined elsewhere.
    if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
      opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
      opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
    }
    if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
      direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
      direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
    }

    //Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;

    double* data8 = (double*)arg8.data_d;
    double *data0 = (double *)arg0.data_d;
    double *data4 = (double *)arg4.data_d;
    double *data9 = (double *)arg9.data_d;
    double *data13 = (double *)arg13.data_d;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;

    // execute plan, one colour at a time
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==1) {
        // colour 0 covered the core part; halo data must have arrived
        // before later colours touch imported elements
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];

      #pragma acc parallel loop independent deviceptr(col_reord,map0,data8,data0,data4,data9,data13)
      for ( int e=start; e<end; e++ ){
        int n = col_reord[e];
        // four mapped indices; map0 is stored SoA with stride set_size1
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map0[n + set_size1 * 2];
        int map3idx = map0[n + set_size1 * 3];

        // Assemble per-element pointer vectors for the vector op_args
        // (the literal multipliers are each dat's dimension).
        const double* arg0_vec[] = {
           &data0[2 * map0idx],
           &data0[2 * map1idx],
           &data0[2 * map2idx],
           &data0[2 * map3idx]};
        const double* arg4_vec[] = {
           &data4[1 * map0idx],
           &data4[1 * map1idx],
           &data4[1 * map2idx],
           &data4[1 * map3idx]};
        double* arg9_vec[] = {
           &data9[1 * map0idx],
           &data9[1 * map1idx],
           &data9[1 * map2idx],
           &data9[1 * map3idx]};
        double* arg13_vec[] = {
           &data13[2 * map0idx],
           &data13[2 * map1idx],
           &data13[2 * map2idx],
           &data13[2 * map3idx]};

        res_calc(
          arg0_vec,
          arg4_vec,
          &data8[n],
          arg9_vec,
          arg13_vec);
      }
    }
    OP_kernels[0].transfer += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // Nothing ran past the core part (or a single colour): still complete
  // the outstanding halo exchange.
  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}