// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4) { double *arg4h = (double *)arg4.data; int nargs = 5; op_arg args[5]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(4); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[4].name = name; OP_kernels[4].count += 1; if (OP_diags > 2) { printf(" kernel routine w/o indirection: update"); } op_mpi_halo_exchanges_cuda(set, nargs, args); double arg4_l = arg4h[0]; if (set->size > 0) { // Set up typed device pointers for OpenACC double *data0 = (double *)arg0.data_d; double *data1 = (double *)arg1.data_d; double *data2 = (double *)arg2.data_d; double *data3 = (double *)arg3.data_d; #pragma acc parallel loop independent deviceptr(data0, data1, data2, \ data3) reduction(+ : arg4_l) for (int n = 0; n < set->size; n++) { update(&data0[4 * n], &data1[4 * n], &data2[4 * n], &data3[1 * n], &arg4_l); } } // combine reduction data arg4h[0] = arg4_l; op_mpi_reduce_double(&arg4, arg4h); op_mpi_set_dirtybit_cuda(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[4].time += wall_t2 - wall_t1; OP_kernels[4].transfer += (float)set->size * arg0.size; OP_kernels[4].transfer += (float)set->size * arg1.size; OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f; OP_kernels[4].transfer += (float)set->size * arg3.size; }
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3){ double*arg3h = (double *)arg3.data; int nargs = 4; op_arg args[4]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(8); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[8].name = name; OP_kernels[8].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: update"); } op_mpi_halo_exchanges_cuda(set, nargs, args); #ifdef OP_PART_SIZE_8 int part_size = OP_PART_SIZE_8; #else int part_size = OP_part_size; #endif #ifdef OP_BLOCK_SIZE_8 int nthread = OP_BLOCK_SIZE_8; #else int nthread = OP_block_size; #endif double arg3_l = arg3h[0]; if (set->size >0) { //Set up typed device pointers for OpenMP double* data0 = (double*)arg0.data_d; int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim; double* data1 = (double*)arg1.data_d; int dat1size = getSetSizeFromOpArg(&arg1) * arg1.dat->dim; double* data2 = (double*)arg2.data_d; int dat2size = getSetSizeFromOpArg(&arg2) * arg2.dat->dim; update_omp4_kernel( data0, dat0size, data1, dat1size, data2, dat2size, &arg3_l, set->size, part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread, nthread); } // combine reduction data arg3h[0] = arg3_l; op_mpi_reduce_double(&arg3,arg3h); op_mpi_set_dirtybit_cuda(nargs, args); if (OP_diags>1) deviceSync(); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[8].time += wall_t2 - wall_t1; OP_kernels[8].transfer += (float)set->size * arg0.size * 2.0f; OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f; OP_kernels[8].transfer += (float)set->size * arg2.size; }