// host stub function void op_par_loop_res(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3){ int nargs = 4; op_arg args[4]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(0); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine with indirection: res\n"); } int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { for ( int n=0; n<set_size; n++ ){ if (n==set->core_size) { op_mpi_wait_all(nargs, args); } int map1idx = arg1.map_data[n * arg1.map->dim + 1]; int map2idx = arg1.map_data[n * arg1.map->dim + 0]; res( &((double*)arg0.data)[1 * n], &((double*)arg1.data)[1 * map1idx], &((double*)arg2.data)[1 * map2idx], (double*)arg3.data); } } if (set_size == 0 || set_size == set->core_size) { op_mpi_wait_all(nargs, args); } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].name = name; OP_kernels[0].count += 1; OP_kernels[0].time += wall_t2 - wall_t1; OP_kernels[0].transfer += (float)set->size * arg1.size; OP_kernels[0].transfer += (float)set->size * arg2.size * 2.0f; OP_kernels[0].transfer += (float)set->size * arg0.size; OP_kernels[0].transfer += (float)set->size * arg3.size; OP_kernels[0].transfer += (float)set->size * arg1.map->dim * 4.0f; }
// OpenCL host stub for the direct kernel save_soln: sets the kernel
// arguments, launches one NDRange over the set, waits for completion and
// records timing statistics in OP_kernels[0].
//
// Fixes relative to the generated version:
//  - the PROFILE section used the undeclared identifiers ciErrNum/ceEvent
//    (a compile error when PROFILE is defined); it now uses the local
//    errorCode/event variables,
//  - op_timing_realloc(0) is called before OP_kernels[0] is first updated
//    (the PROFILE section accumulated into it before the realloc),
//  - the cl_event returned by clEnqueueNDRangeKernel is released (leak),
//  - the unused local telapsed is removed.
void save_soln_host(const char *userSubroutine, op_set set, op_arg opDat1,
                    op_arg opDat2) {
  size_t blocksPerGrid;
  size_t threadsPerBlock;
  size_t totalThreadNumber;
  size_t dynamicSharedMemorySize;
  cl_int errorCode;
  cl_event event;
  cl_kernel kernelPointer;
  int sharedMemoryOffset;
  double cpu_t1, cpu_t2, wall_t1, wall_t2;

  op_timers(&cpu_t1, &wall_t1);
  // make sure OP_kernels[0] exists before anything accumulates into it
  op_timing_realloc(0);

  // launch configuration
  blocksPerGrid = 200;
  threadsPerBlock = threadsPerBlockSize_save_soln;
  totalThreadNumber = threadsPerBlock * blocksPerGrid;

  // per-thread local-memory requirement: 4 floats for each of the two dats
  dynamicSharedMemorySize = 0;
  dynamicSharedMemorySize = MAX(dynamicSharedMemorySize, sizeof(float) * 4);
  dynamicSharedMemorySize = MAX(dynamicSharedMemorySize, sizeof(float) * 4);
  sharedMemoryOffset = dynamicSharedMemorySize * OP_WARPSIZE;
  dynamicSharedMemorySize = dynamicSharedMemorySize * threadsPerBlock;

  kernelPointer = getKernel("save_soln_kernel");

  errorCode = clSetKernelArg(kernelPointer, 0, sizeof(cl_mem), &opDat1.data_d);
  errorCode = errorCode |
              clSetKernelArg(kernelPointer, 1, sizeof(cl_mem), &opDat2.data_d);
  errorCode = errorCode | clSetKernelArg(kernelPointer, 2, sizeof(int),
                                         &sharedMemoryOffset);
  errorCode =
      errorCode | clSetKernelArg(kernelPointer, 3, sizeof(int), &set->size);
  // argument 4 is the dynamically sized local-memory buffer (no host data)
  errorCode = errorCode |
              clSetKernelArg(kernelPointer, 4, dynamicSharedMemorySize, NULL);
  assert_m(errorCode == CL_SUCCESS,
           "Error setting OpenCL kernel arguments save_soln");

  errorCode = clEnqueueNDRangeKernel(cqCommandQueue, kernelPointer, 1, NULL,
                                     &totalThreadNumber, &threadsPerBlock, 0,
                                     NULL, &event);
  assert_m(errorCode == CL_SUCCESS, "Error executing OpenCL kernel save_soln");

  errorCode = clFinish(cqCommandQueue);
  assert_m(errorCode == CL_SUCCESS, "Error completing device command queue");

#ifdef PROFILE
  unsigned long tqueue, tsubmit, tstart, tend;
  errorCode = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
                                      sizeof(tqueue), &tqueue, NULL);
  errorCode |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT,
                                       sizeof(tsubmit), &tsubmit, NULL);
  errorCode |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                                       sizeof(tstart), &tstart, NULL);
  errorCode |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                                       sizeof(tend), &tend, NULL);
  assert_m(errorCode == CL_SUCCESS, "error getting profiling info");
  OP_kernels[0].queue_time += (tsubmit - tqueue);
  OP_kernels[0].wait_time += (tstart - tsubmit);
  OP_kernels[0].execution_time += (tend - tstart);
#endif

  // the event is no longer needed once the kernel has completed
  clReleaseEvent(event);

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  OP_kernels[0].name = userSubroutine;
  OP_kernels[0].count += 1;
  OP_kernels[0].time += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * opDat1.size;
  OP_kernels[0].transfer += (float)set->size * opDat2.size;
}
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4) { double *arg4h = (double *)arg4.data; int nargs = 5; op_arg args[5]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(4); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[4].name = name; OP_kernels[4].count += 1; if (OP_diags > 2) { printf(" kernel routine w/o indirection: update"); } op_mpi_halo_exchanges_cuda(set, nargs, args); double arg4_l = arg4h[0]; if (set->size > 0) { // Set up typed device pointers for OpenACC double *data0 = (double *)arg0.data_d; double *data1 = (double *)arg1.data_d; double *data2 = (double *)arg2.data_d; double *data3 = (double *)arg3.data_d; #pragma acc parallel loop independent deviceptr(data0, data1, data2, \ data3) reduction(+ : arg4_l) for (int n = 0; n < set->size; n++) { update(&data0[4 * n], &data1[4 * n], &data2[4 * n], &data3[1 * n], &arg4_l); } } // combine reduction data arg4h[0] = arg4_l; op_mpi_reduce_double(&arg4, arg4h); op_mpi_set_dirtybit_cuda(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[4].time += wall_t2 - wall_t1; OP_kernels[4].transfer += (float)set->size * arg0.size; OP_kernels[4].transfer += (float)set->size * arg1.size; OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f; OP_kernels[4].transfer += (float)set->size * arg3.size; }
// OpenMP host stub for the direct kernel save_soln (double variant).
// Splits the set into one contiguous slice per thread and delegates each
// slice to op_x86_save_soln.
// Fixes: removed the unused local `ninds`; corrected a comment typo.
void op_par_loop_save_soln(char const *name, op_set set, op_arg arg0,
                           op_arg arg1) {
  int nargs = 2;
  op_arg args[2] = {arg0, arg1};

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: save_soln \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // execute plan: each thread processes one contiguous slice of the set
#pragma omp parallel for
  for (int thr = 0; thr < nthreads; thr++) {
    int start = (set->size * thr) / nthreads;
    int finish = (set->size * (thr + 1)) / nthreads;
    op_x86_save_soln((double *)arg0.data, (double *)arg1.data, start, finish);
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  // - NONE

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (double)set->size * arg0.size;
  OP_kernels[0].transfer += (double)set->size * arg1.size;
}
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4){ int nargs = 5; op_arg args[5]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine w/o indirection: update"); } int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { for ( int n=0; n<set_size; n++ ){ update( &((float*)arg0.data)[1*n], &((float*)arg1.data)[1*n], &((float*)arg2.data)[1*n], (float*)arg3.data, (float*)arg4.data); } } // combine reduction data op_mpi_reduce_float(&arg3,(float*)arg3.data); op_mpi_reduce_float(&arg4,(float*)arg4.data); op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].name = name; OP_kernels[1].count += 1; OP_kernels[1].time += wall_t2 - wall_t1; OP_kernels[1].transfer += (float)set->size * arg0.size; OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f; OP_kernels[1].transfer += (float)set->size * arg2.size * 2.0f; }
// host stub function void op_par_loop_save_soln_cpu(char const *name, op_set set, op_arg arg0, op_arg arg1){ int nargs = 2; op_arg args[2]; args[0] = arg0; args[1] = arg1; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(0); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine w/o indirection: save_soln"); } op_mpi_halo_exchanges(set, nargs, args); // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads(); #else int nthreads = 1; #endif if (set->size >0) { // execute plan #pragma omp parallel for for ( int thr=0; thr<nthreads; thr++ ){ int start = (set->size* thr)/nthreads; int finish = (set->size*(thr+1))/nthreads; for ( int n=start; n<finish; n++ ){ save_soln( &((double*)arg0.data)[4*n], &((double*)arg1.data)[4*n]); } } } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].name = name; OP_kernels[0].count += 1; OP_kernels[0].time += wall_t2 - wall_t1; OP_kernels[0].transfer += (float)set->size * arg0.size; OP_kernels[0].transfer += (float)set->size * arg1.size; }
// OpenMP host stub for the direct kernel save_soln (float variant):
// delegates one contiguous slice of the set to op_x86_save_soln per thread.
void op_par_loop_save_soln(char const *name, op_set set, op_arg arg0,
                           op_arg arg1) {
  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: save_soln \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // execute plan
#pragma omp parallel for
  for (int tid = 0; tid < nthreads; tid++) {
    int lo = (set->size * tid) / nthreads;
    int hi = (set->size * (tid + 1)) / nthreads;
    op_x86_save_soln((float *)arg0.data, (float *)arg1.data, lo, hi);
  }

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg1.size;
}
// OpenMP host stub for the indirect kernel res_calc: builds/retrieves a
// coloured execution plan and runs one parallel region per colour so that
// no two blocks of the same colour conflict.
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4,
                          op_arg arg5, op_arg arg6, op_arg arg7 ) {

  int nargs = 8;
  op_arg args[8];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;
  args[6] = arg6;
  args[7] = arg7;

  // indirection descriptor: argument pairs (0,1), (2,3), (4,5), (6,7)
  // each reference one of 4 indirectly accessed datasets
  int ninds = 4;
  int inds[8] = {0,0,1,1,2,2,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_2
    int part_size = OP_PART_SIZE_2;
  #else
    int part_size = OP_part_size;
  #endif

  // start halo exchanges; returns the number of elements to execute
  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan: one parallel sweep per colour; halo data must have
    // arrived before the first non-core colour
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data,
          (double *)arg2.data,
          (double *)arg4.data,
          (double *)arg6.data,
          Plan->ind_map,
          Plan->loc_map,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);
      block_offset += nblocks;
    }
    op_timing_realloc(2);
    OP_kernels[2].transfer += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(2);
  OP_kernels[2].name = name;
  OP_kernels[2].count += 1;
  OP_kernels[2].time += wall_t2 - wall_t1;
}
// host stub function
// MPI sequential host stub for the indirect kernel res_calc with vector
// arguments: each of arg0/arg4/arg9 is expanded into one op_arg per map
// component before execution.
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg4, op_arg arg8, op_arg arg9){

  int nargs = 13;
  op_arg args[13];

  // expand the vector arguments: slots 1-3, 5-7 and 10-12 carry the
  // remaining map components of arg0, arg4 and arg9 respectively
  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // start halo exchanges; returns the number of elements to execute
  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size >0) {

    for ( int n=0; n<set_size; n++ ){
      // halo data is needed from the first non-core element onwards
      if (n==set->core_size) {
        op_mpi_wait_all(nargs, args);
      }
      int map0idx = arg0.map_data[n * arg0.map->dim + 0];
      int map1idx = arg0.map_data[n * arg0.map->dim + 1];
      int map2idx = arg0.map_data[n * arg0.map->dim + 2];
      int map3idx = arg0.map_data[n * arg0.map->dim + 3];

      // gather pointers to the four mapped entries of each indirect dat
      const double* arg0_vec[] = {
        &((double*)arg0.data)[2 * map0idx],
        &((double*)arg0.data)[2 * map1idx],
        &((double*)arg0.data)[2 * map2idx],
        &((double*)arg0.data)[2 * map3idx]};
      const double* arg4_vec[] = {
        &((double*)arg4.data)[1 * map0idx],
        &((double*)arg4.data)[1 * map1idx],
        &((double*)arg4.data)[1 * map2idx],
        &((double*)arg4.data)[1 * map3idx]};
      double* arg9_vec[] = {
        &((double*)arg9.data)[1 * map0idx],
        &((double*)arg9.data)[1 * map1idx],
        &((double*)arg9.data)[1 * map2idx],
        &((double*)arg9.data)[1 * map3idx]};

      res_calc(
        arg0_vec,
        arg4_vec,
        &((double*)arg8.data)[16 * n],
        arg9_vec);
    }
  }

  // complete the exchanges if the loop never reached the non-core region
  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg4.size;
  OP_kernels[0].transfer += (float)set->size * arg9.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg8.size;
  OP_kernels[0].transfer += (float)set->size * arg0.map->dim * 4.0f;
}
// OpenMP host stub for the indirect kernel res_calc with vector arguments,
// executed through a coloured execution plan.
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg4, op_arg arg8, op_arg arg9 ){

  int nargs = 13;
  op_arg args[13];

  // expand the vector arguments: slots 1-3, 5-7 and 10-12 carry the
  // remaining map components of arg0, arg4 and arg9 respectively
  arg0.idx = 0;
  args[0] = arg0;
  for (int v = 1; v < 4; v++) {
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for (int v = 1; v < 4; v++) {
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for (int v = 1; v < 4; v++) {
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  // indirection descriptor: -1 marks the direct argument (slot 8)
  int ninds = 3;
  int inds[13] = {0,0,0,0,1,1,1,1,-1,2,2,2,2};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan: one parallel sweep per colour; halo data must have
    // arrived before the first non-core colour
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data,
          (double *)arg4.data,
          (double *)arg9.data,
          Plan->ind_map,
          Plan->loc_map,
          (double *)arg8.data,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);
      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time += wall_t2 - wall_t1;
}
// host stub function
// Vectorised host stub for the direct kernel update with a global sum
// reduction into arg4. When VECTORIZE is defined the main loop runs in
// SIMD_VEC-wide chunks with per-lane partials; otherwise a plain scalar
// loop is used (the shared loop body below is spliced by the #ifdef).
void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1,
                        op_arg arg2, op_arg arg3, op_arg arg4) {

  int nargs = 5;
  op_arg args[5];
  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  // create aligned pointers for dats
  ALIGNED_double const double *__restrict__ ptr0 = (double *)arg0.data;
  __assume_aligned(ptr0, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr1 = (double *)arg1.data;
  __assume_aligned(ptr1, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr2 = (double *)arg2.data;
  __assume_aligned(ptr2, double_ALIGN);
  ALIGNED_double const double *__restrict__ ptr3 = (double *)arg3.data;
  __assume_aligned(ptr3, double_ALIGN);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(4);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: update");
  }

  int exec_size = op_mpi_halo_exchanges(set, nargs, args);

  if (exec_size > 0) {

#ifdef VECTORIZE
#pragma novector
    // SIMD main loop: accumulate per-lane partials in dat4, then fold
    // them into the global value after each chunk
    for (int n = 0; n < (exec_size / SIMD_VEC) * SIMD_VEC; n += SIMD_VEC) {
      double dat4[SIMD_VEC] = {0.0};
#pragma simd
      for (int i = 0; i < SIMD_VEC; i++) {
        update(&(ptr0)[4 * (n + i)], &(ptr1)[4 * (n + i)],
               &(ptr2)[4 * (n + i)], &(ptr3)[1 * (n + i)], &dat4[i]);
      }
      for (int i = 0; i < SIMD_VEC; i++) {
        *(double *)arg4.data += dat4[i];
      }
    }
    // remainder
    for (int n = (exec_size / SIMD_VEC) * SIMD_VEC; n < exec_size; n++) {
#else
    for (int n = 0; n < exec_size; n++) {
#endif
      update(&(ptr0)[4 * n], &(ptr1)[4 * n], &(ptr2)[4 * n], &(ptr3)[1 * n],
             (double *)arg4.data);
    }
  }

  // combine reduction data
  op_mpi_reduce(&arg4, (double *)arg4.data);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[4].name = name;
  OP_kernels[4].count += 1;
  OP_kernels[4].time += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (float)set->size * arg0.size;
  OP_kernels[4].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg3.size;
}
// host stub function
// Offload (omp4-style) host stub for the indirect kernel res_calc with
// vector and optional arguments; uses SoA stride constants and a 2-level
// coloured execution plan (OP_COLOR2).
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg4, op_arg arg8, op_arg arg9,
                          op_arg arg13){

  int nargs = 17;
  op_arg args[17];

  // expand the vector arguments into one op_arg per map component;
  // arg9/arg13 are optional (their opt flag is propagated per component)
  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }

  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }

  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
  }

  arg13.idx = 0;
  args[13] = arg13;
  for ( int v=1; v<4; v++ ){
    args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;

  // indirection descriptor: -1 marks the direct argument (slot 8)
  int ninds = 4;
  int inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif
  #ifdef OP_BLOCK_SIZE_0
    int nthread = OP_BLOCK_SIZE_0;
  #else
    int nthread = OP_block_size;
  #endif

  int ncolors = 0;
  int set_size1 = set->size + set->exec_size;

  if (set->size >0) {

    // refresh the SoA stride constants on the first launch or whenever
    // the relevant set size has changed since the previous launch
    if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
      opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
      opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
    }
    if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
      direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
      direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
    }

    //Set up typed device pointers for OpenMP
    int *map0 = arg0.map_data_d;
    int map0size = arg0.map->dim * set_size1;

    // optional dats contribute size 0 when disabled (opt == 0)
    double* data8 = (double*)arg8.data_d;
    int dat8size = (arg8.opt?1:0) * getSetSizeFromOpArg(&arg8) * arg8.dat->dim;
    double *data0 = (double *)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;
    double *data4 = (double *)arg4.data_d;
    int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
    double *data9 = (double *)arg9.data_d;
    int dat9size = (arg9.opt ? 1 : 0) * getSetSizeFromOpArg(&arg9) * arg9.dat->dim;
    double *data13 = (double *)arg13.data_d;
    int dat13size = (arg13.opt ? 1 : 0) * getSetSizeFromOpArg(&arg13) * arg13.dat->dim;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;

    // execute plan: one kernel launch per colour; halo data must have
    // arrived before the second colour starts
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col+1];

      res_calc_omp4_kernel(
        map0,
        map0size,
        data8,
        dat8size,
        data0,
        dat0size,
        data4,
        dat4size,
        data9,
        dat9size,
        data13,
        dat13size,
        col_reord,
        set_size1,
        start,
        end,
        part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread,
        nthread,
        opDat0_res_calc_stride_OP2CONSTANT,
        direct_res_calc_stride_OP2CONSTANT);
    }
    OP_kernels[0].transfer += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // complete any halo exchanges that were never waited on above
  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);
  if (OP_diags>1) deviceSync();

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}
// OpenMP host stub for the direct kernel dotR with a global sum
// reduction into arg1: each thread accumulates into its own padded slot,
// the slots are summed afterwards.
void op_par_loop_dotR(char const *name, op_set set, op_arg arg0, op_arg arg1) {
  double *arg1h = (double *)arg1.data;
  int nargs = 2;
  op_arg args[2] = {arg0, arg1};

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: dotR\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1 = 0, wall_t2 = 0;
  op_timing_realloc(6);
  OP_kernels[6].name = name;
  OP_kernels[6].count += 1;

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // per-thread partial sums, one 64-double-spaced slot per thread
  double part[1 + 64 * 64];
  for (int thr = 0; thr < nthreads; thr++)
    for (int d = 0; d < 1; d++)
      part[d + thr * 64] = ZERO_double;

  if (set->size > 0) {
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan: one contiguous slice of the set per thread
#pragma omp parallel for
    for (int thr = 0; thr < nthreads; thr++) {
      int start = (set->size * thr) / nthreads;
      int finish = (set->size * (thr + 1)) / nthreads;
      op_x86_dotR((double *)arg0.data, part + thr * 64, start, finish);
    }
  }

  // combine reduction data
  for (int thr = 0; thr < nthreads; thr++)
    for (int d = 0; d < 1; d++)
      arg1h[d] += part[d + thr * 64];

  op_mpi_reduce(&arg1, arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[6].time += wall_t2 - wall_t1;
  OP_kernels[6].transfer += (float)set->size * arg0.size;
}
// OpenMP host stub for the indirect kernel adt_calc (plan-based, older
// per-argument halo-exchange API).
void op_par_loop_adt_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4,
                          op_arg arg5) {
  int nargs = 6;
  op_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5};
  int ninds = 1;
  int inds[6] = {0, 0, 0, 0, -1, -1};
  int sent[6] = {0, 0, 0, 0, 0, 0};

  // indirect loop: exchange halos for every dat argument before executing
  if (ninds > 0) {
    for (int i = 0; i < nargs; i++) {
      if (args[i].argtype == OP_ARG_DAT) {
        if (OP_diags == 1)
          reset_halo(args[i]);
        sent[0] = exchange_halo(args[i]);
        if (sent[0] == 1)
          wait_all(args[i]);
      }
    }
  }

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: adt_calc \n");
  }

  // get plan
#ifdef OP_PART_SIZE_1
  int part_size = OP_PART_SIZE_1;
#else
  int part_size = OP_part_size;
#endif
  op_plan *Plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
#ifdef _OPENMP
  int nthreads = omp_get_max_threads();
#else
  int nthreads = 1;
#endif

  // execute plan: one parallel sweep over the blocks of each colour
  int block_offset = 0;
  for (int col = 0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
    for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
      op_x86_adt_calc(blockIdx, (double *)arg0.data, Plan->ind_maps[0],
                      Plan->loc_maps[0], Plan->loc_maps[1], Plan->loc_maps[2],
                      Plan->loc_maps[3], (double *)arg4.data,
                      (double *)arg5.data, Plan->ind_sizes, Plan->ind_offs,
                      block_offset, Plan->blkmap, Plan->offset, Plan->nelems,
                      Plan->nthrcol, Plan->thrcol);
    block_offset += nblocks;
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  // perform any global operations
  // - NONE

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name = name;
  OP_kernels[1].count += 1;
  OP_kernels[1].time += wall_t2 - wall_t1;
  OP_kernels[1].transfer += Plan->transfer;
  OP_kernels[1].transfer2 += Plan->transfer2;
}
// host stub function
// OpenACC host stub for the indirect kernel adt_calc, executed through a
// 2-level coloured execution plan (OP_COLOR2).
void op_par_loop_adt_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4,
                          op_arg arg5) {

  int nargs = 6;
  op_arg args[6];
  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[1].name = name;
  OP_kernels[1].count += 1;

  // indirection descriptor: args 0-3 all go through map 0; -1 = direct
  int ninds = 1;
  int inds[6] = {0, 0, 0, 0, -1, -1};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  // get plan
#ifdef OP_PART_SIZE_1
  int part_size = OP_PART_SIZE_1;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);

  int ncolors = 0;
  if (set->size > 0) {

    // Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;

    float *data4 = (float *)arg4.data_d;
    float *data5 = (float *)arg5.data_d;
    float *data0 = (float *)arg0.data_d;

    op_plan *Plan = op_plan_get_stage(name, set, part_size, nargs, args,
                                      ninds, inds, OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    int set_size1 = set->size + set->exec_size;

    // execute plan: one parallel region per colour; halo data must have
    // arrived before the second colour starts
    for (int col = 0; col < Plan->ncolors; col++) {
      if (col == 1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end = Plan->col_offsets[0][col + 1];
#pragma acc parallel loop independent deviceptr(col_reord, map0, data4, data5, \
                                                data0)
      for (int e = start; e < end; e++) {
        int n = col_reord[e];
        // the four mapped indices for element n (map stored SoA, one
        // component every set_size1 entries)
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map0[n + set_size1 * 2];
        int map3idx = map0[n + set_size1 * 3];

        adt_calc(&data0[2 * map0idx], &data0[2 * map1idx], &data0[2 * map2idx],
                 &data0[2 * map3idx], &data4[4 * n], &data5[1 * n]);
      }
    }
    OP_kernels[1].transfer += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  // complete any halo exchanges that were never waited on above
  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].time += wall_t2 - wall_t1;
}
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3){ double*arg3h = (double *)arg3.data; int nargs = 4; op_arg args[4]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(8); op_timers_core(&cpu_t1, &wall_t1); OP_kernels[8].name = name; OP_kernels[8].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: update"); } op_mpi_halo_exchanges_cuda(set, nargs, args); #ifdef OP_PART_SIZE_8 int part_size = OP_PART_SIZE_8; #else int part_size = OP_part_size; #endif #ifdef OP_BLOCK_SIZE_8 int nthread = OP_BLOCK_SIZE_8; #else int nthread = OP_block_size; #endif double arg3_l = arg3h[0]; if (set->size >0) { //Set up typed device pointers for OpenMP double* data0 = (double*)arg0.data_d; int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim; double* data1 = (double*)arg1.data_d; int dat1size = getSetSizeFromOpArg(&arg1) * arg1.dat->dim; double* data2 = (double*)arg2.data_d; int dat2size = getSetSizeFromOpArg(&arg2) * arg2.dat->dim; update_omp4_kernel( data0, dat0size, data1, dat1size, data2, dat2size, &arg3_l, set->size, part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread, nthread); } // combine reduction data arg3h[0] = arg3_l; op_mpi_reduce_double(&arg3,arg3h); op_mpi_set_dirtybit_cuda(nargs, args); if (OP_diags>1) deviceSync(); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[8].time += wall_t2 - wall_t1; OP_kernels[8].transfer += (float)set->size * arg0.size * 2.0f; OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f; OP_kernels[8].transfer += (float)set->size * arg2.size; }
// OpenMP host stub for the indirect kernel res_calc with a global integer
// reduction into arg1, executed through a coloured execution plan.
void op_par_loop_res_calc(char const *name, op_set set, op_arg arg0,
                          op_arg arg1 ){

  int *arg1h = (int *)arg1.data;

  int nargs = 2;
  op_arg args[2];
  args[0] = arg0;
  args[1] = arg1;

  // indirection descriptor: arg0 is indirect (dataset 0), arg1 is global
  int ninds = 1;
  int inds[2] = {0,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers (zeroed so the record stays sane when size == 0)
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads( );
  #else
    int nthreads = 1;
  #endif

  // allocate and initialise arrays for global reduction
  // (per-thread slots spaced 64 ints apart)
  int arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg1_l[d+thr*64]=ZERO_int;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan: one parallel sweep per colour; halo data must have
    // arrived before the first non-core colour
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data,
          Plan->ind_map,
          Plan->loc_map,
          &arg1_l[64*omp_get_thread_num()],
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);

      // combine reduction data once all owned elements have executed
      if (col == Plan->ncolors_owned-1) {
        for (int thr=0; thr<nthreads; thr++)
          for(int d=0; d<1; d++)
            arg1h[d] += arg1_l[d+thr*64];
      }
      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_reduce(&arg1,arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}
// OpenMP host stub for the indirect kernel res (float variant), executed
// through a coloured execution plan.
// Fix: removed the unused local `arg3h` (the raw arg3.data pointer is
// passed to op_x86_res directly).
void op_par_loop_res(char const *name, op_set set, op_arg arg0, op_arg arg1,
                     op_arg arg2, op_arg arg3) {
  int nargs = 4;
  op_arg args[4] = {arg0, arg1, arg2, arg3};

  // indirection descriptor: arg1/arg2 are indirect (datasets 0 and 1),
  // arg0/arg3 are not (-1)
  int ninds = 2;
  int inds[4] = {-1, 0, 1, -1};

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan
#ifdef OP_PART_SIZE_0
  int part_size = OP_PART_SIZE_0;
#else
  int part_size = OP_part_size;
#endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size > 0) {
    op_plan *Plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

    // execute plan: one parallel sweep per colour; wait for halo data
    // before the first non-core colour
    int block_offset = 0;
    for (int col = 0; col < Plan->ncolors; col++) {
      if (col == Plan->ncolors_core)
        op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
#pragma omp parallel for
      for (int blockIdx = 0; blockIdx < nblocks; blockIdx++)
        op_x86_res(blockIdx, (float *)arg1.data, (float *)arg2.data,
                   Plan->ind_map, Plan->loc_map, (float *)arg0.data,
                   (float *)arg3.data, Plan->ind_sizes, Plan->ind_offs,
                   block_offset, Plan->blkmap, Plan->offset, Plan->nelems,
                   Plan->nthrcol, Plan->thrcol, set_size);
      block_offset += nblocks;
    }
    op_timing_realloc(0);
    OP_kernels[0].transfer += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time += wall_t2 - wall_t1;
}
// OP2 host stub: direct (no-indirection) kernel "update" with one globally
// reduced OP_INC argument (arg4).  The set is sliced into contiguous ranges,
// one per OpenMP thread.
// Fix: removed the unused local `ninds` (no indirection in a direct loop;
// the variable was never read).
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4 ){

  int nargs = 5;
  op_arg args[5] = {arg0,arg1,arg2,arg3,arg4};

  // host-side pointer to the global reduction result
  double *arg4h = (double *)arg4.data;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: update \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads( );
  #else
    int nthreads = 1;
  #endif

  // allocate and initialise arrays for global reduction:
  // one slot per thread, padded to 64 doubles to avoid false sharing
  double arg4_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++)
      arg4_l[d+thr*64]=ZERO_double;

  // execute plan: each thread processes a contiguous slice of the set
  #pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    int start  = (set->size* thr   )/nthreads;
    int finish = (set->size*(thr+1))/nthreads;
    op_x86_update( (double *) arg0.data,
                   (double *) arg1.data,
                   (double *) arg2.data,
                   (double *) arg3.data,
                   arg4_l + thr*64,
                   start, finish );
  }

  // combine reduction data
  for (int thr=0; thr<nthreads; thr++)
    for(int d=0; d<1; d++)
      arg4h[d] += arg4_l[d+thr*64];

  //set dirty bit on direct/indirect datasets with access OP_INC,OP_WRITE, OP_RW
  for(int i = 0; i<nargs; i++)
    if(args[i].argtype == OP_ARG_DAT)
      set_dirtybit(args[i]);

  //performe any global operations
  for(int i = 0; i<nargs; i++)
    if(args[i].argtype == OP_ARG_GBL)
      global_reduce(&args[i]);

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(4);
  OP_kernels[4].name  = name;
  OP_kernels[4].count += 1;
  OP_kernels[4].time  += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (double)set->size * arg0.size;
  OP_kernels[4].transfer += (double)set->size * arg1.size;
  OP_kernels[4].transfer += (double)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (double)set->size * arg3.size;
}
// host stub function
// OP2 host stub: indirect kernel "adt_calc" offloaded via OMP4 target code
// (adt_calc_omp4_kernel), using an OP_COLOR2 staged execution plan and
// device-resident (_d) data/map pointers.
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5){

  int nargs = 6;
  op_arg args[6];
  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(1);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[1].name  = name;
  OP_kernels[1].count += 1;

  int ninds = 1;
  int inds[6] = {0,0,0,0,-1,-1};  // arg0..3 share indirection 0; arg4/5 direct

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc\n");
  }

  // get plan
  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);
  #ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
  #else
    int part_size = OP_part_size;
  #endif
  #ifdef OP_BLOCK_SIZE_1
    int nthread = OP_BLOCK_SIZE_1;
  #else
    int nthread = OP_block_size;
  #endif

  int ncolors = 0;
  // owned + exec-halo elements: stride used for SoA map indexing on device
  int set_size1 = set->size + set->exec_size;

  if (set->size >0) {

    //Set up typed device pointers for OpenMP
    int *map0 = arg0.map_data_d;
    int map0size = arg0.map->dim * set_size1;

    float* data4 = (float*)arg4.data_d;
    int dat4size = getSetSizeFromOpArg(&arg4) * arg4.dat->dim;
    float* data5 = (float*)arg5.data_d;
    int dat5size = getSetSizeFromOpArg(&arg5) * arg5.dat->dim;
    float *data0 = (float *)arg0.data_d;
    int dat0size = getSetSizeFromOpArg(&arg0) * arg0.dat->dim;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;

    // execute plan, one color at a time
    for ( int col=0; col<Plan->ncolors; col++ ){
      // after the first color, halo-exchanged data is needed
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end   = Plan->col_offsets[0][col+1];
      // last argument pair = (number of teams, threads per team);
      // NOTE(review): the else-branch of the team count lacks the usual "+1"
      // rounding ((end-start-1)/nthread vs /nthread+1) — confirm against the
      // OP2 code generator that this is intended.
      adt_calc_omp4_kernel(map0, map0size, data4, dat4size, data5, dat5size,
        data0, dat0size, col_reord, set_size1, start, end,
        part_size != 0 ? (end - start - 1) / part_size + 1
                       : (end - start - 1) / nthread,
        nthread);
    }
    OP_kernels[1].transfer  += Plan->transfer;
    OP_kernels[1].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  if (OP_diags>1) deviceSync();
  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[1].time += wall_t2 - wall_t1;
}
// OP2 host stub: indirect kernel "SpaceDiscretization" with a 2-wide vector
// argument (arg6 expands to args[6] and args[7]).  Executes through an OP2
// colored plan with OpenMP parallelism over blocks; kernel record index 18.
void op_par_loop_SpaceDiscretization(char const *name, op_set set,
  op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3,
  op_arg arg4, op_arg arg5, op_arg arg6 ){

  int nargs = 8;  // 7 user args; arg6 is a vec-arg expanded to 2 slots
  op_arg args[8];
  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;
  // expand the vector argument: one op_arg per map index 0..1
  arg6.idx = 0;
  args[6] = arg6;
  for (int v = 1; v < 2; v++) {
    args[6 + v] = op_arg_dat(arg6.dat, v, arg6.map, 1, "float", OP_READ);
  }

  int ninds = 2;
  int inds[8] = {0,0,-1,-1,-1,-1,1,1};  // arg0/1 -> ind 0, args 6/7 -> ind 1

  if (OP_diags>2) {
    printf(" kernel routine with indirection: SpaceDiscretization\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_18
    int part_size = OP_PART_SIZE_18;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers (wall_t1/2 zeroed so the time update below is safe
  // even when set->size == 0 and the timed region never runs)
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(18);
  OP_kernels[18].name  = name;
  OP_kernels[18].count += 1;

  if (set->size >0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan, one color at a time
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      // past the core colors, halo-exchanged data is needed
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);
      int nblocks = Plan->ncolblk[col];
      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_SpaceDiscretization( blockIdx,
          (float *)arg0.data,
          (float *)arg6.data,
          Plan->ind_map,
          Plan->loc_map,
          (float *)arg2.data,
          (float *)arg3.data,
          (float *)arg4.data,
          (int *)arg5.data,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);

      block_offset += nblocks;
    }
    op_timing_realloc(18);
    OP_kernels[18].transfer  += Plan->transfer;
    OP_kernels[18].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[18].time += wall_t2 - wall_t1;
}
// OpenCL host stub for the indirect kernel "bres_calc": builds an OP2
// execution plan, uploads constants, and launches one NDRange per plan color.
// Fixes: (1) op_timing_realloc was called with index 0 while the record
// updated is OP_kernels[1] — realloc index corrected to 1 so the record is
// guaranteed to exist; (2) the measured wall-clock interval was thrown away —
// it is now accumulated into OP_kernels[1].time like the other host stubs.
void bres_calc_host(const char *userSubroutine,op_set set,op_arg opDat1,op_arg opDat2,op_arg opDat3,op_arg opDat4,op_arg opDat5,op_arg opDat6)
{
  size_t blocksPerGrid;
  size_t threadsPerBlock;
  size_t totalThreadNumber;
  size_t dynamicSharedMemorySize;
  cl_int errorCode;
  cl_event event;
  cl_kernel kernelPointer;
  int i3;
  op_arg opDatArray[6];
  int indirectionDescriptorArray[6];
  op_plan *planRet;
  int blockOffset;

  opDatArray[0] = opDat1;
  opDatArray[1] = opDat2;
  opDatArray[2] = opDat3;
  opDatArray[3] = opDat4;
  opDatArray[4] = opDat5;
  opDatArray[5] = opDat6;

  // opDat1/2 share indirection 0; opDat3..5 use 1..3; opDat6 is direct
  indirectionDescriptorArray[0] = 0;
  indirectionDescriptorArray[1] = 0;
  indirectionDescriptorArray[2] = 1;
  indirectionDescriptorArray[3] = 2;
  indirectionDescriptorArray[4] = 3;
  indirectionDescriptorArray[5] = -1;

  planRet = op_plan_get(userSubroutine,set,setPartitionSize_bres_calc,6,opDatArray,4,indirectionDescriptorArray);

  // upload read-only simulation constants to device memory
  cl_mem gm1_d;
  gm1_d = op_allocate_constant(&gm1,sizeof(float ));
  cl_mem qinf_d;
  qinf_d = op_allocate_constant(&qinf,4 * sizeof(float));
  cl_mem eps_d;
  eps_d = op_allocate_constant(&eps,sizeof(float ));

  blockOffset = 0;

  double cpu_t1;
  double cpu_t2;
  double wall_t1;
  op_timers(&cpu_t1, &wall_t1);
  double wall_t2;

  // launch one NDRange kernel per plan color
  for (i3 = 0; i3 < planRet -> ncolors; ++i3) {
    blocksPerGrid = planRet -> ncolblk[i3];
    dynamicSharedMemorySize = planRet -> nshared;
    threadsPerBlock = threadsPerBlockSize_bres_calc;
    totalThreadNumber = threadsPerBlock * blocksPerGrid;
    kernelPointer = getKernel("bres_calc_kernel");

    // data arrays
    errorCode = clSetKernelArg(kernelPointer,0,sizeof(cl_mem ),&opDat1.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,1,sizeof(cl_mem ),&opDat3.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,2,sizeof(cl_mem ),&opDat4.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,3,sizeof(cl_mem ),&opDat5.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,4,sizeof(cl_mem ),&opDat6.data_d);
    // plan indirection maps
    errorCode = errorCode | clSetKernelArg(kernelPointer,5,sizeof(cl_mem ),&planRet -> ind_maps[0]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,6,sizeof(cl_mem ),&planRet -> ind_maps[1]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,7,sizeof(cl_mem ),&planRet -> ind_maps[2]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,8,sizeof(cl_mem ),&planRet -> ind_maps[3]);
    // plan local maps
    errorCode = errorCode | clSetKernelArg(kernelPointer,9,sizeof(cl_mem ),&planRet -> loc_maps[0]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,10,sizeof(cl_mem ),&planRet -> loc_maps[1]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,11,sizeof(cl_mem ),&planRet -> loc_maps[2]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,12,sizeof(cl_mem ),&planRet -> loc_maps[3]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,13,sizeof(cl_mem ),&planRet -> loc_maps[4]);
    // plan bookkeeping arrays
    errorCode = errorCode | clSetKernelArg(kernelPointer,14,sizeof(cl_mem ),&planRet -> ind_sizes);
    errorCode = errorCode | clSetKernelArg(kernelPointer,15,sizeof(cl_mem ),&planRet -> ind_offs);
    errorCode = errorCode | clSetKernelArg(kernelPointer,16,sizeof(cl_mem ),&planRet -> blkmap);
    errorCode = errorCode | clSetKernelArg(kernelPointer,17,sizeof(cl_mem ),&planRet -> offset);
    errorCode = errorCode | clSetKernelArg(kernelPointer,18,sizeof(cl_mem ),&planRet -> nelems);
    errorCode = errorCode | clSetKernelArg(kernelPointer,19,sizeof(cl_mem ),&planRet -> nthrcol);
    errorCode = errorCode | clSetKernelArg(kernelPointer,20,sizeof(cl_mem ),&planRet -> thrcol);
    errorCode = errorCode | clSetKernelArg(kernelPointer,21,sizeof(int ),&blockOffset);
    // dynamic local (shared) memory for staging indirect data
    errorCode = errorCode | clSetKernelArg(kernelPointer,22,dynamicSharedMemorySize,NULL);
    // device-resident constants
    errorCode = errorCode | clSetKernelArg(kernelPointer,23,sizeof(cl_mem ),&gm1_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,24,sizeof(cl_mem ),&qinf_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,25,sizeof(cl_mem ),&eps_d);
    assert_m(errorCode == CL_SUCCESS,"Error setting OpenCL kernel arguments");

    errorCode = clEnqueueNDRangeKernel(cqCommandQueue,kernelPointer,1,NULL,&totalThreadNumber,&threadsPerBlock,0,NULL,&event);
    assert_m(errorCode == CL_SUCCESS,"Error executing OpenCL kernel");

    errorCode = clFinish(cqCommandQueue);
    assert_m(errorCode == CL_SUCCESS,"Error completing device command queue");

    blockOffset += blocksPerGrid;
  }

  // update kernel record (index 1 — must match the realloc'd slot)
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name  = userSubroutine;
  OP_kernels[1].count = OP_kernels[1].count + 1;
  OP_kernels[1].time += wall_t2 - wall_t1;
}
// OP2 host stub: indirect kernel "res_calc" (8 args, 4 indirections),
// executed through an OP2 colored plan with OpenMP parallelism over blocks.
// Fix: removed the unused `nthreads` local and its #ifdef _OPENMP block —
// this stub has no global reduction, so the thread count was never read.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3,
  op_arg arg4, op_arg arg5, op_arg arg6, op_arg arg7 ){

  int nargs = 8;
  op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7};

  int ninds = 4;
  int inds[8] = {0,0,1,1,2,2,3,3};  // argument pairs share indirections 0..3

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc \n");
  }

  // get plan
  #ifdef OP_PART_SIZE_2
    int part_size = OP_PART_SIZE_2;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // execute plan, one color at a time
  int block_offset = 0;
  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];
    #pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res_calc( blockIdx,
        (double *)arg0.data,
        Plan->ind_maps[0],
        (double *)arg2.data,
        Plan->ind_maps[1],
        (double *)arg4.data,
        Plan->ind_maps[2],
        (double *)arg6.data,
        Plan->ind_maps[3],
        Plan->loc_maps[0],
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        Plan->loc_maps[3],
        Plan->loc_maps[4],
        Plan->loc_maps[5],
        Plan->loc_maps[6],
        Plan->loc_maps[7],
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);

    block_offset += nblocks;
  }

  // combine reduction data
  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(2);
  OP_kernels[2].name      = name;
  OP_kernels[2].count    += 1;
  OP_kernels[2].time     += wall_t2 - wall_t1;
  OP_kernels[2].transfer  += Plan->transfer;
  OP_kernels[2].transfer2 += Plan->transfer2;
}
// host stub function void op_par_loop_res(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3){ int nargs = 4; op_arg args[4]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(0); op_timers_core(&cpu_t1, &wall_t1); int ninds = 2; int inds[4] = {-1,0,1,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: res\n"); } // get plan #ifdef OP_PART_SIZE_0 int part_size = OP_PART_SIZE_0; #else int part_size = OP_part_size; #endif int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { op_plan *Plan = op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0); // execute plan int block_offset = 0; for ( int col=0; col<Plan->ncolors; col++ ){ if (col==Plan->ncolors_core) { op_mpi_wait_all(nargs, args); } int nblocks = Plan->ncolblk[col]; #pragma omp parallel for for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){ int blockId = Plan->blkmap[blockIdx + block_offset]; int nelem = Plan->nelems[blockId]; int offset_b = Plan->offset[blockId]; for ( int n=offset_b; n<offset_b+nelem; n++ ){ int map1idx = arg1.map_data[n * arg1.map->dim + 1]; int map2idx = arg1.map_data[n * arg1.map->dim + 0]; res( &((float*)arg0.data)[1 * n], &((float*)arg1.data)[1 * map1idx], &((float*)arg2.data)[1 * map2idx], (float*)arg3.data); } } block_offset += nblocks; } OP_kernels[0].transfer += Plan->transfer; OP_kernels[0].transfer2 += Plan->transfer2; } if (set_size == 0 || set_size == set->core_size) { op_mpi_wait_all(nargs, args); } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].name = name; OP_kernels[0].count += 1; OP_kernels[0].time += wall_t2 - wall_t1; }
// OP2 host stub: direct (no-indirection) kernel "EvolveValuesRK2_1".
// The set is split into contiguous slices, one per OpenMP thread, and each
// slice is handed to op_x86_EvolveValuesRK2_1.
void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set,
  op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4 ){

  int nargs = 5;
  op_arg args[5] = {arg0, arg1, arg2, arg3, arg4};

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: EvolveValuesRK2_1\n");
  }

  op_mpi_halo_exchanges(set, nargs, args);

  // timers: wall clocks start at 0 so the record update is well-defined
  // even when the timed region is skipped
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name  = name;
  OP_kernels[0].count += 1;

  // number of worker threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads( );
  #else
    int nthreads = 1;
  #endif

  if (set->size >0) {

    op_timers_core(&cpu_t1, &wall_t1);

    // hand each thread its contiguous slice [lo, hi) of the set
    #pragma omp parallel for
    for (int tid=0; tid<nthreads; tid++) {
      int lo = (set->size *  tid     ) / nthreads;
      int hi = (set->size * (tid + 1)) / nthreads;
      op_x86_EvolveValuesRK2_1( (float *) arg0.data,
                                (float *) arg1.data,
                                (float *) arg2.data,
                                (float *) arg3.data,
                                (float *) arg4.data,
                                lo, hi );
    }
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[0].transfer += (float)set->size * arg2.size;
  OP_kernels[0].transfer += (float)set->size * arg3.size;
  OP_kernels[0].transfer += (float)set->size * arg4.size;
}
//#define AUTO_BLOCK_SIZE
// OpenCL host stub: direct (no-indirection) kernel "save_soln".
// Launch geometry is 200 blocks of `nthread` work-items; per-warp staging
// memory is passed as dynamic local memory.  With AUTO_BLOCK_SIZE defined
// the local work size is left to the OpenCL runtime.
void op_par_loop_save_soln(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  cl_int ciErrNum;
  cl_event ceEvent;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection: save_soln \n");
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set CUDA execution parameters
  #ifdef AUTO_BLOCK_SIZE
    const size_t nthread = 1024;
  #else
    #ifdef OP_BLOCK_SIZE_0
      const size_t nthread = OP_BLOCK_SIZE_0;
    #else
      // int nthread = OP_block_size;
      const size_t nthread = 128;
    #endif
  #endif

  const size_t nblocks = 200;
  const size_t n_tot_thread = nblocks * nthread;

  // work out shared memory requirements per element
  int nshared = 0;
  nshared = MAX(nshared,sizeof(float)*4);
  nshared = MAX(nshared,sizeof(float)*4);

  // execute plan: offset_s is the per-warp stride into the staging buffer
  int offset_s = nshared*OP_WARPSIZE;
  nshared = nshared*nthread;

  cl_kernel hKernel = getKernel( "op_cuda_save_soln" );

  //nshared *= 4;
  //offset_s *= 4;

  int i = 0;
  ciErrNum  = clSetKernelArg( hKernel, i++, sizeof(cl_mem), &(arg0.data_d) );
  ciErrNum |= clSetKernelArg( hKernel, i++, sizeof(cl_mem), &(arg1.data_d) );
  ciErrNum |= clSetKernelArg( hKernel, i++, sizeof(int), &offset_s );
  ciErrNum |= clSetKernelArg( hKernel, i++, sizeof(int), &set->size );
  // dynamic local memory for staging
  ciErrNum |= clSetKernelArg( hKernel, i++, nshared, NULL );
  assert_m( ciErrNum == CL_SUCCESS, "error setting kernel arguments" );

  #ifdef AUTO_BLOCK_SIZE
    // NULL local size: let the OpenCL runtime choose the work-group size
    ciErrNum = clEnqueueNDRangeKernel( cqCommandQueue, hKernel, 1, NULL, &n_tot_thread, NULL, 0, NULL, &ceEvent );
  #else
    ciErrNum = clEnqueueNDRangeKernel( cqCommandQueue, hKernel, 1, NULL, &n_tot_thread, &nthread, 0, NULL, &ceEvent );
  #endif
  assert_m( ciErrNum == CL_SUCCESS, "error executing kernel" );

  // NOTE(review): everything below (including the kernel timing record) is
  // compiled out when ASYNC is defined — confirm that skipping the record
  // update in async builds is intentional.
  #ifndef ASYNC
  ciErrNum = clFinish( cqCommandQueue );
  assert_m( ciErrNum == CL_SUCCESS, "error completing device commands" );

  #ifdef PROFILE
  // accumulate queue/wait/execution intervals from OpenCL event profiling
  unsigned long tqueue, tsubmit, tstart, tend, telapsed;
  ciErrNum  = clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(tqueue), &tqueue, NULL );
  ciErrNum |= clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(tsubmit), &tsubmit, NULL );
  ciErrNum |= clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_START, sizeof(tstart), &tstart, NULL );
  ciErrNum |= clGetEventProfilingInfo( ceEvent, CL_PROFILING_COMMAND_END, sizeof(tend), &tend, NULL );
  assert_m( ciErrNum == CL_SUCCESS, "error getting profiling info" );
  OP_kernels[0].queue_time     += (tsubmit - tqueue);
  OP_kernels[0].wait_time      += (tstart - tsubmit);
  OP_kernels[0].execution_time += (tend - tstart);
  //printf("%20lu\n%20lu\n%20lu\n%20lu\n\n", tqueue, tsubmit, tstart, tend);
  //printf("queue: %8.4f\nwait:%8.4f\nexec: %8.4f\n\n", OP_kernels[0].queue_time * 1.0e-9, OP_kernels[0].wait_time * 1.0e-9, OP_kernels[0].execution_time * 1.0e-9 );
  #endif

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name  = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time  += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (float)set->size * arg0.size;
  OP_kernels[0].transfer += (float)set->size * arg1.size;
  #endif
}
// host stub function void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1) { int *arg1h = (int *)arg1.data; int nargs = 2; op_arg args[2]; args[0] = arg0; args[1] = arg1; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags > 2) { printf(" kernel routine w/o indirection: update"); } op_mpi_halo_exchanges(set, nargs, args); // set number of threads #ifdef _OPENMP int nthreads = omp_get_max_threads(); #else int nthreads = 1; #endif // allocate and initialise arrays for global reduction int arg1_l[nthreads * 64]; for (int thr = 0; thr < nthreads; thr++) { for (int d = 0; d < 1; d++) { arg1_l[d + thr * 64] = ZERO_int; } } if (set->size > 0) { // execute plan #pragma omp parallel for for (int thr = 0; thr < nthreads; thr++) { int start = (set->size * thr) / nthreads; int finish = (set->size * (thr + 1)) / nthreads; for (int n = start; n < finish; n++) { update(&((double *)arg0.data)[4 * n], &arg1_l[64 * omp_get_thread_num()]); } } } // combine reduction data for (int thr = 0; thr < nthreads; thr++) { for (int d = 0; d < 1; d++) { arg1h[d] += arg1_l[d + thr * 64]; } } op_mpi_reduce(&arg1, arg1h); op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].name = name; OP_kernels[1].count += 1; OP_kernels[1].time += wall_t2 - wall_t1; OP_kernels[1].transfer += (float)set->size * arg0.size * 2.0f; }
// OP2 host stub: indirect kernel "res" (old plan API with per-indirection
// ind_maps/loc_maps arrays), OpenMP over blocks of each color.
// Fix: removed the unused local `arg3h` — arg3's raw data pointer is passed
// directly to op_x86_res below; the float* alias was never referenced.
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3 ){

  int nargs = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  int ninds = 2;
  int inds[4] = {-1,0,1,-1};  // arg1/arg2 are indirect, arg0/arg3 direct

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads( );
  #else
    int nthreads = 1;
  #endif

  // execute plan, one color at a time
  int block_offset = 0;
  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];
    #pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res( blockIdx,
        (float *)arg1.data,
        Plan->ind_maps[0],
        (float *)arg2.data,
        Plan->ind_maps[1],
        (float *)arg0.data,
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        (float *)arg3.data,
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);

    block_offset += nblocks;
  }

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;
}
// host stub function void op_par_loop_adt_calc(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4, op_arg arg5){ int nargs = 6; op_arg args[6]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; args[4] = arg4; args[5] = arg5; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; op_timing_realloc(1); op_timers_core(&cpu_t1, &wall_t1); if (OP_diags>2) { printf(" kernel routine with indirection: adt_calc\n"); } int set_size = op_mpi_halo_exchanges(set, nargs, args); if (set->size >0) { for ( int n=0; n<set_size; n++ ){ if (n==set->core_size) { op_mpi_wait_all(nargs, args); } int map0idx = arg0.map_data[n * arg0.map->dim + 0]; int map1idx = arg0.map_data[n * arg0.map->dim + 1]; int map2idx = arg0.map_data[n * arg0.map->dim + 2]; int map3idx = arg0.map_data[n * arg0.map->dim + 3]; adt_calc( &((double*)arg0.data)[2 * map0idx], &((double*)arg0.data)[2 * map1idx], &((double*)arg0.data)[2 * map2idx], &((double*)arg0.data)[2 * map3idx], &((double*)arg4.data)[4 * n], &((double*)arg5.data)[1 * n]); } } if (set_size == 0 || set_size == set->core_size) { op_mpi_wait_all(nargs, args); } // combine reduction data op_mpi_set_dirtybit(nargs, args); // update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].name = name; OP_kernels[1].count += 1; OP_kernels[1].time += wall_t2 - wall_t1; }
// host stub function
// OP2 host stub: indirect kernel "res_calc" with four 4-wide vector
// arguments (arg0, arg4, arg9, arg13 each expand to 4 slots of args[]).
// Executes on the accelerator via OpenACC with an OP_COLOR2 staged plan;
// SoA strides are refreshed whenever the partition layout changes.
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0, op_arg arg4, op_arg arg8, op_arg arg9, op_arg arg13){

  int nargs = 17;  // 4 vec-args x 4 slots + 1 scalar arg
  op_arg args[17];

  // expand each vector argument: one op_arg per map index 0..3
  arg0.idx = 0;
  args[0] = arg0;
  for ( int v=1; v<4; v++ ){
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }
  arg4.idx = 0;
  args[4] = arg4;
  for ( int v=1; v<4; v++ ){
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }
  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for ( int v=1; v<4; v++ ){
    args[9 + v] = op_opt_arg_dat(arg9.opt, arg9.dat, v, arg9.map, 1, "double", OP_RW);
  }
  arg13.idx = 0;
  args[13] = arg13;
  for ( int v=1; v<4; v++ ){
    args[13 + v] = op_opt_arg_dat(arg13.opt, arg13.dat, v, arg13.map, 2, "double", OP_INC);
  }

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(0);
  op_timers_core(&cpu_t1, &wall_t1);
  OP_kernels[0].name  = name;
  OP_kernels[0].count += 1;

  int ninds = 4;
  int inds[17] = {0,0,0,0,1,1,1,1,-1,2,2,2,2,3,3,3,3};  // arg8 is direct

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);
  int ncolors = 0;

  if (set->size >0) {

    // refresh device-side SoA strides on first call or after repartitioning
    if ((OP_kernels[0].count==1) || (opDat0_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg0))) {
      opDat0_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg0);
      opDat0_res_calc_stride_OP2CONSTANT = opDat0_res_calc_stride_OP2HOST;
    }
    if ((OP_kernels[0].count==1) || (direct_res_calc_stride_OP2HOST != getSetSizeFromOpArg(&arg8))) {
      direct_res_calc_stride_OP2HOST = getSetSizeFromOpArg(&arg8);
      direct_res_calc_stride_OP2CONSTANT = direct_res_calc_stride_OP2HOST;
    }

    //Set up typed device pointers for OpenACC
    int *map0 = arg0.map_data_d;

    double* data8 = (double*)arg8.data_d;
    double *data0 = (double *)arg0.data_d;
    double *data4 = (double *)arg4.data_d;
    double *data9 = (double *)arg9.data_d;
    double *data13 = (double *)arg13.data_d;

    op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);
    ncolors = Plan->ncolors;
    int *col_reord = Plan->col_reord;
    // owned + exec-halo elements: stride used for SoA map indexing on device
    int set_size1 = set->size + set->exec_size;

    // execute plan, one color at a time
    for ( int col=0; col<Plan->ncolors; col++ ){
      // after the first color, halo-exchanged data is needed
      if (col==1) {
        op_mpi_wait_all_cuda(nargs, args);
      }
      int start = Plan->col_offsets[0][col];
      int end   = Plan->col_offsets[0][col+1];

      #pragma acc parallel loop independent deviceptr(col_reord,map0,data8,data0,data4,data9,data13)
      for ( int e=start; e<end; e++ ){
        // col_reord gives the element order that avoids write conflicts
        int n = col_reord[e];
        int map0idx = map0[n + set_size1 * 0];
        int map1idx = map0[n + set_size1 * 1];
        int map2idx = map0[n + set_size1 * 2];
        int map3idx = map0[n + set_size1 * 3];

        // gather the four indirect elements of each vector argument
        const double* arg0_vec[] = {
           &data0[2 * map0idx],
           &data0[2 * map1idx],
           &data0[2 * map2idx],
           &data0[2 * map3idx]};
        const double* arg4_vec[] = {
           &data4[1 * map0idx],
           &data4[1 * map1idx],
           &data4[1 * map2idx],
           &data4[1 * map3idx]};
        double* arg9_vec[] = {
           &data9[1 * map0idx],
           &data9[1 * map1idx],
           &data9[1 * map2idx],
           &data9[1 * map3idx]};
        double* arg13_vec[] = {
           &data13[2 * map0idx],
           &data13[2 * map1idx],
           &data13[2 * map2idx],
           &data13[2 * map3idx]};

        res_calc(
          arg0_vec,
          arg4_vec,
          &data8[n],
          arg9_vec,
          arg13_vec);
      }
    }
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size || ncolors == 1) {
    op_mpi_wait_all_cuda(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit_cuda(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}