// host stub function
void op_par_loop_bres_calc_cpu(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5){

  int nargs = 6;
  op_arg args[6];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(3);
  op_timers_core(&cpu_t1, &wall_t1);

  int ninds = 4;
  int inds[6] = {0,0,1,2,3,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: bres_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_3
    int part_size = OP_PART_SIZE_3;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for ( int col=0; col<Plan->ncolors; col++ ){
      if (col==Plan->ncolors_core) {
        op_mpi_wait_all(nargs, args);
      }
      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for ( int blockIdx=0; blockIdx<nblocks; blockIdx++ ){
        int blockId  = Plan->blkmap[blockIdx + block_offset];
        int nelem    = Plan->nelems[blockId];
        int offset_b = Plan->offset[blockId];
        for ( int n=offset_b; n<offset_b+nelem; n++ ){
          int map0idx = arg0.map_data[n * arg0.map->dim + 0];
          int map1idx = arg0.map_data[n * arg0.map->dim + 1];
          int map2idx = arg2.map_data[n * arg2.map->dim + 0];

          bres_calc(
            &((double*)arg0.data)[2 * map0idx],
            &((double*)arg0.data)[2 * map1idx],
            &((double*)arg2.data)[4 * map2idx],
            &((double*)arg3.data)[1 * map2idx],
            &((double*)arg4.data)[4 * map2idx],
            &((int*)arg5.data)[1 * n]);
        }
      }
      block_offset += nblocks;
    }
    OP_kernels[3].transfer  += Plan->transfer;
    OP_kernels[3].transfer2 += Plan->transfer2;
  }

  if (set_size == 0 || set_size == set->core_size) {
    op_mpi_wait_all(nargs, args);
  }
  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[3].name   = name;
  OP_kernels[3].count += 1;
  OP_kernels[3].time  += wall_t2 - wall_t1;
}
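/* Illustrative sketch, not generated output: the user-level loop that a stub like
 * op_par_loop_bres_calc_cpu is produced from would look roughly as below. The dat/map/set
 * names (p_x, p_q, p_adt, p_res, p_bound, pbedge, pbecell, bedges) are placeholders
 * borrowed from the OP2 airfoil example, not names defined in this file.
 *
 *   op_par_loop(bres_calc, "bres_calc", bedges,
 *               op_arg_dat(p_x,     0, pbedge,  2, "double", OP_READ),
 *               op_arg_dat(p_x,     1, pbedge,  2, "double", OP_READ),
 *               op_arg_dat(p_q,     0, pbecell, 4, "double", OP_READ),
 *               op_arg_dat(p_adt,   0, pbecell, 1, "double", OP_READ),
 *               op_arg_dat(p_res,   0, pbecell, 4, "double", OP_INC),
 *               op_arg_dat(p_bound, -1, OP_ID,  1, "int",    OP_READ));
 *
 * The six op_args map one-to-one onto arg0..arg5 above, and inds[6] = {0,0,1,2,3,-1}
 * records that the first five arguments are indirect (accessed through the two maps,
 * touching four distinct indirect datasets) while the last is accessed directly.
 */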
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5 ){

  int nargs = 6;
  op_arg args[6] = {arg0,arg1,arg2,arg3,arg4,arg5};

  int ninds = 1;
  int inds[6] = {0,0,0,0,-1,-1};
  int sent[6] = {0,0,0,0,0,0};

  if (ninds > 0) { // indirect loop
    for (int i = 0; i < nargs; i++) {
      if (args[i].argtype == OP_ARG_DAT) {
        if (OP_diags == 1) reset_halo(args[i]);
        sent[0] = exchange_halo(args[i]);
        if (sent[0] == 1) wait_all(args[i]);
      }
    }
  }

  if (OP_diags>2) {
    printf(" kernel routine with indirection: adt_calc \n");
  }

  // get plan
  #ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads();
  #else
    int nthreads = 1;
  #endif

  // execute plan
  int block_offset = 0;
  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];

    #pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_adt_calc( blockIdx,
        (double *)arg0.data, Plan->ind_maps[0],
        Plan->loc_maps[0],
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        Plan->loc_maps[3],
        (double *)arg4.data,
        (double *)arg5.data,
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);

    block_offset += nblocks;
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i < nargs; i++)
    if (args[i].argtype == OP_ARG_DAT) set_dirtybit(args[i]);

  // perform any global operations
  // - NONE

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name      = name;
  OP_kernels[1].count    += 1;
  OP_kernels[1].time     += wall_t2 - wall_t1;
  OP_kernels[1].transfer  += Plan->transfer;
  OP_kernels[1].transfer2 += Plan->transfer2;
}
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6,
  op_arg arg7 ){

  int nargs = 8;
  op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7};

  int ninds = 4;
  int inds[8] = {0,0,1,1,2,2,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc \n");
  }

  // get plan
  #ifdef OP_PART_SIZE_2
    int part_size = OP_PART_SIZE_2;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads();
  #else
    int nthreads = 1;
  #endif

  // execute plan
  int block_offset = 0;
  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];

    #pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res_calc( blockIdx,
        (double *)arg0.data, Plan->ind_maps[0],
        (double *)arg2.data, Plan->ind_maps[1],
        (double *)arg4.data, Plan->ind_maps[2],
        (double *)arg6.data, Plan->ind_maps[3],
        Plan->loc_maps[0],
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        Plan->loc_maps[3],
        Plan->loc_maps[4],
        Plan->loc_maps[5],
        Plan->loc_maps[6],
        Plan->loc_maps[7],
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);

    block_offset += nblocks;
  }

  // combine reduction data

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(2);
  OP_kernels[2].name      = name;
  OP_kernels[2].count    += 1;
  OP_kernels[2].time     += wall_t2 - wall_t1;
  OP_kernels[2].transfer  += Plan->transfer;
  OP_kernels[2].transfer2 += Plan->transfer2;
}
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6,
  op_arg arg7 )
{
  int nargs = 8;
  op_arg args[8];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;
  args[6] = arg6;
  args[7] = arg7;

  int ninds = 4;
  int inds[8] = {0,0,1,1,2,2,3,3};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_2
    int part_size = OP_PART_SIZE_2;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data,
          (double *)arg2.data,
          (double *)arg4.data,
          (double *)arg6.data,
          Plan->ind_map,
          Plan->loc_map,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);

      block_offset += nblocks;
    }

    op_timing_realloc(2);
    OP_kernels[2].transfer  += Plan->transfer;
    OP_kernels[2].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(2);
  OP_kernels[2].name   = name;
  OP_kernels[2].count += 1;
  OP_kernels[2].time  += wall_t2 - wall_t1;
}
void op_par_loop_SpaceDiscretization(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5,
  op_arg arg6 ){

  int nargs = 8;
  op_arg args[8];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;
  args[5] = arg5;
  arg6.idx = 0;
  args[6] = arg6;
  for (int v = 1; v < 2; v++) {
    args[6 + v] = op_arg_dat(arg6.dat, v, arg6.map, 1, "float", OP_READ);
  }

  int ninds = 2;
  int inds[8] = {0,0,-1,-1,-1,-1,1,1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: SpaceDiscretization\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_18
    int part_size = OP_PART_SIZE_18;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(18);
  OP_kernels[18].name   = name;
  OP_kernels[18].count += 1;

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_SpaceDiscretization( blockIdx,
          (float *)arg0.data,
          (float *)arg6.data,
          Plan->ind_map,
          Plan->loc_map,
          (float *)arg2.data,
          (float *)arg3.data,
          (float *)arg4.data,
          (int *)arg5.data,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);

      block_offset += nblocks;
    }

    op_timing_realloc(18);
    OP_kernels[18].transfer  += Plan->transfer;
    OP_kernels[18].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[18].time += wall_t2 - wall_t1;
}
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg4,
  op_arg arg8,
  op_arg arg9 ){

  int nargs = 13;
  op_arg args[13];

  arg0.idx = 0;
  args[0] = arg0;
  for (int v = 1; v < 4; v++) {
    args[0 + v] = op_arg_dat(arg0.dat, v, arg0.map, 2, "double", OP_READ);
  }
  arg4.idx = 0;
  args[4] = arg4;
  for (int v = 1; v < 4; v++) {
    args[4 + v] = op_arg_dat(arg4.dat, v, arg4.map, 1, "double", OP_READ);
  }
  args[8] = arg8;
  arg9.idx = 0;
  args[9] = arg9;
  for (int v = 1; v < 4; v++) {
    args[9 + v] = op_arg_dat(arg9.dat, v, arg9.map, 1, "double", OP_INC);
  }

  int ninds = 3;
  int inds[13] = {0,0,0,0,1,1,1,1,-1,2,2,2,2};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data,
          (double *)arg4.data,
          (double *)arg9.data,
          Plan->ind_map,
          Plan->loc_map,
          (double *)arg8.data,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);

      block_offset += nblocks;
    }

    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time  += wall_t2 - wall_t1;
}
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  float *arg3h = (float *)arg3.data;

  int nargs = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  int ninds = 2;
  int inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads();
  #else
    int nthreads = 1;
  #endif

  // execute plan
  int block_offset = 0;
  for (int col=0; col < Plan->ncolors; col++) {
    int nblocks = Plan->ncolblk[col];

    #pragma omp parallel for
    for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
      op_x86_res( blockIdx,
        (float *)arg1.data, Plan->ind_maps[0],
        (float *)arg2.data, Plan->ind_maps[1],
        (float *)arg0.data,
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        (float *)arg3.data,
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);

    block_offset += nblocks;
  }

  // update kernel record
  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer  += Plan->transfer;
  OP_kernels[0].transfer2 += Plan->transfer2;
}
void op_par_loop_res_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  int *arg1h = (int *)arg1.data;

  int nargs = 2;
  op_arg args[2];

  args[0] = arg0;
  args[1] = arg1;

  int ninds = 1;
  int inds[2] = {0,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res_calc\n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1=0, wall_t2=0;
  op_timing_realloc(0);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;

  // set number of threads
  #ifdef _OPENMP
    int nthreads = omp_get_max_threads();
  #else
    int nthreads = 1;
  #endif

  // allocate and initialise arrays for global reduction
  int arg1_l[1+64*64];
  for (int thr=0; thr<nthreads; thr++)
    for (int d=0; d<1; d++) arg1_l[d+thr*64] = ZERO_int;

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);
    op_timers_core(&cpu_t1, &wall_t1);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res_calc( blockIdx,
          (double *)arg0.data,
          Plan->ind_map,
          Plan->loc_map,
          &arg1_l[64*omp_get_thread_num()],
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);

      // combine reduction data
      if (col == Plan->ncolors_owned-1) {
        for (int thr=0; thr<nthreads; thr++)
          for (int d=0; d<1; d++) arg1h[d] += arg1_l[d+thr*64];
      }

      block_offset += nblocks;
    }

    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_reduce(&arg1,arg1h);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[0].time += wall_t2 - wall_t1;
}
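/* Minimal standalone sketch of the global-reduction pattern used above, written in plain
 * C + OpenMP rather than the OP2 API: each thread accumulates into its own slot of a
 * 64-int-strided scratch array (avoiding false sharing, as arg1_l does), and the partial
 * sums are combined serially afterwards. The function name padded_sum and the fixed
 * 64-thread cap are assumptions for illustration only; compile with OpenMP enabled. */
#include <omp.h>

static int padded_sum(const int *vals, int n) {
  int nthreads = omp_get_max_threads();   /* assumed <= 64, matching the scratch size */
  int scratch[64 * 64];                    /* one 64-int stride per thread, like arg1_l */
  for (int thr = 0; thr < nthreads; thr++)
    scratch[thr * 64] = 0;

  #pragma omp parallel for
  for (int i = 0; i < n; i++)
    scratch[omp_get_thread_num() * 64] += vals[i];  /* each thread writes only its own slot */

  int total = 0;                           /* serial combine, mirroring the post-colour step */
  for (int thr = 0; thr < nthreads; thr++)
    total += scratch[thr * 64];
  return total;
}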
void op_par_loop_res(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3 ){

  float *arg3h = (float *)arg3.data;

  int nargs = 4;
  op_arg args[4] = {arg0,arg1,arg2,arg3};

  int ninds = 2;
  int inds[4] = {-1,0,1,-1};

  if (OP_diags>2) {
    printf(" kernel routine with indirection: res \n");
  }

  // get plan
  #ifdef OP_PART_SIZE_0
    int part_size = OP_PART_SIZE_0;
  #else
    int part_size = OP_part_size;
  #endif

  int set_size = op_mpi_halo_exchanges(set, nargs, args);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  if (set->size > 0) {

    op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);

    // execute plan
    int block_offset = 0;
    for (int col=0; col < Plan->ncolors; col++) {
      if (col==Plan->ncolors_core) op_mpi_wait_all(nargs, args);

      int nblocks = Plan->ncolblk[col];

      #pragma omp parallel for
      for (int blockIdx=0; blockIdx<nblocks; blockIdx++)
        op_x86_res( blockIdx,
          (float *)arg1.data,
          (float *)arg2.data,
          Plan->ind_map,
          Plan->loc_map,
          (float *)arg0.data,
          (float *)arg3.data,
          Plan->ind_sizes,
          Plan->ind_offs,
          block_offset,
          Plan->blkmap,
          Plan->offset,
          Plan->nelems,
          Plan->nthrcol,
          Plan->thrcol,
          set_size);

      block_offset += nblocks;
    }

    op_timing_realloc(0);
    OP_kernels[0].transfer  += Plan->transfer;
    OP_kernels[0].transfer2 += Plan->transfer2;
  }

  // combine reduction data
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name   = name;
  OP_kernels[0].count += 1;
  OP_kernels[0].time  += wall_t2 - wall_t1;
}
void bres_calc_host(const char *userSubroutine, op_set set,
  op_arg opDat1,
  op_arg opDat2,
  op_arg opDat3,
  op_arg opDat4,
  op_arg opDat5,
  op_arg opDat6)
{
  size_t blocksPerGrid;
  size_t threadsPerBlock;
  size_t totalThreadNumber;
  size_t dynamicSharedMemorySize;
  cl_int errorCode;
  cl_event event;
  cl_kernel kernelPointer;
  int i3;
  op_arg opDatArray[6];
  int indirectionDescriptorArray[6];
  op_plan *planRet;
  int blockOffset;

  opDatArray[0] = opDat1;
  opDatArray[1] = opDat2;
  opDatArray[2] = opDat3;
  opDatArray[3] = opDat4;
  opDatArray[4] = opDat5;
  opDatArray[5] = opDat6;

  indirectionDescriptorArray[0] = 0;
  indirectionDescriptorArray[1] = 0;
  indirectionDescriptorArray[2] = 1;
  indirectionDescriptorArray[3] = 2;
  indirectionDescriptorArray[4] = 3;
  indirectionDescriptorArray[5] = -1;

  planRet = op_plan_get(userSubroutine,set,setPartitionSize_bres_calc,6,opDatArray,4,indirectionDescriptorArray);

  cl_mem gm1_d;
  gm1_d = op_allocate_constant(&gm1,sizeof(float));
  cl_mem qinf_d;
  qinf_d = op_allocate_constant(&qinf,4 * sizeof(float));
  cl_mem eps_d;
  eps_d = op_allocate_constant(&eps,sizeof(float));

  blockOffset = 0;

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  for (i3 = 0; i3 < planRet->ncolors; ++i3) {
    blocksPerGrid = planRet->ncolblk[i3];
    dynamicSharedMemorySize = planRet->nshared;
    threadsPerBlock = threadsPerBlockSize_bres_calc;
    totalThreadNumber = threadsPerBlock * blocksPerGrid;

    kernelPointer = getKernel("bres_calc_kernel");

    errorCode = clSetKernelArg(kernelPointer,0,sizeof(cl_mem),&opDat1.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,1,sizeof(cl_mem),&opDat3.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,2,sizeof(cl_mem),&opDat4.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,3,sizeof(cl_mem),&opDat5.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,4,sizeof(cl_mem),&opDat6.data_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,5,sizeof(cl_mem),&planRet->ind_maps[0]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,6,sizeof(cl_mem),&planRet->ind_maps[1]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,7,sizeof(cl_mem),&planRet->ind_maps[2]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,8,sizeof(cl_mem),&planRet->ind_maps[3]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,9,sizeof(cl_mem),&planRet->loc_maps[0]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,10,sizeof(cl_mem),&planRet->loc_maps[1]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,11,sizeof(cl_mem),&planRet->loc_maps[2]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,12,sizeof(cl_mem),&planRet->loc_maps[3]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,13,sizeof(cl_mem),&planRet->loc_maps[4]);
    errorCode = errorCode | clSetKernelArg(kernelPointer,14,sizeof(cl_mem),&planRet->ind_sizes);
    errorCode = errorCode | clSetKernelArg(kernelPointer,15,sizeof(cl_mem),&planRet->ind_offs);
    errorCode = errorCode | clSetKernelArg(kernelPointer,16,sizeof(cl_mem),&planRet->blkmap);
    errorCode = errorCode | clSetKernelArg(kernelPointer,17,sizeof(cl_mem),&planRet->offset);
    errorCode = errorCode | clSetKernelArg(kernelPointer,18,sizeof(cl_mem),&planRet->nelems);
    errorCode = errorCode | clSetKernelArg(kernelPointer,19,sizeof(cl_mem),&planRet->nthrcol);
    errorCode = errorCode | clSetKernelArg(kernelPointer,20,sizeof(cl_mem),&planRet->thrcol);
    errorCode = errorCode | clSetKernelArg(kernelPointer,21,sizeof(int),&blockOffset);
    errorCode = errorCode | clSetKernelArg(kernelPointer,22,dynamicSharedMemorySize,NULL);
    errorCode = errorCode | clSetKernelArg(kernelPointer,23,sizeof(cl_mem),&gm1_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,24,sizeof(cl_mem),&qinf_d);
    errorCode = errorCode | clSetKernelArg(kernelPointer,25,sizeof(cl_mem),&eps_d);
    assert_m(errorCode == CL_SUCCESS,"Error setting OpenCL kernel arguments");

    errorCode = clEnqueueNDRangeKernel(cqCommandQueue,kernelPointer,1,NULL,&totalThreadNumber,&threadsPerBlock,0,NULL,&event);
    assert_m(errorCode == CL_SUCCESS,"Error executing OpenCL kernel");

    errorCode = clFinish(cqCommandQueue);
    assert_m(errorCode == CL_SUCCESS,"Error completing device command queue");

    blockOffset += blocksPerGrid;
  }

  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name   = userSubroutine;
  OP_kernels[1].count  = OP_kernels[1].count + 1;
  OP_kernels[1].time  += wall_t2 - wall_t1;
}
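/* Illustrative sketch only: op_allocate_constant is an OP2 OpenCL back-end helper whose
 * implementation is not shown here. Under the assumption that it wraps creation of a
 * read-only device buffer initialised from a host value (so it can be bound via
 * clSetKernelArg as a cl_mem, as done above for gm1_d/qinf_d/eps_d), a minimal
 * stand-alone equivalent could look like the hypothetical helper below. */
#include <CL/cl.h>

static cl_mem allocate_constant_sketch(cl_context context, const void *host_ptr,
                                       size_t bytes, cl_int *errorCode) {
  /* CL_MEM_COPY_HOST_PTR copies the host data into the new buffer at creation time,
   * so no separate clEnqueueWriteBuffer is needed for this read-only constant. */
  return clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                        bytes, (void *)host_ptr, errorCode);
}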