// host stub function void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { ops_arg args[4] = { arg0, arg1, arg2, arg3}; #ifdef CHECKPOINTING if (!ops_checkpointing_before(args,4,range,29)) return; #endif ops_timing_realloc(29,"calc_dt_kernel_get"); OPS_kernels[29].count++; //compute locally allocated range for the sub-block int start[2]; int end[2]; #ifdef OPS_MPI sub_block_list sb = OPS_sub_block_list[block->index]; if (!sb->owned) return; for ( int n=0; n<2; n++ ){ start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; if (start[n] >= range[2*n]) { start[n] = 0; } else { start[n] = range[2*n] - start[n]; } if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; if (end[n] >= range[2*n+1]) { end[n] = range[2*n+1] - sb->decomp_disp[n]; } else { end[n] = sb->decomp_size[n]; } if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); } #else //OPS_MPI for ( int n=0; n<2; n++ ){ start[n] = range[2*n];end[n] = range[2*n+1]; } #endif //OPS_MPI int x_size = MAX(0,end[0]-start[0]); int y_size = MAX(0,end[1]-start[1]); int xdim0 = args[0].dat->size[0]*args[0].dat->dim; int xdim1 = args[1].dat->size[0]*args[1].dat->dim; //build opencl kernel if not already built buildOpenCLKernels_calc_dt_kernel_get( xdim0,xdim1); //Timing double t1,t2,c1,c2; ops_timers_core(&c2,&t2); //set up OpenCL thread blocks size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, 1}; size_t localWorkSize[3] = {OPS_block_size_x,OPS_block_size_y,1}; #ifdef OPS_MPI double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); #else //OPS_MPI double *arg2h = (double *)(((ops_reduction)args[2].data)->data); #endif //OPS_MPI #ifdef OPS_MPI double 
*arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); #else //OPS_MPI double *arg3h = (double *)(((ops_reduction)args[3].data)->data); #endif //OPS_MPI int nblocks = ((x_size-1)/OPS_block_size_x+ 1)*((y_size-1)/OPS_block_size_y + 1); int maxblocks = nblocks; int reduct_bytes = 0; reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); reallocReductArrays(reduct_bytes); reduct_bytes = 0; int r_bytes2 = reduct_bytes/sizeof(double); arg2.data = OPS_reduct_h + reduct_bytes; arg2.data_d = OPS_reduct_d;// + reduct_bytes; for (int b=0; b<maxblocks; b++) for (int d=0; d<1; d++) ((double *)arg2.data)[d+b*1] = ZERO_double; reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); int r_bytes3 = reduct_bytes/sizeof(double); arg3.data = OPS_reduct_h + reduct_bytes; arg3.data_d = OPS_reduct_d;// + reduct_bytes; for (int b=0; b<maxblocks; b++) for (int d=0; d<1; d++) ((double *)arg3.data)[d+b*1] = ZERO_double; reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); mvReductArraysToDevice(reduct_bytes); int dat0 = args[0].dat->elem_size; int dat1 = args[1].dat->elem_size; //set up initial pointers int d_m[OPS_MAX_DIM]; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; #endif //OPS_MPI int base0 = 1 * (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); base0 = base0 + args[0].dat->size[0] * (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; #endif //OPS_MPI int base1 = 1 * (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); base1 = base1 + args[1].dat->size[0] * (start[1] * 
args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); ops_H_D_exchanges_device(args, 4); ops_halo_exchanges(args,4,range); ops_H_D_exchanges_device(args, 4); ops_timers_core(&c1,&t1); OPS_kernels[29].mpi_time += t1-t2; int nthread = OPS_block_size_x*OPS_block_size_y; clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 0, sizeof(cl_mem), (void*) &arg0.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 1, sizeof(cl_mem), (void*) &arg1.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 2, sizeof(cl_mem), (void*) &arg2.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 3, nthread*sizeof(double), NULL)); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 4, sizeof(cl_int), (void*) &r_bytes2 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 5, sizeof(cl_mem), (void*) &arg3.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 6, nthread*sizeof(double), NULL)); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 7, sizeof(cl_int), (void*) &r_bytes3 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 8, sizeof(cl_int), (void*) &base0 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 9, sizeof(cl_int), (void*) &base1 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 10, sizeof(cl_int), (void*) &x_size )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[29], 11, sizeof(cl_int), (void*) &y_size )); //call/enque opencl kernel wrapper function clSafeCall( clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[29], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); if (OPS_diags>1) { clSafeCall( clFinish(OPS_opencl_core.command_queue) ); } mvReductArraysToHost(reduct_bytes); for ( int b=0; b<maxblocks; b++ ){ for ( int d=0; d<1; d++ ){ arg2h[d] = arg2h[d] + ((double *)arg2.data)[d+b*1]; } } arg2.data = (char *)arg2h; for ( int b=0; b<maxblocks; b++ ){ for ( int d=0; d<1; d++ ){ arg3h[d] = arg3h[d] + ((double *)arg3.data)[d+b*1]; } } arg3.data 
= (char *)arg3h; ops_set_dirtybit_device(args, 4); //Update kernel record ops_timers_core(&c2,&t2); OPS_kernels[29].time += t2-t1; OPS_kernels[29].transfer += ops_compute_transfer(dim, range, &arg0); OPS_kernels[29].transfer += ops_compute_transfer(dim, range, &arg1); }
/*
 * Host-side launcher for the OpenCL kernel "update_kernel".
 *
 * Parameters:
 *   userSubroutine - not used by this stub.
 *   set            - set being iterated; set->size is passed as kernel arg 6.
 *   opDat1..opDat4 - arguments whose device handles (data_d) are passed as
 *                    kernel args 0..3.
 *   opDat5         - float reduction. On entry opDat5.data points at the host
 *                    value to accumulate into; on return that value has been
 *                    increased by the sum of all per-work-group partials.
 *
 * The kernel is launched with a fixed 200 work-groups of
 * threadsPerBlockSize_update work-items each, and the function blocks until
 * the command queue has drained.
 *
 * Note: no cl_event is requested from clEnqueueNDRangeKernel. An event handle
 * returned by the enqueue must be released by the caller; it was never used
 * here (clFinish provides the synchronisation), so asking for one only leaked
 * an event object on every call.
 */
void update_host(const char *userSubroutine,op_set set,op_arg opDat1,op_arg opDat2,op_arg opDat3,op_arg opDat4,op_arg opDat5)
{
  size_t blocksPerGrid;
  size_t threadsPerBlock;
  size_t totalThreadNumber;
  size_t dynamicSharedMemorySize;
  cl_int errorCode;
  cl_kernel kernelPointer;
  int sharedMemoryOffset;
  int reductionBytes;
  float *reductionArrayHost5;
  float *partialSums;

  blocksPerGrid = 200;
  threadsPerBlock = threadsPerBlockSize_update;
  totalThreadNumber = threadsPerBlock * blocksPerGrid;

  /* Local-memory scratch: four floats per work-item (the same requirement was
   * previously applied four times in a row, which is equivalent to once). */
  dynamicSharedMemorySize = 0;
  dynamicSharedMemorySize = MAX(dynamicSharedMemorySize,sizeof(float ) * 4);
  sharedMemoryOffset = dynamicSharedMemorySize * OP_WARPSIZE;
  dynamicSharedMemorySize = dynamicSharedMemorySize * threadsPerBlock;

  /* Remember where the final result lives before opDat5.data is redirected
   * to the staging buffer. */
  reductionArrayHost5 = ((float *)opDat5.data);

  /* Size the staging buffers: one float per work-group. */
  reductionBytes = 0;
  reductionBytes += ROUND_UP(blocksPerGrid * sizeof(float ) * 1);
  reallocReductArrays(reductionBytes);

  /* Point opDat5 at its slice of the staging buffers and zero the host side. */
  reductionBytes = 0;
  opDat5.data = OP_reduct_h + reductionBytes;
  opDat5.data_d = ((char *)OP_reduct_d) + reductionBytes;
  partialSums = (float *)opDat5.data;
  for (size_t i1 = 0; i1 < blocksPerGrid; ++i1) {
    partialSums[i1] = 0.00000F;
  }
  reductionBytes += ROUND_UP(blocksPerGrid * sizeof(float ) * 1);
  mvReductArraysToDevice(reductionBytes);

  kernelPointer = getKernel("update_kernel");

  /* Status codes are OR-ed together; the result is CL_SUCCESS only if every
   * call succeeded. Arg 7 is a size with a NULL pointer (local memory). */
  errorCode = clSetKernelArg(kernelPointer,0,sizeof(cl_mem ),&opDat1.data_d);
  errorCode = errorCode | clSetKernelArg(kernelPointer,1,sizeof(cl_mem ),&opDat2.data_d);
  errorCode = errorCode | clSetKernelArg(kernelPointer,2,sizeof(cl_mem ),&opDat3.data_d);
  errorCode = errorCode | clSetKernelArg(kernelPointer,3,sizeof(cl_mem ),&opDat4.data_d);
  errorCode = errorCode | clSetKernelArg(kernelPointer,4,sizeof(cl_mem ),&opDat5.data_d);
  errorCode = errorCode | clSetKernelArg(kernelPointer,5,sizeof(int ),&sharedMemoryOffset);
  errorCode = errorCode | clSetKernelArg(kernelPointer,6,sizeof(int ),&set -> size);
  errorCode = errorCode | clSetKernelArg(kernelPointer,7,dynamicSharedMemorySize,NULL);
  assert_m(errorCode == CL_SUCCESS,"Error setting OpenCL kernel arguments");

  /* No event requested: completion is awaited with clFinish below. */
  errorCode = clEnqueueNDRangeKernel(cqCommandQueue,kernelPointer,1,NULL,&totalThreadNumber,&threadsPerBlock,0,NULL,NULL);
  assert_m(errorCode == CL_SUCCESS,"Error executing OpenCL kernel");

  errorCode = clFinish(cqCommandQueue);
  assert_m(errorCode == CL_SUCCESS,"Error completing device command queue");

  /* Bring the partials back and fold them into the caller's value. */
  mvReductArraysToHost(reductionBytes);
  for (size_t i1 = 0; i1 < blocksPerGrid; ++i1) {
    reductionArrayHost5[0] += partialSums[i1];
  }
}