/* Thin Wine wrapper: trace the call, then forward directly to the native
 * clEnqueueMarker implementation. */
cl_int WINAPI wine_clEnqueueMarker(cl_command_queue command_queue, cl_event * event)
{
    TRACE("\n");
    return clEnqueueMarker(command_queue, event);
}
/*! Synchronizes the host against the active command queue. This function will block until all currently queued commands have finished execution. */ void QCLContext::sync() { cl_event event; cl_int error = clEnqueueMarker(activeQueue(), &event); reportError("QCLContext::sync:", error); if (error == CL_SUCCESS) { clWaitForEvents(1, &event); clReleaseEvent(event); } }
/*!
    Returns a marker event for the active command queue.  The event is
    signalled once all commands queued before this point have finished.

    \sa barrier(), sync()
*/
QCLEvent QCLContext::marker()
{
    cl_event id;
    const cl_int err = clEnqueueMarker(activeQueue(), &id);
    reportError("QCLContext::marker:", err);
    // On failure hand back a null event; otherwise wrap the raw handle.
    return (err == CL_SUCCESS) ? QCLEvent(id) : QCLEvent();
}
/* Enqueues the whole chain of FFT step kernels for this plan.
 *
 * input            - device buffer with the (real) input samples
 * out_startEvent   - receives a marker event enqueued before the first
 *                    kernel launch (usable as a timing reference)
 * out_kernelEvents - receives one completion event per launch, in launch
 *                    order; assumes it has room for launches.size()
 *                    entries -- TODO confirm against callers
 *
 * Returns the buffer holding the output of the last step (v_tmp1 after
 * the final swap).  Note: everything is enqueued asynchronously; the
 * caller must wait on the returned events before reading the result.
 */
cl_mem cl_fft<float>::runInternal(const cl_mem input, cl_event *out_startEvent, cl_event *out_kernelEvents)
{
    /* Marker recording the point in the queue before any FFT kernel runs. */
    CL_CHECK_ERR("clEnqueueMarker", clEnqueueMarker(command_queue, out_startEvent));

    // Kernel launches
    const cl_uint Nhalf = samplesPerRun / 2;
    /* Each launch waits on the previous one; the chain starts at the marker. */
    cl_event prev_evt = *out_startEvent;
    for (unsigned int i = 0; i < launches.size(); i++)
    {
        if (launches[i].isOptibase == false)
        {
            // Only the first step has real-valued input
            cl_kernel kernel = (i == 0) ? k_fftstep_real2cpx : k_fftstep_cpx2cpx;
            /* Step 0 reads from the caller's buffer, later steps from the
               ping-pong buffer v_tmp1; output always goes to v_tmp2. */
            CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 0, sizeof(cl_mem), (i == 0) ? &input : &v_tmp1));
            CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 1, sizeof(cl_mem), &v_tmp2));
            CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 2, sizeof(cl_mem), &v_twiddleFactors));
            CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 3, sizeof(cl_uint), &launches[i].Wshift));
            CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(kernel, 4, sizeof(cl_uint), &Nhalf));
            CL_CHECK_ERR("clEnqueueNDRangeKernel", clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, launches[i].globalSize, launches[i].groupSize, 1, &prev_evt, &out_kernelEvents[i] ));
        }
        else
        {
            /* "Optibase" variant: 1-D launch, no Wshift argument. */
            CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(k_fftstep_optibase, 0, sizeof(cl_mem), &v_tmp1));
            CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(k_fftstep_optibase, 1, sizeof(cl_mem), &v_tmp2));
            CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(k_fftstep_optibase, 2, sizeof(cl_mem), &v_twiddleFactors));
            CL_CHECK_ERR("clSetKernelArg", clSetKernelArg(k_fftstep_optibase, 3, sizeof(cl_uint), &Nhalf));
            CL_CHECK_ERR("clEnqueueNDRangeKernel", clEnqueueNDRangeKernel(command_queue, k_fftstep_optibase, 1, NULL, launches[i].globalSize, launches[i].groupSize, 1, &prev_evt, &out_kernelEvents[i] ));
        }
        /* Chain the next launch after this one and ping-pong the buffers. */
        prev_evt = out_kernelEvents[i];
        swap(v_tmp1, v_tmp2);
    }
    return v_tmp1;
}
/* Enqueues a marker on `queue` and makes the queue wait for it, i.e. a
 * device-side synchronization point.  Does not block the host.
 *
 * Fixes vs. the previous version:
 *  - if clEnqueueMarker fails, `wait` is uninitialized; we must not pass
 *    it to clEnqueueWaitForEvents, so bail out early;
 *  - the marker event was never released, leaking one event per call.
 */
void Wait( cl_command_queue queue )
{
    cl_event wait;
    cl_int status = clEnqueueMarker( queue, &wait );
    if( status != CL_SUCCESS )
    {
        fprintf( stderr, "Wait: clEnqueueMarker failed\n" );
        return;  /* `wait` is not a valid event here */
    }
    status = clEnqueueWaitForEvents( queue, 1, &wait );
    if( status != CL_SUCCESS )
        fprintf( stderr, "Wait: clEnqueueWaitForEvents failed\n" );
    clReleaseEvent( wait );  /* drop our reference; the queue keeps its own */
}
/// Enqueues a marker in the queue and returns an event that can be /// used to track its progress. event enqueue_marker() { event event_; #ifdef CL_VERSION_1_2 cl_int ret = clEnqueueMarkerWithWaitList(m_queue, 0, 0, &event_.get()); #else cl_int ret = clEnqueueMarker(m_queue, &event_.get()); #endif if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } return event_; }
int acc_event_record (void* event, void* stream){ // debug info if (verbose_print){ fprintf(stdout, "\n ... EVENT RECORDING ... \n"); fprintf(stdout, " ---> Entering: acc_event_record.\n"); } // local event and queue pointers cl_event *clevent = (cl_event *) event; acc_opencl_stream_type *clstream = (acc_opencl_stream_type *) stream; // set a marker 'event' to listen on to queue 'stream' cl_error = clEnqueueMarker((*clstream).queue, clevent); if (acc_opencl_error_check(cl_error, __LINE__)) return -1; // debug info if (verbose_print){ fprintf(stdout, " ---> Leaving: acc_event_record.\n"); } // assign return value return 0; }
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 *
 * \param nb     nonbonded OpenCL data (queues, events, device buffers)
 * \param nbatom host-side atom data receiving the downloaded forces
 * \param flags  force-calculation flags (GMX_FORCE_ENERGY/VIRIAL bits)
 * \param aloc   atom locality (local or nonlocal); anything else aborts
 *
 * All transfers are enqueued asynchronously; ordering between the local
 * and nonlocal streams is handled via the nonlocal_done event.
 */
void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, const struct nbnxn_atomdata_t *nbatom, int flags, int aloc)
{
    cl_int gmx_unused cl_error;
    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    int iloc = -1;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        /* invalid locality is a programming error -> abort with message */
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only " "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
        gmx_incons(stmp);
    }

    cl_atomdata_t *adat = nb->atdat;
    cl_timers_t *t = nb->timers;
    bool bDoTime = nb->bDoTime;           /* record per-transfer timing events? */
    cl_command_queue stream = nb->stream[iloc];

    bool bCalcEner = flags & GMX_FORCE_ENERGY;
    int bCalcFshift = flags & GMX_FORCE_VIRIAL;

    /* don't launch non-local copy-back if there was no non-local work to do */
    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(aloc))
    {
        adat_begin = 0;
        adat_len = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed D2H section */

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == eintLocal && nb->bUseTwoStreams)
    {
        sync_ocl_event(stream, &(nb->nonlocal_done));
    }

    /* DtoH f */
    ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float), (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);

    /* kick off work */
    cl_error = clFlush(stream);
    assert(CL_SUCCESS == cl_error);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is
       not placed after the non-local kernel because we first need the
       non-local data back first. */
    if (iloc == eintNonlocal)
    {
#ifdef CL_VERSION_1_2
        cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
#else
        cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
#endif
        assert(CL_SUCCESS == cl_error);
    }

    /* only transfer energies in the local stream */
    if (LOCAL_I(iloc))
    {
        /* DtoH fshift */
        if (bCalcFshift)
        {
            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
        }

        /* DtoH energies */
        if (bCalcEner)
        {
            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
        }
    }

    debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
}
/*! \brief Launch GPU kernel

   As we execute nonbonded workload in separate queues, before launching
   the kernel we need to make sure that the following operations have
   completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local queue at the beginning of the
   step and therefore always complete before the local kernel launch. The
   non-local kernel is launched after the local on the same device/context,
   so this is inherently scheduled after the operations in the local stream
   (including the above "misc_ops").
   However, for the sake of having a future-proof implementation, we use
   the misc_ops_done event to record the point in time when the above
   operations are finished and synchronize with this event in the
   non-local stream.
 */
void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, const struct nbnxn_atomdata_t *nbatom, int flags, int iloc)
{
    cl_int cl_error;
    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    /* OpenCL kernel launch-related stuff */
    int shmem;
    size_t local_work_size[3], global_work_size[3];
    cl_kernel nb_kernel = NULL; /* fn pointer to the nonbonded kernel */

    cl_atomdata_t *adat = nb->atdat;
    cl_nbparam_t *nbp = nb->nbparam;
    cl_plist_t *plist = nb->plist[iloc];
    cl_timers_t *t = nb->timers;
    cl_command_queue stream = nb->stream[iloc];

    bool bCalcEner = flags & GMX_FORCE_ENERGY;
    int bCalcFshift = flags & GMX_FORCE_VIRIAL;
    bool bDoTime = nb->bDoTime;
    cl_uint arg_no;

    cl_nbparam_params_t nbparams_params;
#ifdef DEBUG_OCL
    float * debug_buffer_h;
    size_t debug_buffer_size;
#endif

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel, the local x+q copy and later (not
       in this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction
       kernel, are needed for the non-local interactions. The skip of the
       local kernel call is taken care of later in this function. */
    if (iloc == eintNonlocal && plist->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_I(iloc))
    {
        adat_begin = 0;
        adat_len = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed HtoD section */

    /* HtoD x, q */
    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4, adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);

    /* When we get here all misc operations issued in the local stream as
       well as the local xq H2D are done, so we record that in the local
       stream and wait for it in the nonlocal one. */
    if (nb->bUseTwoStreams)
    {
        if (iloc == eintLocal)
        {
#ifdef CL_VERSION_1_2
            cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
#else
            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
#endif
            assert(CL_SUCCESS == cl_error);

            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is
             * needed in the local stream in order to be able to sync with
             * the above event from the non-local stream. */
            cl_error = clFlush(stream);
            assert(CL_SUCCESS == cl_error);
        }
        else
        {
            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
        }
    }

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (is not allowed with OpenCL).
         * TODO: Separate H2D and kernel launch into separate functions. */
        return;
    }

    /* beginning of timed nonbonded calculation section */

    /* get the pointer to the kernel flavor we need to use */
    nb_kernel = select_nbnxn_kernel(nb, nbp->eeltype, nbp->vdwtype, bCalcEner, plist->bDoPrune || always_prune);

    /* kernel launch config */
    local_work_size[0] = CL_SIZE;
    local_work_size[1] = CL_SIZE;
    local_work_size[2] = 1;

    global_work_size[0] = plist->nsci * local_work_size[0];
    global_work_size[1] = 1 * local_work_size[1];
    global_work_size[2] = 1 * local_work_size[2];

    validate_global_work_size(global_work_size, 3, nb->dev_info);

    shmem = calc_shmem_required();

#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        /* Allocate a host+device debug buffer sized to the launch grid the
           first time the instrumented step is reached. */
        if (DEBUG_RUN_STEP == run_step)
        {
            debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
            debug_buffer_h = (float*)calloc(1, debug_buffer_size);
            assert(NULL != debug_buffer_h);

            if (NULL == nb->debug_buffer)
            {
                nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, debug_buffer_size, debug_buffer_h, &cl_error);
                assert(CL_SUCCESS == cl_error);
            }
        }

        run_step++;
    }
#endif
    if (debug)
    {
        fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t" "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n", (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]), (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL, NCL_PER_SUPERCL, plist->na_c);
    }

    fillin_ocl_structures(nbp, &nbparams_params);

    /* Set all kernel arguments; errors are OR-ed and checked once below. */
    arg_no = 0;
    cl_error = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL); /* dynamic local memory */
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
    assert(cl_error == CL_SUCCESS);
    if (cl_error)
    {
        printf("ClERROR! %d\n", cl_error);
    }

    cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
    assert(cl_error == CL_SUCCESS);

#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        /* On the instrumented step: copy the debug buffer back, wait for
           completion, and dump it as a labelled 2-D table to a text file. */
        if (DEBUG_RUN_STEP == run_step)
        {
            FILE *pf;
            char file_name[256] = {0};

            ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0, debug_buffer_size, stream, NULL);

            // Make sure all data has been transfered back from device
            clFinish(stream);

            printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");

            sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
            pf = fopen(file_name, "wt");
            assert(pf != NULL);

            fprintf(pf, "%20s", "");
            for (int j = 0; j < global_work_size[0]; j++)
            {
                char label[20];
                sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
                fprintf(pf, "%20s", label);
            }

            for (int i = 0; i < global_work_size[1]; i++)
            {
                char label[20];
                sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
                fprintf(pf, "\n%20s", label);

                for (int j = 0; j < global_work_size[0]; j++)
                {
                    fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
                }

                //fprintf(pf, "\n");
            }

            fclose(pf);

            printf(" done.\n");

            free(debug_buffer_h);
            debug_buffer_h = NULL;
        }

        run_step++;
    }
#endif
}
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cl_platform_id cpPlatform; cl_device_id cdDevice; cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command queue cl_mem c_Kernel, d_Input, d_Buffer, d_Output; //OpenCL memory buffer objects cl_float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU; cl_int ciErrNum; const unsigned int imageW = 3072; const unsigned int imageH = 3072; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("oclConvolutionSeparable.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Allocating and initializing host memory...\n"); h_Kernel = (cl_float *)malloc(KERNEL_LENGTH * sizeof(cl_float)); h_Input = (cl_float *)malloc(imageW * imageH * sizeof(cl_float)); h_Buffer = (cl_float *)malloc(imageW * imageH * sizeof(cl_float)); h_OutputCPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float)); h_OutputGPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float)); srand(2009); for(unsigned int i = 0; i < KERNEL_LENGTH; i++) h_Kernel[i] = (cl_float)(rand() % 16); for(unsigned int i = 0; i < imageW * imageH; i++) h_Input[i] = (cl_float)(rand() % 16); shrLog("Initializing OpenCL...\n"); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Initializing OpenCL separable convolution...\n"); initConvolutionSeparable(cxGPUContext, cqCommandQueue, (const char **)argv); 
shrLog("Creating OpenCL memory objects...\n"); c_Kernel = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, KERNEL_LENGTH * sizeof(cl_float), h_Kernel, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageW * imageH * sizeof(cl_float), h_Input, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Applying separable convolution to %u x %u image...\n\n", imageW, imageH); //Just a single run or a warmup iteration convolutionRows( NULL, d_Buffer, d_Input, c_Kernel, imageW, imageH ); convolutionColumns( NULL, d_Output, d_Buffer, c_Kernel, imageW, imageH ); #ifdef GPU_PROFILING const int numIterations = 16; cl_event startMark, endMark; ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); shrDeltaT(0); for(int iter = 0; iter < numIterations; iter++){ convolutionRows( cqCommandQueue, d_Buffer, d_Input, c_Kernel, imageW, imageH ); convolutionColumns( cqCommandQueue, d_Output, d_Buffer, c_Kernel, imageW, imageH ); } ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); //Calculate performance metrics by wallclock time double gpuTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclConvolutionSeparable, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); //Get OpenCL profiler info cl_ulong startTime = 0, endTime = 0; ciErrNum = clGetEventProfilingInfo(startMark, 
CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL); ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime)/ (double)numIterations); #endif shrLog("Reading back OpenCL results...\n\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageW * imageH * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Comparing against Host/C++ computation...\n"); convolutionRowHost(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS); convolutionColumnHost(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS); double sum = 0, delta = 0; double L2norm; for(unsigned int i = 0; i < imageW * imageH; i++){ delta += (h_OutputCPU[i] - h_OutputGPU[i]) * (h_OutputCPU[i] - h_OutputGPU[i]); sum += h_OutputCPU[i] * h_OutputCPU[i]; } L2norm = sqrt(delta / sum); shrLog("Relative L2 norm: %.3e\n\n", L2norm); // cleanup closeConvolutionSeparable(); ciErrNum = clReleaseMemObject(d_Output); ciErrNum |= clReleaseMemObject(d_Buffer); ciErrNum |= clReleaseMemObject(d_Input); ciErrNum |= clReleaseMemObject(c_Kernel); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); free(h_OutputGPU); free(h_OutputCPU); free(h_Buffer); free(h_Input); free(h_Kernel); // finish shrQAFinishExit(argc, (const char **)argv, (L2norm < 1e-6) ? QA_PASSED : QA_FAILED); }
void enqueue() { // Errors in this function can not be handled by opencl_err.hpp // because they require non-standard error handling CAF_LOG_TRACE("command::enqueue()"); this->ref(); // reference held by the OpenCL comand queue cl_event event_k; auto data_or_nullptr = [](const dim_vec& vec) { return vec.empty() ? nullptr : vec.data(); }; // OpenCL expects cl_uint (unsigned int), hence the cast cl_int err = clEnqueueNDRangeKernel( queue_.get(), actor_facade_->kernel_.get(), static_cast<cl_uint>(actor_facade_->config_.dimensions().size()), data_or_nullptr(actor_facade_->config_.offsets()), data_or_nullptr(actor_facade_->config_.dimensions()), data_or_nullptr(actor_facade_->config_.local_dimensions()), static_cast<cl_uint>(mem_in_events_.size()), (mem_in_events_.empty() ? nullptr : mem_in_events_.data()), &event_k ); if (err != CL_SUCCESS) { CAF_LOGMF(CAF_ERROR, "clEnqueueNDRangeKernel: " << get_opencl_error(err)); clReleaseEvent(event_k); this->deref(); return; } else { enqueue_read_buffers(event_k, detail::get_indices(result_buffers_)); cl_event marker; #if defined(__APPLE__) err = clEnqueueMarkerWithWaitList( queue_.get(), static_cast<cl_uint>(mem_out_events_.size()), mem_out_events_.data(), &marker ); #else err = clEnqueueMarker(queue_.get(), &marker); #endif if (err != CL_SUCCESS) { CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err)); clReleaseEvent(marker); clReleaseEvent(event_k); this->deref(); // callback is not set return; } err = clSetEventCallback(marker, CL_COMPLETE, [](cl_event, cl_int, void* data) { auto cmd = reinterpret_cast<command*>(data); cmd->handle_results(); cmd->deref(); }, this); if (err != CL_SUCCESS) { CAF_LOGMF(CAF_ERROR, "clSetEventCallback: " << get_opencl_error(err)); clReleaseEvent(marker); clReleaseEvent(event_k); this->deref(); // callback is not set return; } err = clFlush(queue_.get()); if (err != CL_SUCCESS) { CAF_LOGMF(CAF_ERROR, "clFlush: " << get_opencl_error(err)); } 
mem_out_events_.push_back(std::move(event_k)); mem_out_events_.push_back(std::move(marker)); } }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cl_platform_id cpPlatform; //OpenCL platform cl_device_id cdDevice; //OpenCL device cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command que cl_mem d_Input, d_Output; //OpenCL memory buffer objects cl_int ciErrNum; float *h_Input, *h_OutputCPU, *h_OutputGPU; const uint imageW = 2048, imageH = 2048, stride = 2048; const int dir = DCT_FORWARD; shrQAStart(argc, argv); int use_gpu = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; } // set logfile name and start logs shrSetLogFileName ("oclDCT8x8.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Allocating and initializing host memory...\n"); h_Input = (float *)malloc(imageH * stride * sizeof(float)); h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float)); h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float)); srand(2009); for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++) h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX; shrLog("Initializing OpenCL...\n"); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 1, &cdDevice, NULL); oclCheckError(ciErrNum, CL_SUCCESS); //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Initializing OpenCL DCT 8x8...\n"); initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv); shrLog("Creating 
OpenCL memory objects...\n"); d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride * sizeof(cl_float), h_Input, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW); //Just a single iteration or a warmup iteration DCT8x8( cqCommandQueue, d_Output, d_Input, stride, imageH, imageW, dir ); #ifdef GPU_PROFILING const int numIterations = 16; cl_event startMark, endMark; ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); shrDeltaT(0); for(int iter = 0; iter < numIterations; iter++) DCT8x8( NULL, d_Output, d_Input, stride, imageH, imageW, dir ); ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); //Calculate performance metrics by wallclock time double gpuTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); //Get profiler time cl_ulong startTime = 0, endTime = 0; ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL); ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations); #endif shrLog("Reading back OpenCL results...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Comparing 
against Host/C++ computation...\n"); DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir); double sum = 0, delta = 0; double L2norm; for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++){ sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j]; delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]); } L2norm = sqrt(delta / sum); shrLog("Relative L2 norm: %.3e\n\n", L2norm); shrLog("Shutting down...\n"); //Release kernels and program closeDCT8x8(); //Release other OpenCL objects ciErrNum = clReleaseMemObject(d_Output); ciErrNum |= clReleaseMemObject(d_Input); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); //Release host buffers free(h_OutputGPU); free(h_OutputCPU); free(h_Input); //Finish shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-6) ? QA_PASSED : QA_FAILED); }
END_TEST
/* Checks that markers, barriers and user events interact correctly:
 * a marker behind an unfinished WaitForEvents must stay CL_QUEUED and
 * become CL_COMPLETE only once the blocking user event completes. */
START_TEST (test_misc_events)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_int result;
    cl_event uevent1, uevent2, marker1, marker2;

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if( result != CL_SUCCESS, "unable to get the default device" );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" );

    /*
     * This test will build a command queue blocked by an user event. The events
     * will be in this order :
     *
     * -: UserEvent1
     * 0: WaitForEvents1 (wait=UserEvent1)
     * 1: Marker1
     * -: UserEvent2
     * 2: WaitForEvents2 (wait=UserEvent2)
     * 3: Barrier
     * 4: Marker2 (to check the barrier worked)
     *
     * When the command queue is built, we :
     * - Check that Marker1 is Queued (WaitForEvents waits)
     * - Set UserEvent1 to Complete
     * - Check that Marker1 is Complete (WaitForEvents stopped to wait)
     * - Check that Marker2 is Queued (Barrier is there)
     * - Set UserEvent2 to Complete
     * - Check that Marker2 is Complete (no more barrier)
     */
    uevent1 = clCreateUserEvent(ctx, &result);
    fail_if( result != CL_SUCCESS, "unable to create UserEvent1" );

    uevent2 = clCreateUserEvent(ctx, &result);
    fail_if( result != CL_SUCCESS, "unable to create UserEvent2" );

    /* build the queue described above */
    result = clEnqueueWaitForEvents(queue, 1, &uevent1);
    fail_if( result != CL_SUCCESS, "unable to enqueue WaitForEvents(UserEvent1)" );

    result = clEnqueueMarker(queue, &marker1);
    fail_if( result != CL_SUCCESS, "unable to enqueue Marker1" );

    result = clEnqueueWaitForEvents(queue, 1, &uevent2);
    fail_if( result != CL_SUCCESS, "unable to enqueue WaitForEvents(UserEvent2)" );

    result = clEnqueueBarrier(queue);
    fail_if( result != CL_SUCCESS, "unable to enqueue Barrier" );

    result = clEnqueueMarker(queue, &marker2);
    fail_if( result != CL_SUCCESS, "unable to enqueue Marker2" );

    // Now the checks
    cl_int status;

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, 0);
    fail_if( result != CL_SUCCESS || status != CL_QUEUED, "Marker1 must be Queued" );

    result = clSetUserEventStatus(uevent1, CL_COMPLETE);
    fail_if( result != CL_SUCCESS, "unable to set UserEvent1 to Complete" );

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, 0);
    fail_if( result != CL_SUCCESS || status != CL_COMPLETE, "Marker1 must be Complete" );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, 0);
    fail_if( result != CL_SUCCESS || status != CL_QUEUED, "Marker2 must be Queued" );

    result = clSetUserEventStatus(uevent2, CL_COMPLETE);
    fail_if( result != CL_SUCCESS, "unable to set UserEvent2 to Complete" );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, 0);
    fail_if( result != CL_SUCCESS || status != CL_COMPLETE, "Marker2 must be Complete" );

    /* drain the queue and release everything */
    clFinish(queue);

    clReleaseEvent(uevent1);
    clReleaseEvent(uevent2);
    clReleaseEvent(marker1);
    clReleaseEvent(marker2);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}
/* Registers a newly created OpenCL command queue with the tracer.
 *
 * For an unseen queue this: grows the CommandQueues table, records the
 * queue's out-of-order property and device type, maps the queue to an
 * Extrae thread (reusing an existing thread name if one matches), and
 * takes paired host/device clock references by enqueueing a barrier(+
 * marker pre-1.2) and reading the event's SUBMIT profiling timestamp.
 * Any failure along the clock-reference path is fatal (exit).
 */
void Extrae_OpenCL_clCreateCommandQueue (cl_command_queue queue, cl_device_id device, cl_command_queue_properties properties)
{
    /* only register each queue once */
    if (!Extrae_OpenCL_lookForOpenCLQueue (queue, NULL))
    {
        cl_int err;
        char _threadname[THREAD_INFO_NAME_LEN];
        char _hostname[HOST_NAME_MAX];
        char *_device_type;
        int prev_threadid, found, idx;
        cl_device_type device_type;
        cl_event event;

        idx = nCommandQueues;

        /* grow the registration table by one entry */
        CommandQueues = (RegisteredCommandQueue_t*) realloc ( CommandQueues, sizeof(RegisteredCommandQueue_t)*(nCommandQueues+1));
        if (CommandQueues == NULL)
        {
            fprintf (stderr, PACKAGE_NAME": Fatal error! Failed to allocate memory for OpenCL Command Queues\n");
            exit (-1);
        }

        CommandQueues[idx].queue = queue;
        CommandQueues[idx].isOutOfOrder = (properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;

        /* classify the device so it shows up in the thread name */
        err = clGetDeviceInfo (device, CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
        if (err == CL_SUCCESS)
        {
            if (device_type == CL_DEVICE_TYPE_GPU)
                _device_type = "GPU";
            else if (device_type == CL_DEVICE_TYPE_CPU)
                _device_type = "CPU";
            else
                _device_type = "Other";
        }
        else
            _device_type = "Unknown";

        /* Was the thread created before (i.e. did we executed a
           cudadevicereset?) */
        if (gethostname(_hostname, HOST_NAME_MAX) == 0)
            sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx, _hostname);
        else
            sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx, "unknown-host");

        prev_threadid = Extrae_search_thread_name (_threadname, &found);
        if (found)
        {
            /* If thread name existed, reuse its thread id */
            CommandQueues[idx].threadid = prev_threadid;
        }
        else
        {
            /* For timing purposes we change num of threads here instead of
               doing Backend_getNumberOfThreads() + CUDAdevices */
            Backend_ChangeNumberOfThreads (Backend_getNumberOfThreads() + 1);

            CommandQueues[idx].threadid = Backend_getNumberOfThreads()-1;

            /* Set thread name */
            Extrae_set_thread_name (CommandQueues[idx].threadid, _threadname);
        }

        CommandQueues[idx].nevents = 0;

        /* Enqueue something whose profiling timestamp anchors the device
           clock: a barrier with an event on 1.2+, otherwise barrier+marker. */
#ifdef CL_VERSION_1_2
        err = clEnqueueBarrierWithWaitList (queue, 0, NULL, &event);
#else
        err = clEnqueueBarrier (queue);
        if (err == CL_SUCCESS)
            err = clEnqueueMarker (queue, &event);
#endif
        /* host-side timestamp taken as close as possible to the enqueue */
        CommandQueues[idx].host_reference_time = TIME;

        if (err == CL_SUCCESS)
        {
            err = clFinish(queue);
            if (err != CL_SUCCESS)
            {
                fprintf (stderr, PACKAGE_NAME": Error in clFinish (error = %d)! Dying...\n", err);
                exit (-1);
            }

            /* device-side timestamp of the same point in the queue */
            err = clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &(CommandQueues[idx].device_reference_time), NULL);
            if (err != CL_SUCCESS)
            {
                fprintf (stderr, PACKAGE_NAME": Error in clGetEventProfilingInfo (error = %d)! Dying...\n", err);
                exit (-1);
            }
        }
        else
        {
            fprintf (stderr, PACKAGE_NAME": Error while looking for clock references in host & accelerator\n");
            exit (-1);
        }

        nCommandQueues++;
    }
}