TEST_P(ocl_engine_test, BasicInteropC) { auto p = GetParam(); cl_device_id ocl_dev = (p.adev_kind == dev_kind::gpu) ? gpu_ocl_dev : (p.adev_kind == dev_kind::cpu) ? cpu_ocl_dev : nullptr; cl_context ocl_ctx = (p.actx_kind == ctx_kind::gpu) ? gpu_ocl_ctx : (p.actx_kind == ctx_kind::cpu) ? cpu_ocl_ctx : nullptr; SKIP_IF(p.adev_kind != dev_kind::null && !ocl_dev, "Required OpenCL device not found."); SKIP_IF(p.actx_kind != ctx_kind::null && !ocl_ctx, "Required OpenCL context not found."); SKIP_IF(cpu_ocl_dev == gpu_ocl_dev && (p.adev_kind == dev_kind::cpu || p.actx_kind == ctx_kind::cpu), "OpenCL CPU-only device not found."); mkldnn_engine_t eng; mkldnn_status_t s = mkldnn_engine_create_ocl(&eng, mkldnn_gpu, ocl_dev, ocl_ctx); EXPECT_EQ(s, p.expected_status); if (s == mkldnn_success) { cl_device_id dev; cl_context ctx; MKLDNN_CHECK(mkldnn_engine_get_ocl_device(eng, &dev)); MKLDNN_CHECK(mkldnn_engine_get_ocl_context(eng, &ctx)); EXPECT_EQ(dev, ocl_dev); EXPECT_EQ(ctx, ocl_ctx); cl_uint ref_count; OCL_CHECK(clGetContextInfo(ocl_ctx, CL_CONTEXT_REFERENCE_COUNT, sizeof(ref_count), &ref_count, nullptr)); int i_ref_count = int(ref_count); EXPECT_EQ(i_ref_count, 2); MKLDNN_CHECK(mkldnn_engine_destroy(eng)); OCL_CHECK(clGetContextInfo(ocl_ctx, CL_CONTEXT_REFERENCE_COUNT, sizeof(ref_count), &ref_count, nullptr)); i_ref_count = int(ref_count); EXPECT_EQ(i_ref_count, 1); } }
TEST_P(ocl_engine_test, BasicInteropCpp) { auto p = GetParam(); cl_device_id ocl_dev = (p.adev_kind == dev_kind::gpu) ? gpu_ocl_dev : (p.adev_kind == dev_kind::cpu) ? cpu_ocl_dev : nullptr; cl_context ocl_ctx = (p.actx_kind == ctx_kind::gpu) ? gpu_ocl_ctx : (p.actx_kind == ctx_kind::cpu) ? cpu_ocl_ctx : nullptr; SKIP_IF(p.adev_kind != dev_kind::null && !ocl_dev, "Required OpenCL device not found."); SKIP_IF(p.actx_kind != ctx_kind::null && !ocl_ctx, "Required OpenCL context not found."); SKIP_IF(cpu_ocl_dev == gpu_ocl_dev && (p.adev_kind == dev_kind::cpu || p.actx_kind == ctx_kind::cpu), "OpenCL CPU-only device not found."); catch_expected_failures( [&]() { { engine eng(engine::kind::gpu, ocl_dev, ocl_ctx); if (p.expected_status != mkldnn_success) { FAIL() << "Success not expected"; } cl_device_id dev = eng.get_ocl_device(); cl_context ctx = eng.get_ocl_context(); EXPECT_EQ(dev, ocl_dev); EXPECT_EQ(ctx, ocl_ctx); cl_uint ref_count; OCL_CHECK(clGetContextInfo(ocl_ctx, CL_CONTEXT_REFERENCE_COUNT, sizeof(ref_count), &ref_count, nullptr)); int i_ref_count = int(ref_count); EXPECT_EQ(i_ref_count, 2); } cl_uint ref_count; OCL_CHECK(clGetContextInfo(ocl_ctx, CL_CONTEXT_REFERENCE_COUNT, sizeof(ref_count), &ref_count, nullptr)); int i_ref_count = int(ref_count); EXPECT_EQ(i_ref_count, 1); }, p.expected_status != mkldnn_success, p.expected_status); }
status_t ocl_engine_t::init() { CHECK(cl_engine_t::init()); cl_int err = CL_SUCCESS; if (is_user_context_) { err = clRetainContext(context_); if (err != CL_SUCCESS) context_ = nullptr; } else { context_ = clCreateContext(nullptr, 1, &device_, nullptr, nullptr, &err); } OCL_CHECK(err); status_t status = ocl_utils::check_device(engine_kind::gpu, device_, context_); if (status != status::success) return status; stream_t *service_stream_ptr; status = create_stream(&service_stream_ptr, stream_flags::default_flags); if (status != status::success) return status; service_stream_.reset(service_stream_ptr); return status::success; }
void ConvolutionLayerSpatial<Dtype>::swizzleWeights( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top, int_tp swizzled_factor) { viennacl::ocl::context &ctx = viennacl::ocl::get_context( this->device_->id()); viennacl::ocl::program &program = this->device_->program(); viennacl::ocl::kernel &oclk_copy_weight = program.get_kernel( CL_KERNEL_SELECT("copyWeightsSwizzled")); cl_uint argIdx = 0; int_tp channels = this->channels_ / this->group_; oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx)); oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) swizzled_weights, &ctx)); oclk_copy_weight.arg(argIdx++, kernel_w_); oclk_copy_weight.arg(argIdx++, kernel_h_); oclk_copy_weight.arg(argIdx++, channels); oclk_copy_weight.arg(argIdx++, this->num_output_); oclk_copy_weight.arg(argIdx++, swizzled_factor); const size_t global_work_size_Copy[3] = { (size_t) (this->num_output_ * channels * kernel_w_ * kernel_h_), 1, 1 }; OCL_CHECK(clEnqueueNDRangeKernel(ctx.get_queue().handle().get(), oclk_copy_weight.handle().get(), 3, NULL, global_work_size_Copy, NULL, 0, NULL, NULL)); }
virtual void SetUp() { gpu_ocl_dev = find_ocl_device(CL_DEVICE_TYPE_GPU); cpu_ocl_dev = find_ocl_device(CL_DEVICE_TYPE_CPU); cl_int err; if (gpu_ocl_dev) { gpu_ocl_ctx = clCreateContext( nullptr, 1, &gpu_ocl_dev, nullptr, nullptr, &err); OCL_CHECK(err); } if (cpu_ocl_dev) { cpu_ocl_ctx = clCreateContext( nullptr, 1, &cpu_ocl_dev, nullptr, nullptr, &err); OCL_CHECK(err); } }
void PatternMatcher::findMatchesInText(cl_mem textBuffer, cl_mem patternBuffer, cl_mem matchBuffer, cl_uint textSize, cl_uint patternSize, cl_uint maxMismatch) { OCL_STATUS_INITIALIZE; cl_kernel kernel = OCL_CHECK(clCreateKernel(program, "find_matches", &OCL_STATUS)); OCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &textBuffer)); OCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &patternBuffer)); OCL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &matchBuffer)); OCL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uint), &patternSize)); OCL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uint), &maxMismatch)); size_t globalWorkSize[1] = { textSize - patternSize + 1 }; OCL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr)); OCL_CHECK(clReleaseKernel(kernel)); }
void PatternMatcher::computePrefixSum(cl_mem & inputBuffer, cl_uint bufferElementCount) { OCL_STATUS_INITIALIZE; cl_mem localBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_READ_WRITE, bufferElementCount * sizeof(cl_uint), nullptr, &OCL_STATUS)); cl_kernel kernel = OCL_CHECK(clCreateKernel(program, "prefix_sum_step", &OCL_STATUS)); size_t globalWorkSize[1] = { bufferElementCount }; OCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_uint), &bufferElementCount)); for (cl_uint offset = 1; offset < bufferElementCount; offset *= 2) { OCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_uint), &offset)); OCL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputBuffer)); OCL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &localBuffer)); OCL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr)); std::swap(inputBuffer, localBuffer); } OCL_CHECK(clReleaseMemObject(localBuffer)); OCL_CHECK(clReleaseKernel(kernel)); }
void PatternMatcher::packIndicesOfValueSteps(cl_mem packedIndicesBuffer, cl_mem valuesBuffer, cl_uint valuesCount) { OCL_STATUS_INITIALIZE; cl_kernel kernel = OCL_CHECK(clCreateKernel(program, "pack_indices_of_value_steps", &OCL_STATUS)); OCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &packedIndicesBuffer)); OCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &valuesBuffer)); OCL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_uint), &valuesCount)); size_t globalWorkSize[1] = { valuesCount }; OCL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr)); OCL_CHECK(clReleaseKernel(kernel)); }
PatternMatcher::~PatternMatcher() { OCL_STATUS_INITIALIZE; OCL_CHECK(clReleaseProgram(program)); OCL_CHECK(clReleaseCommandQueue(queue)); OCL_CHECK(clReleaseContext(context)); }
std::vector<cl_uint> PatternMatcher::findPattern(char const * pattern, char const * text, cl_uint maxMismatch) { OCL_STATUS_INITIALIZE; size_t textSize = strlen(text); size_t patternSize = strlen(pattern); size_t possibleMatchSites = textSize - patternSize + 1; // create a buffer for storing text on device and copy text to device cl_mem textBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_READ_ONLY, textSize, nullptr, &OCL_STATUS)); OCL_CHECK(clEnqueueWriteBuffer(queue, textBuffer, CL_FALSE, 0, textSize, text, 0, nullptr, nullptr)); // create a buffer for storing pattern on device and copy pattern to device cl_mem patternBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_READ_ONLY, patternSize, nullptr, &OCL_STATUS)); OCL_CHECK(clEnqueueWriteBuffer(queue, patternBuffer, CL_FALSE, 0, patternSize, pattern, 0, nullptr, nullptr)); // create a buffer for storing flags indicating starting positions of found matches size_t matchBufferSize = possibleMatchSites * sizeof(cl_uint); cl_mem matchBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_READ_WRITE, matchBufferSize, nullptr, &OCL_STATUS)); // launch kernel to find matches in text buffer if (maxMismatch == 0) { this->findExactMatchesInText(textBuffer, patternBuffer, matchBuffer, textSize, patternSize); } else { this->findMatchesInText(textBuffer, patternBuffer, matchBuffer, textSize, patternSize, maxMismatch); } // release the text and pattern buffers OCL_CHECK(clReleaseMemObject(textBuffer)); OCL_CHECK(clReleaseMemObject(patternBuffer)); // compute prefix sum of match-starts buffer this->computePrefixSum(matchBuffer, textSize); // read total of matches found from last element of match-starts buffer cl_uint matchCount = 0; cl_uint offsetOfFinalMatchBufferElement = (possibleMatchSites - 1) * sizeof(cl_uint); OCL_CHECK(clEnqueueReadBuffer(queue, matchBuffer, CL_TRUE, offsetOfFinalMatchBufferElement, sizeof(cl_uint), &matchCount, 0, nullptr, nullptr)); // create a buffer to store packed locations of match starts size_t locationsBufferSize = matchCount * sizeof(cl_uint); cl_mem locationsBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_WRITE_ONLY, locationsBufferSize, nullptr, &OCL_STATUS)); // pack the found location coordinates into the locations buffer this->packIndicesOfValueSteps(locationsBuffer, matchBuffer, possibleMatchSites); // release the match buffer OCL_CHECK(clReleaseMemObject(matchBuffer)); // read the match-start positions from the device into a vector to be returned to caller vector<cl_uint> matchLocations(matchCount); OCL_CHECK(clEnqueueReadBuffer(queue, locationsBuffer, CL_TRUE, 0, locationsBufferSize, &matchLocations[0], 0, nullptr, nullptr)); // release OpenCL resources OCL_CHECK(clReleaseMemObject(locationsBuffer)); // return vector match starts return matchLocations; }
void run_opencl_fo(HMM *word) { puts("\n=>GPU"); int N = word->nstates; int T = word->len; float *B = word->b; // T x N float *A = word->a; // N x N float *prior = word->pri; cl_ulong gstart, gend; double gpuTime; int i; float *At; // NxN At = (float*)malloc(sizeof(float)*N*N); // initialize for checking float *alpha; alpha = (float*)malloc(sizeof(float)*T*N); // T x B //init_2d_f(alpha,T,N,0.f); int blks = (N+255)/256; //float *alphasum; // T x 1 //alphasum = (float*)malloc(sizeof(float)*T); //init_1d_f(alphasum,T,0.f); float *lld; lld = (float*)malloc(sizeof(float)); lld[0] = 0.f; float *at_alpha; at_alpha = (float*)malloc(sizeof(float)*N); uint startPos_pre; uint startPos; int numK = 4; int numE = 2; cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*numK); cl_event *events = (cl_event*)malloc(sizeof(cl_event)*numE); //------------------------------------------------ // OpenCL //------------------------------------------------ cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program /* cl_event **eventwait = (cl_event**)malloc(sizeof(cl_event*)*2); for(i=0;i<2;++i){ eventwait[i] = (cl_event*)malloc(sizeof(cl_event)*2); } */ cl_int err; // read kernel file char *fileName = "ocl_fo_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); // kernel[0] = clCreateKernel(program, "transpose", &err); OCL_CHECK(err); kernel[1] = clCreateKernel(program, "init_alpha", &err); OCL_CHECK(err); kernel[2] = clCreateKernel(program, "mat_vec", &err); OCL_CHECK(err); kernel[3] = clCreateKernel(program, "alpha_dev", &err); OCL_CHECK(err); // allocate memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem At_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem lld_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, NULL); cl_mem B_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*T*N, NULL, NULL); cl_mem prior_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N, NULL, NULL); cl_mem alpha_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*T*N, NULL, NULL); cl_mem at_alpha_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N, NULL, NULL); // cl_mem alphasum_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*T, NULL, NULL); // // cl_mem alphasum_tmp_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*blks, NULL, NULL); // // warm up() device float *dummy = (float*)malloc(sizeof(float)); cl_mem dummy_d= clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, NULL); for(i=0;i<50;++i){ err = clEnqueueWriteBuffer(queue, dummy_d, CL_TRUE, 0, sizeof(float), dummy, 0, NULL, NULL); } // copy from host to device err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL, &events[0]); OCL_CHECK(err); err = clEnqueueWriteBuffer(queue, prior_d, CL_TRUE, 0, sizeof(float)*N, prior, 0, NULL, NULL); OCL_CHECK(err); err = clEnqueueWriteBuffer(queue, B_d, CL_TRUE, 0, sizeof(float)*T*N, B, 0, NULL, NULL); OCL_CHECK(err); // // err = clEnqueueWriteBuffer(queue, alpha_d, CL_TRUE, 0, sizeof(float)*T*N, alpha, 0, NULL, NULL); // OCL_CHECK(err); // // err = clEnqueueWriteBuffer(queue, alphasum_d, CL_TRUE, 0, sizeof(float)*T, alphasum, 0, NULL, NULL); // OCL_CHECK(err); err = clEnqueueWriteBuffer(queue, lld_d, CL_TRUE, 0, sizeof(float), lld, 0, NULL, NULL); OCL_CHECK(err); //---------------------------------------- transpose kernel -------------------------------------------// // 1st kernel: size_t local_0[2]; size_t global_0[2]; local_0[0]= 16; local_0[1]= 16; global_0[0] = N; global_0[1] = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &At_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 2, sizeof(float)*256, NULL); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 3, sizeof(int), &N); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, global_0, local_0, 0, NULL, NULL ); OCL_CHECK(err); //---------------------------------------- init_alpha kernel -------------------------------------------// // 2nd kernel: initialize alpha size_t local_1; size_t global_1; local_1 = 256; global_1= 256; err = clSetKernelArg(kernel[1], 0, sizeof(cl_mem), &B_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[1], 1, sizeof(cl_mem), &prior_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[1], 2, sizeof(cl_mem), &alpha_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[1], 3, sizeof(cl_mem), &lld_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[1], 4, sizeof(float)*256, NULL); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[1], 5, sizeof(int), &blks); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[1], 1, NULL, &global_1, &local_1, 0, NULL, NULL ); OCL_CHECK(err); // clFinish(queue); // clEnqueueReadBuffer(queue, alpha_d, CL_TRUE, 0, sizeof(float)*T*N, alpha, 0, NULL , NULL); // for(i=0;i<N;++i){ // printf("%.4e\n", alpha[i]); // } // printf("done!\n"); //---------------------------------------- matrix vector multiplication kernel -------------------------------------------// // 3rd kernel size_t local_2[2]; size_t global_2[2]; local_2[0]= 16; local_2[1]= 16; global_2[0] = 16; global_2[1] = N; err = clSetKernelArg(kernel[2], 0, sizeof(cl_mem), &At_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[2], 1, sizeof(cl_mem), &alpha_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[2], 2, sizeof(cl_mem), &at_alpha_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[2], 3, sizeof(int), &N); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} //---------------------------------------- alpha dev kernel -------------------------------------------// // 4th kernel size_t local_3; size_t global_3; local_3 = 256; global_3 = 256; err = clSetKernelArg(kernel[3], 0, sizeof(cl_mem), &B_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[3], 1, sizeof(cl_mem), &at_alpha_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[3], 2, sizeof(cl_mem), &alpha_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[3], 3, sizeof(cl_mem), &lld_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[3], 4, sizeof(float)*256, NULL); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[3], 5, sizeof(int), &N); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[3], 6, sizeof(int), &blks); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} int frame; //for(frame = 1 ; frame < 2; ++frame) for(frame = 1 ; frame < T; ++frame) { startPos = frame * N; startPos_pre = startPos - N; err = clSetKernelArg(kernel[2], 4, sizeof(uint), &startPos_pre); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} // At x alpha = at_alpha err = clEnqueueNDRangeKernel(queue, kernel[2], 2, NULL, global_2, local_2, 0, NULL, NULL ); OCL_CHECK(err); // alpha dev err = clSetKernelArg(kernel[3], 7, sizeof(uint), &startPos); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[3], 1, NULL, &global_3, &local_3, 0, NULL, NULL ); OCL_CHECK(err); //clFinish(queue); //clEnqueueReadBuffer(queue, lld_d, CL_TRUE, 0, sizeof(float), lld, 0, NULL , NULL); //printf("\n\n (%d) lld = %.4e\n", frame, lld[0]); // clFinish(queue); // clEnqueueReadBuffer(queue, alpha_d, CL_TRUE, 0, sizeof(float)*T*N, alpha, 0, NULL , NULL); // for(i=0; i<N; ++i){ // printf("%.4e\n", alpha[startPos + i]); // } } clFinish(queue); clEnqueueReadBuffer(queue, lld_d, CL_TRUE, 0, sizeof(float), lld , 0, NULL , &events[1]); printf("lld = %.4f\n", lld[0]); err = clWaitForEvents(1,&events[1]); OCL_CHECK(err); err = clGetEventProfilingInfo (events[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (events[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; printf("oclTime = %lf (s)\n", gpuTime); clReleaseMemObject(A_d); clReleaseMemObject(At_d); clReleaseMemObject(B_d); clReleaseMemObject(prior_d); clReleaseMemObject(alpha_d); clReleaseMemObject(lld_d); clReleaseMemObject(at_alpha_d); clReleaseMemObject(dummy_d); //clReleaseMemObject(alphasum_d); //clReleaseMemObject(alphasum_tmp_d); //clReleaseMemObject(alphamid_d); clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<numK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<numE;++i){ clReleaseEvent(events[i]); } free(kernelSource); free(At); free(alpha); free(lld); free(at_alpha); free(dummy); //free(alphasum); return; }
void run_opencl_backward(HMM *word) { puts("\n=>GPU"); int i; int N = word->nstates; int T = word->len; float *B = word->b; float *A = word->a; // gpu timer cl_ulong gstart, gend; double gpuTime; // cpu timer //struct timeval cstart; //struct timeval cend; double cpuTime; float *betaB; betaB= (float*)malloc(sizeof(float)*N); init_1d_f(betaB,N,0.f); float *beta; // NxT beta = (float*)malloc(sizeof(float)*N*T); init_2d_f(beta,N,T,0.f); for(i = 0 ; i < N ; ++i){ beta[i*T + T-1] = 1.f; } //------------------------------------------------ // OpenCL //------------------------------------------------ int chunks; chunks = (N+63)/64; cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*3); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*2); // read kernel file char *fileName = "backward_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); kernel[0] = clCreateKernel(program, "genbetaB", &err); OCL_CHECK(err); kernel[1] = clCreateKernel(program, "beta_dev", &err); OCL_CHECK(err); kernel[2] = clCreateKernel(program, "scale_beta", &err); OCL_CHECK(err); // allocate memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem B_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*T, NULL, NULL); cl_mem beta_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*T, NULL, NULL); cl_mem betaB_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N, NULL, NULL); cl_mem betasum_int_d= clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*chunks, NULL, NULL); cl_mem betasum_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, NULL); // warm up() device float *dummy = (float*)malloc(sizeof(float)); cl_mem dummy_d= clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, NULL); for(i=0;i<50;++i){ err = clEnqueueWriteBuffer(queue, dummy_d, CL_TRUE, 0, sizeof(float), dummy, 0, NULL, NULL); } // Initialize device memory err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL, &event[0]); OCL_CHECK(err); err = clEnqueueWriteBuffer(queue, B_d, CL_TRUE, 0, sizeof(float)*N*T, B, 0, NULL, NULL); OCL_CHECK(err); err = clEnqueueWriteBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL, NULL); OCL_CHECK(err); // 1st kernel: beta * B size_t local_1 = 64; size_t global_1 = chunks*64; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &beta_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &B_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &betaB_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[0], 3, sizeof(int), &N); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[0], 4, sizeof(int), &T); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} // 2nd kernel: A * betaB size_t local_2 = 64; size_t global_2 = chunks*64; err = clSetKernelArg(kernel[1], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[1], 1, sizeof(cl_mem), &betaB_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[1], 2, sizeof(cl_mem), &beta_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[1], 3, sizeof(cl_mem), &betasum_int_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[1], 4, sizeof(int), &N); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[1], 5, sizeof(int), &T); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} //err |= clSetKernelArg(kernel[1], 6, sizeof(int), &frame); //if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[1], 7, sizeof(float)*64, NULL); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} // 3nd kernel: beta/sum size_t local_3 = 64; size_t global_3 = chunks*64; err = clSetKernelArg(kernel[2], 0, sizeof(cl_mem), &beta_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[2], 1, sizeof(cl_mem), &betasum_int_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[2], 2, sizeof(int), &N); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[2], 3, sizeof(int), &T); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err |= clSetKernelArg(kernel[2], 5, sizeof(float), &chunks); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} // time capsule int frame; for(frame = (T-2) ; frame >= 0; frame--) { // 1st kernel : beta * B err |= clSetKernelArg(kernel[0], 5, sizeof(int), &frame); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, &global_1, &local_1, 0, NULL, NULL); OCL_CHECK(err); if(frame == (T-2) && 0) { clFinish(queue); clEnqueueReadBuffer(queue, betaB_d, CL_TRUE, 0, sizeof(float)*N, betaB, 0, NULL , NULL); check_1d_f(betaB, N); exit(1); } // 2nd kernel; betaB * A err |= clSetKernelArg(kernel[1], 6, sizeof(int), &frame); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[1], 1, NULL, &global_2, &local_2, 0, NULL, NULL); OCL_CHECK(err); if(frame == (T-2) && 0) { clFinish(queue); clEnqueueReadBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL , NULL); check_2d_f(beta, N, T); exit(1); } // 3rd kernle; scale beta err |= clSetKernelArg(kernel[2], 4, sizeof(int), &frame); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[2], 1, NULL, &global_3, &local_3, 0, NULL, NULL); OCL_CHECK(err); if(frame == (T-2) && 0) { clFinish(queue); clEnqueueReadBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL , NULL); check_2d_f(beta, N, T); exit(1); } } clFinish(queue); clEnqueueReadBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL , &event[1]); err = clWaitForEvents(1,&event[1]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; cpuTime = 0.0; printf("oclTime = %lf (s)\n", gpuTime + cpuTime); // check //check_2d_f(beta,N,T); clReleaseMemObject(A_d); clReleaseMemObject(B_d); clReleaseMemObject(beta_d); clReleaseMemObject(betaB_d); clReleaseMemObject(dummy_d); clReleaseMemObject(betasum_d); clReleaseMemObject(betasum_int_d); clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<3;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<2;++i){ clReleaseEvent(event[i]); } free(beta); free(betaB); free(kernelSource); free(dummy); return; }
status_t init() { OCL_CHECK(device_info_.init()); return status::success; }
void run1(int N, char *fileName) { puts("Matrix Vector Multiplication Naive\n"); int i,j; float *A; A = (float*)malloc(sizeof(float)*N*N); for( i = 0; i < N ; ++i ) { for( j = 0; j < N ; ++j ) { A[i*N + j] = 1.f; } } float *B; B = (float*)malloc(sizeof(float)*N); for( i = 0; i < N ; ++i ) { B[i] = 1.f; } float *C; C = (float*)malloc(sizeof(float)*N); #ifdef DEBUG puts("A"); check_2d_f(A,N,N); puts("B"); check_1d_f(B,N); #endif int NumK = 1; int NumE = 1; double gpuTime; cl_ulong gstart, gend; //------------------------------------------------ // OpenCL //------------------------------------------------ cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE); // read kernel file //char *fileName = "transpose_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); #ifdef SAVEBIN // Calculate size of binaries size_t binary_size; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); OCL_CHECK(err); unsigned char* bin; bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size); err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &bin, NULL); OCL_CHECK(err); // Print the binary out to the output file fh = fopen("kernel_mv_1.bin", "wb"); fwrite(bin, 1, binary_size, fh); fclose(fh); #endif kernel[0] = clCreateKernel(program, "mv_1", &err); OCL_CHECK(err); // memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem B_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N, NULL, NULL); cl_mem C_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N, NULL, NULL); // Initialize device memory err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , NULL); OCL_CHECK(err); err = clEnqueueWriteBuffer(queue, B_d, CL_TRUE, 0, sizeof(float)*N, B, 0, NULL , NULL); OCL_CHECK(err); size_t localsize = 64; size_t globalsize = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &B_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &C_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 3, sizeof(int), &N); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, &globalsize, &localsize, 0, NULL, &event[0]); OCL_CHECK(err); clFinish(queue); clEnqueueReadBuffer(queue, C_d, CL_TRUE, 0, sizeof(float)*N, C , 0, NULL , NULL ); err = clWaitForEvents(1,&event[0]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; //check_1d_f(sum, blks+1); #ifdef DEBUG puts("C = A * B"); check_1d_f(C,N); #endif printf("oclTime = %lf (s)\n", gpuTime ); // free clReleaseMemObject(A_d); clReleaseMemObject(B_d); clReleaseMemObject(C_d); clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<NumK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<NumE;++i){ clReleaseEvent(event[i]); } free(kernelSource); #ifdef SAVEBIN free(bin); #endif free(A); free(B); free(C); return; }
void runProgram(int N, char *fileName) { printf("GPU Symmetrize()..." "\nSquareMatrix[%d][%d]\n", N, N); int i,j; // initialize input array float *A; A = (float*)malloc(sizeof(float)*N*N); for( i = 0; i < N ; ++i ) { for( j = 0; j < N ; ++j ) { A[i*N + j] = j; } } // result float *Aout; Aout = (float*)malloc(sizeof(float)*N*N); #ifdef DEBUG puts("A"); check_2d_f(A,N,N); #endif int NumK = 1; int NumE = 2; double gpuTime; cl_ulong gstart, gend; //------------------------------------------------ // OpenCL //------------------------------------------------ cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE); // read kernel file //char *fileName = "transpose_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); #ifdef SAVEBIN // Calculate size of binaries size_t binary_size; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); OCL_CHECK(err); //printf("binary size = %ld\n", binary_size); unsigned char* bin; bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size); err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL); OCL_CHECK(err); //puts("save binaries"); // Print the binary out to the output file fh = fopen("kernel.bin", "wb"); fwrite(bin, 1, binary_size, fh); fclose(fh); puts("done save binaries"); #endif kernel[0] = clCreateKernel(program, "kernel_a", &err); OCL_CHECK(err); // memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem Aout_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); // copy data to device err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , &event[0]); OCL_CHECK(err); size_t localsize[2]; size_t globalsize[2]; localsize[0] = 16; localsize[1] = 16; globalsize[0] = N; globalsize[1] = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL); OCL_CHECK(err); clFinish(queue); // read device data back to host clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]); err = clWaitForEvents(1,&event[1]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; //check_1d_f(sum, blks+1); #ifdef DEBUG puts("Output"); check_2d_f(Aout,N,N); #endif printf("oclTime = %lf (s)\n", gpuTime ); // free clReleaseMemObject(A_d); clReleaseMemObject(Aout_d); // // check // int flag = 1; // for(i=0;i<N;++i){ // for(j=0;j<N;++j){ // if(A[i*N+j] != At[j*N+i]) // { // flag = 0; // break; // } // } // } // if( flag == 0 ) // { // puts("Bugs! Check program."); // }else{ // puts("Succeed!"); // } clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<NumK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<NumE;++i){ clReleaseEvent(event[i]); } free(kernelSource); #ifdef SAVEBIN free(bin); #endif free(A); free(Aout); return; }