// -------------------- extern "C" void magma_zgetmatrix( magma_int_t m, magma_int_t n, magmaDoubleComplex_const_ptr dA_src, size_t dA_offset, magma_int_t ldda, magmaDoubleComplex* hB_dst, magma_int_t ldhb, magma_queue_t queue ) { if (m <= 0 || n <= 0) return; size_t buffer_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 }; size_t host_orig[3] = { 0, 0, 0 }; size_t region[3] = { m*sizeof(magmaDoubleComplex), n, 1 }; cl_int err = clEnqueueReadBufferRect( queue, dA_src, CL_TRUE, // blocking buffer_origin, host_orig, region, ldda*sizeof(magmaDoubleComplex), 0, ldhb*sizeof(magmaDoubleComplex), 0, hB_dst, 0, NULL, g_event ); check_error( err ); }
// -------------------- magma_err_t magma_zgetmatrix( magma_int_t m, magma_int_t n, magmaDoubleComplex_const_ptr dA_src, size_t dA_offset, magma_int_t ldda, magmaDoubleComplex* hA_dst, size_t hA_offset, magma_int_t ldha, magma_queue_t queue ) { if (m<=0 || n <= 0) return MAGMA_SUCCESS; size_t buffer_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 }; size_t host_orig[3] = { hA_offset*sizeof(magmaDoubleComplex), 0, 0 }; size_t region[3] = { m*sizeof(magmaDoubleComplex), n, 1 }; cl_int err = clEnqueueReadBufferRect( queue, dA_src, CL_TRUE, // blocking buffer_origin, host_orig, region, ldda*sizeof(magmaDoubleComplex), 0, ldha*sizeof(magmaDoubleComplex), 0, hA_dst, 0, NULL, gevent ); return err; }
// -------------------- extern "C" void magma_cgetmatrix_async( magma_int_t m, magma_int_t n, magmaFloatComplex_const_ptr dA_src, size_t dA_offset, magma_int_t ldda, magmaFloatComplex* hB_dst, magma_int_t ldhb, magma_queue_t queue, magma_event_t *event ) { if (m <= 0 || n <= 0) return; size_t buffer_origin[3] = { dA_offset*sizeof(magmaFloatComplex), 0, 0 }; size_t host_orig[3] = { 0, 0, 0 }; size_t region[3] = { m*sizeof(magmaFloatComplex), n, 1 }; cl_int err = clEnqueueReadBufferRect( queue, dA_src, CL_FALSE, // non-blocking buffer_origin, host_orig, region, ldda*sizeof(magmaFloatComplex), 0, ldhb*sizeof(magmaFloatComplex), 0, hB_dst, 0, NULL, event ); clFlush(queue); check_error( err ); }
void DeviceMatrixCL3D_copyFromDevice(const DeviceMatrixCL3D& self, float* dst) { if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) { const int mem_size = self.dim_y *self.dim_t * self.pitch_y; TheContext * tc = new TheContext(); // printf("%d x %d\n",self.pitch_y,self.pitch_t); //printf("--->%d x %d x %d\n",self.dim_x,self.dim_y,self.dim_t); size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {self.dim_x * sizeof(float), self.dim_y, self.dim_t}; float prueba[5][2][3]; //PyArray_DATA(retval.ptr()); cl_int err = clEnqueueReadBufferRect( tc->getMyContext()->cqCommandQueue, self.dataMatrix, CL_TRUE, buffer_origin, host_origin, region, //self.pitch_y, self.dim_x * self.dim_y * sizeof(float), //self.pitch_y, 0, self.pitch_y, 0, self.dim_x * sizeof(float), 0, dst, 0, NULL, NULL); //std::cout<<prueba[2][2][2]<<" "<<prueba[0][0][2]<<endl; if (err != 0){ std::cout << "Error in copyFromDevice (CODE: " << err << ")" << std::endl; } } }
/* D2H */ void copyPitched(void* dst, cl_mem src, cl_command_queue queue, size_t w, size_t h, size_t pitch) const { const size_t offset[3] = {0, 0, 0}; size_t region[3] = {w, h, 1}; CHECK_CL(clEnqueueReadBufferRect( queue, src, Blocking<T_Async>::value, offset, // buffer origin offset, // host origin region, pitch, // buffer row pitch 0, // buffer slice pitch 0, // host row pitch 0, // host slice pitch dst, 0, // num_events_in_wait_list nullptr, // event_wait_list nullptr )); // event }
/*! Reads the bytes defined by \a rect and \a bufferBytesPerLine from this buffer into the supplied \a data array, with a line pitch of \a hostBytesPerLine. Returns true if the read was successful; false otherwise. This function will block until the request finishes. The request is executed on the active command queue for context(). This function is only supported in OpenCL 1.1 and higher. \sa readRectAsync(), writeRect() */ bool QCLBuffer::readRect (const QRect &rect, void *data, size_t bufferBytesPerLine, size_t hostBytesPerLine) { #ifdef QT_OPENCL_1_1 size_t bufferOrigin[3] = {rect.x(), rect.y(), 0}; size_t bufferRegion[3] = {rect.width(), rect.height(), 1}; static size_t const hostOrigin[3] = {0, 0, 0}; cl_int error = clEnqueueReadBufferRect (context()->activeQueue(), memoryId(), CL_TRUE, bufferOrigin, hostOrigin, bufferRegion, bufferBytesPerLine, 0, hostBytesPerLine, 0, data, 0, 0, 0); context()->reportError("QCLBuffer::readRect:", error); return error == CL_SUCCESS; #else context()->reportError("QCLBuffer::readRect:", CL_INVALID_OPERATION); Q_UNUSED(rect); Q_UNUSED(data); Q_UNUSED(bufferBytesPerLine); Q_UNUSED(hostBytesPerLine); return false; #endif }
/// Enqueues a command to read a rectangular region from \p buffer to /// host memory. /// /// \see_opencl_ref{clEnqueueReadBufferRect} /// /// \opencl_version_warning{1,1} void enqueue_read_buffer_rect(const buffer &buffer, const size_t buffer_origin[3], const size_t host_origin[3], const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *host_ptr, const wait_list &events = wait_list()) { BOOST_ASSERT(m_queue != 0); BOOST_ASSERT(buffer.get_context() == this->get_context()); BOOST_ASSERT(host_ptr != 0); cl_int ret = clEnqueueReadBufferRect( m_queue, buffer.get(), CL_TRUE, buffer_origin, host_origin, region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, host_ptr, events.size(), events.get_event_ptr(), 0 ); if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } }
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) { DEBUG_PRINTF(user_context, "CL: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf ); // Acquire the context so we can use the command queue. This also avoids multiple // redundant calls to clEnqueueReadBuffer when multiple threads are trying to copy // the same buffer. ClContext ctx(user_context); if (ctx.error != CL_SUCCESS) { return ctx.error; } if (buf->dev_dirty) { #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, buf->dev && buf->dev); halide_assert(user_context, halide_validate_dev_pointer(user_context, buf)); _dev_copy c = _make_dev_to_host_copy(buf); for (int w = 0; w < c.extent[3]; w++) { for (int z = 0; z < c.extent[2]; z++) { #ifdef ENABLE_OPENCL_11 // OpenCL 1.1 supports stride-aware memory transfers up to 3D, so we // can deal with the 2 innermost strides with OpenCL. uint64_t off = z * c.stride_bytes[2] + w * c.stride_bytes[3]; size_t offset[3] = { off, 0, 0 }; size_t region[3] = { c.chunk_size, c.extent[0], c.extent[1] }; DEBUG_PRINTF( user_context, " clEnqueueReadBufferRect ((%d, %d), (%p -> %p) + %d, %dx%dx%d bytes, %dx%d)\n", z, w, (void *)c.src, c.dst, (int)off, (int)region[0], (int)region[1], (int)region[2], (int)c.stride_bytes[0], (int)c.stride_bytes[1]); cl_int err = clEnqueueReadBufferRect(ctx.cmd_queue, (cl_mem)c.src, CL_FALSE, offset, offset, region, c.stride_bytes[0], c.stride_bytes[1], c.stride_bytes[0], c.stride_bytes[1], (void *)c.dst, 0, NULL, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clEnqueueReadBufferRect failed (%d)\n", err); return err; } #else for (int y = 0; y < c.extent[1]; y++) { for (int x = 0; x < c.extent[0]; x++) { uint64_t off = (x * c.stride_bytes[0] + y * c.stride_bytes[1] + z * c.stride_bytes[2] + w * c.stride_bytes[3]); void *src = (void *)(c.src + off); void *dst = (void *)(c.dst + off); uint64_t size = c.chunk_size; DEBUG_PRINTF( user_context, " clEnqueueReadBuffer ((%d, %d, %d, %d), %lld bytes, %p -> %p)\n", x, y, z, w, (long long)size, (void *)src, dst ); cl_int err = clEnqueueReadBuffer(ctx.cmd_queue, (cl_mem)c.src, CL_FALSE, off, size, dst, 0, NULL, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clEnqueueReadBuffer failed (%d)\n", err); return err; } } } #endif } } // The writes above are all non-blocking, so empty the command // queue before we proceed so that other host code won't read // bad data. clFinish(ctx.cmd_queue); #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif } buf->dev_dirty = false; return 0; }
int main() { /* OpenCL data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int i, j, err; /* Data and buffers */ float full_matrix[80], zero_matrix[80]; const size_t buffer_origin[3] = {5*sizeof(float), 3, 0}; const size_t host_origin[3] = {1*sizeof(float), 1, 0}; const size_t region[3] = {4*sizeof(float), 4, 1}; cl_mem matrix_buffer; /* Initialize data */ for(i=0; i<80; i++) { full_matrix[i] = i*1.0f; zero_matrix[i] = 0.0; } /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create the kernel */ program = build_program(context, device, PROGRAM_FILE); kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create a buffer to hold 80 floats */ matrix_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(full_matrix), full_matrix, &err); if(err < 0) { perror("Couldn't create a buffer object"); exit(1); } /* Set buffer as argument to the kernel */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &matrix_buffer); if(err < 0) { perror("Couldn't set the buffer as the kernel argument"); exit(1); } /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Enqueue command to write to buffer */ err = clEnqueueWriteBuffer(queue, matrix_buffer, CL_TRUE, 0, sizeof(full_matrix), full_matrix, 0, NULL, NULL); if(err < 0) { perror("Couldn't write to the buffer object"); exit(1); } /* Enqueue command to read rectangle of data */ err = clEnqueueReadBufferRect(queue, matrix_buffer, CL_TRUE, buffer_origin, host_origin, region, 10*sizeof(float), 0, 10*sizeof(float), 0, zero_matrix, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the rectangle from the buffer object"); exit(1); } /* Display updated buffer */ for(i=0; i<8; i++) { for(j=0; j<10; j++) { printf("%6.1f", zero_matrix[j+i*10]); } printf("\n"); } /* Deallocate resources */ clReleaseMemObject(matrix_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_context context; /* OpenCL 1.1 scalar data types */ cl_uint numOfPlatforms; cl_int error; cl_int hostBuffer[NUM_BUFFER_ELEMENTS] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a CPU/GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_uint i = 0; i < numOfPlatforms; i++ ) { cl_uint numOfDevices = 0; /* Determine how many devices are connected to your platform */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices); /* Load the information about your devices into the variable 'devices' */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } printf("Number of detected OpenCL devices: %d\n", numOfDevices); /* Create a context */ cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 }; context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* For each device, create a buffer and partition that data among the devices for compute! */ cl_mem UDObj = clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR, sizeof(int) * NUM_BUFFER_ELEMENTS, hostBuffer, &error); if(error != CL_SUCCESS) { perror("Can't create a buffer"); exit(1); } for(int i = 0; i < numOfDevices; ++i) { /* Create a command queue */ cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error); if (error != CL_SUCCESS) { perror("Unable to create command-queue"); exit(1); } cl_int outputPtr[16] = {-1, -1, -1, -1,-1, -1, -1, -1,-1, -1, -1, -1,-1, -1, -1, -1}; for(int idx = 0; idx < 4; ++ idx) { size_t buffer_origin[3] = {idx*2*sizeof(int), idx, 0}; size_t host_origin[3] = {idx*2*sizeof(int), idx, 0}; size_t region[3] = {2*sizeof(int), 2, 1}; /* Enqueue the read-back from device to host */ error = clEnqueueReadBufferRect( cQ, UDObj, CL_TRUE, // blocking read buffer_origin, host_origin, region, 0, // buffer_row_pitch 0, // buffer_slice_pitch 0, // host_row_pitch 0, // host_slice_pitch outputPtr, 0, NULL, NULL); } #ifdef DEBUG for(int i = 0; i < 16; i++) printf("%d\n", outputPtr[i]); #endif if (valuesOK(hostBuffer, outputPtr,16)) printf("Check passed!\n"); else printf("Check failed!\n"); /* Release the command queue */ clReleaseCommandQueue(cQ); /* Clean up */ }// end of device loop and execution clReleaseMemObject(UDObj); clReleaseContext(context); }// end of platform loop }//end of main
int main(int argc, char *argv[]) { int error, xsize, ysize, rgb_max; int *r, *b, *g; float *gray, *congray, *congray_cl; // identity kernel // float filter[] = { // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,1,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // }; // 45 degree motion blur float filter[] = {0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0, 0.0376, 0.1283, 0.0145, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0145, 0.1283, 0.0376, 0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0}; // mexican hat kernel // float filter[] = { // 0, 0,-1,-1,-1, 0, 0, // 0,-1,-3,-3,-3,-1, 0, // -1,-3, 0, 7, 0,-3,-1, // -1,-3, 7,24, 7,-3,-1, // -1,-3, 0, 7, 0,-3,-1, // 0,-1,-3,-3,-3,-1, 0, // 0, 0,-1,-1,-1, 0, 0 // }; if(argc != 3) { fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]); abort(); } const char* filename = argv[1]; const int num_loops = atoi(argv[2]); // -------------------------------------------------------------------------- // load image // -------------------------------------------------------------------------- printf("Reading ``%s''\n", filename); ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b); printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize); // -------------------------------------------------------------------------- // allocate CPU buffers // -------------------------------------------------------------------------- posix_memalign((void**)&gray, 32, xsize*ysize*sizeof(float)); if(!gray) { fprintf(stderr, "alloc gray"); abort(); } posix_memalign((void**)&congray, 32, xsize*ysize*sizeof(float)); if(!congray) { fprintf(stderr, "alloc gray"); abort(); } posix_memalign((void**)&congray_cl, 32, xsize*ysize*sizeof(float)); if(!congray_cl) { fprintf(stderr, "alloc gray"); abort(); } // -------------------------------------------------------------------------- // convert image to grayscale // -------------------------------------------------------------------------- for(int n = 0; n < xsize*ysize; ++n) gray[n] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max; // -------------------------------------------------------------------------- // execute filter on cpu // -------------------------------------------------------------------------- for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i) { for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j) { float sum = 0; for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k) { for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l) { sum += gray[(i+k)*xsize + (j+l)] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; } } congray[i*xsize + j] = sum; } } // -------------------------------------------------------------------------- // output cpu filtered image // -------------------------------------------------------------------------- printf("Writing cpu filtered image\n"); for(int n = 0; n < xsize*ysize; ++n) r[n] = g[n] = b[n] = (int)(congray[n] * rgb_max); error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // get an OpenCL context and queue // -------------------------------------------------------------------------- cl_context ctx; cl_command_queue queue; create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0); print_device_info_from_queue(queue); // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- char *knl_text = read_file("convolution.cl"); cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution", NULL); free(knl_text); #ifdef NON_OPTIMIZED int deviceWidth = xsize; #else int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX; #endif int deviceHeight = ysize; size_t deviceDataSize = deviceHeight*deviceWidth*sizeof(float); // -------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_int status; cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0, deviceDataSize, gray, 0, NULL, NULL)); #else size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {deviceWidth*sizeof(float), ysize, 1}; clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, gray, 0, NULL, NULL); #endif CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL)); // -------------------------------------------------------------------------- // run code on device // -------------------------------------------------------------------------- cl_int rows = ysize; cl_int cols = xsize; cl_int filterWidth = FILTER_WIDTH; cl_int paddingPixels = 2*HALF_FILTER_WIDTH; size_t local_size[] = { WGX, WGY }; size_t global_size[] = { ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0], ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1], }; cl_int localWidth = local_size[0] + paddingPixels; cl_int localHeight = local_size[1] + paddingPixels; size_t localMemSize = localWidth * localHeight * sizeof(float); CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray)); CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray)); CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter)); CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows)); CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols)); CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth)); CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL)); CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight)); CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth)); // -------------------------------------------------------------------------- // print kernel info // -------------------------------------------------------------------------- print_kernel_info(queue, knl); CALL_CL_SAFE(clFinish(queue)); timestamp_type tic, toc; get_timestamp(&tic); for(int loop = 0; loop < num_loops; ++loop) { CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL, global_size, local_size, 0, NULL, NULL)); // Edit: Copy the blurred image to input buffer #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueCopyBuffer(queue, buf_congray, buf_gray, 0, 0, deviceDataSize, 0, NULL, NULL)); #else clEnqueueCopyBufferRect(queue, buf_congray, buf_gray, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, 0, NULL, NULL); #endif } CALL_CL_SAFE(clFinish(queue)); get_timestamp(&toc); double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops; printf("%f s\n", elapsed); printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed); printf("%f GBit/s\n", 2*xsize*ysize*sizeof(float)/1e9/elapsed); printf("%f GFlop/s\n", (xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH) *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed); // -------------------------------------------------------------------------- // transfer back & check // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueReadBuffer( queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0, xsize * ysize * sizeof(float), congray_cl, 0, NULL, NULL)); #else buffer_origin[0] = 3*sizeof(float); buffer_origin[1] = 3; buffer_origin[2] = 0; host_origin[0] = 3*sizeof(float); host_origin[1] = 3; host_origin[2] = 0; region[0] = (xsize-paddingPixels)*sizeof(float); region[1] = (ysize-paddingPixels); region[2] = 1; clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, congray_cl, 0, NULL, NULL); #endif // -------------------------------------------------------------------------- // output OpenCL filtered image // -------------------------------------------------------------------------- printf("Writing OpenCL filtered image\n"); // Edit: Keep pixel value in the interval [0, 255] to reduce boundary effect for(int n = 0; n < xsize*ysize; ++n) { int color = (int)(congray_cl[n] * rgb_max); if (color < 0) { color = 0; } else if (color > 255) { color = 255; } r[n] = g[n] = b[n] = color; } error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_SAFE(clReleaseMemObject(buf_congray)); CALL_CL_SAFE(clReleaseMemObject(buf_gray)); CALL_CL_SAFE(clReleaseMemObject(buf_filter)); CALL_CL_SAFE(clReleaseKernel(knl)); CALL_CL_SAFE(clReleaseCommandQueue(queue)); CALL_CL_SAFE(clReleaseContext(ctx)); free(gray); free(congray); free(congray_cl); free(r); free(b); free(g); }
END_TEST START_TEST (test_read_write_rect) { cl_platform_id platform = 0; cl_device_id device; cl_context ctx; cl_command_queue queue; cl_int result; cl_mem buf, buf_part; // Grid xyz = (5 x 7 x 2) unsigned char grid[70] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 1, 2, 3, 2, 1, 1, 2, 2, 2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 3, 1, 0, 0, 2, 3, 2, 0, 1, 3, 3, 3, 1, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3 }; // Middle of the "image" : 3 x 3 x 2 centered at (3, 3) unsigned char part[18] = { 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 3, 1, 2, 3, 2, 3, 3, 3 }; unsigned char buffer[70], buffer_part[18]; size_t host_origin[3] = {0, 0, 0}; size_t buf_origin[3] = {0, 0, 0}; size_t region[3] = {5, 7, 2}; result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0); fail_if( result != CL_SUCCESS, "unable to get the default device" ); ctx = clCreateContext(0, 1, &device, 0, 0, &result); fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" ); queue = clCreateCommandQueue(ctx, device, 0, &result); fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" ); buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(buffer), buffer, &result); fail_if( result != CL_SUCCESS, "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer" ); buf_part = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(buffer_part), buffer_part, &result); fail_if( result != CL_SUCCESS, "cannot create a buffer for the part that will be read" ); // Write grid into buffer result = clEnqueueWriteBufferRect(queue, buf, 1, buf_origin, host_origin, region, 0, 0, 0, 0, grid, 0, 0, 0); fail_if( result != CL_SUCCESS, "cannot enqueue a blocking write buffer rect event with pitches guessed" ); fail_if( std::memcmp(buffer, grid, sizeof(buffer)) != 0, "buffer doesn't contain the data" ); // Read it back into a temporary region buf_origin[0] = 1; buf_origin[1] = 2; buf_origin[2] = 0; // host_origin remains (0, 0, 0) region[0] = 3; region[1] = 3; region[2] = 2; result = clEnqueueReadBufferRect(queue, buf, 1, buf_origin, host_origin, region, 5, 5*7, 0, 0, buffer_part, 0, 0, 0); fail_if( result != CL_SUCCESS, "unable to queue a blocking write buffer rect event with host pitches guessed" ); fail_if( std::memcmp(buffer_part, part, sizeof(part)) != 0, "the part of the buffer was not correctly read" ); // Clear the temporary region and re-read into it using buf_part std::memset(buffer_part, 0, sizeof(buffer_part)); cl_event event; result = clEnqueueCopyBufferRect(queue, buf, buf_part, buf_origin, host_origin, region, 5, 5*7, 0, 0, 0, 0, &event); fail_if( result != CL_SUCCESS, "unable to queue a copy buffer rect event" ); result = clWaitForEvents(1, &event); fail_if( result != CL_SUCCESS, "unable to wait for the event" ); fail_if( std::memcmp(buffer_part, part, sizeof(part)) != 0, "the part of the buffer was not correctly read using a buffer" ); clReleaseEvent(event); clReleaseMemObject(buf_part); clReleaseMemObject(buf); clReleaseCommandQueue(queue); clReleaseContext(ctx); }
/** Thread that sends data from server to client in * 2D or 3D mode. * @param data struct dataTransfer casted variable. * @return NULL */ void *asyncDataSendRect_thread(void *data) { unsigned int i,j,k,n; size_t buffsize = BUFF_SIZE*sizeof(char); struct dataSend* _data = (struct dataSend*)data; // We may wait manually for the events provided because // OpenCL can only waits their events, but ocalnd event // can be relevant. We will not check for errors, // assuming than events can be wrong, but is to late to // try to report a fail. if(_data->num_events_in_wait_list){ oclandWaitForEvents(_data->num_events_in_wait_list, _data->event_wait_list); } // Call to OpenCL size_t host_origin[3] = {0, 0, 0}; clEnqueueReadBufferRect(_data->command_queue,_data->mem,CL_FALSE, _data->buffer_origin,host_origin,_data->region, _data->buffer_row_pitch,_data->buffer_slice_pitch, _data->host_row_pitch,_data->host_slice_pitch, _data->ptr,0,NULL,&(_data->event->event)); // Start sending data to client int *fd = &(_data->fd); Send(fd, &buffsize, sizeof(size_t), 0); // Compute the number of packages needed n = _data->host_row_pitch / buffsize; // Wait until data is copied here. We will not test // for errors, user can do it later clWaitForEvents(1,&(_data->event->event)); // Send the rows size_t origin = 0; for(j=0;j<_data->region[1];j++){ for(k=0;k<_data->region[2];k++){ // Send package by pieces for(i=0;i<n;i++){ Send(fd, _data->ptr + i*buffsize + origin, buffsize, 0); } if(_data->host_row_pitch % buffsize){ // Remains some data to arrive Send(fd, _data->ptr + n*buffsize + origin, _data->host_row_pitch % buffsize, 0); } // Compute the new origin origin += _data->host_row_pitch; } } free(_data->buffer_origin); _data->buffer_origin = NULL; free(_data->region); _data->region = NULL; free(_data->ptr); _data->ptr = NULL; if(_data->event){ _data->event->status = CL_COMPLETE; } if(_data->want_event != CL_TRUE){ free(_data->event); _data->event = NULL; } if(_data->event_wait_list) free(_data->event_wait_list); _data->event_wait_list=NULL; // shutdown(fd, 2); // shutdown(_data->fd, 2); // Destroy the server to free the port close(fd); close(_data->fd); free(_data); _data=NULL; pthread_exit(NULL); return NULL; }
int main(int argc, char** argv) { // Set up the data on the host clock_t start, start0; start0 = clock(); start = clock(); // Rows and columns in the input image int imageHeight; int imageWidth; const char* inputFile = "input.bmp"; const char* outputFile = "output.bmp"; // Homegrown function to read a BMP from file float* inputImage = readImage(inputFile, &imageWidth, &imageHeight); // Size of the input and output images on the host int dataSize = imageHeight*imageWidth*sizeof(float); // Pad the number of columns #ifdef NON_OPTIMIZED int deviceWidth = imageWidth; #else // READ_ALIGNED || READ4 int deviceWidth = roundUp(imageWidth, WGX); #endif int deviceHeight = imageHeight; // Size of the input and output images on the device int deviceDataSize = imageHeight*deviceWidth*sizeof(float); // Output image on the host float* outputImage = NULL; outputImage = (float*)malloc(dataSize); int i, j; for(i = 0; i < imageHeight; i++) { for(j = 0; j < imageWidth; j++) { outputImage[i*imageWidth+j] = 0; } } // 45 degree motion blur float filter[49] = {0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0, 0.0376, 0.1283, 0.0145, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0145, 0.1283, 0.0376, 0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0}; int filterWidth = 7; int paddingPixels = (int)(filterWidth/2) * 2; stoptime(start, "set up input, output."); start = clock(); // Set up the OpenCL environment // Discovery platform cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); // Discover device cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); size_t time_res; clGetDeviceInfo(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL); printf("Device profiling timer resolution: %zu ns.\n", time_res); // Create context cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0}; cl_context context; context = clCreateContext(props, 1, &device, NULL, NULL, NULL); // Create command queue cl_ulong time_start, time_end, exec_time; cl_event timing_event; cl_command_queue queue; queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL); // Create memory buffers cl_mem d_inputImage; cl_mem d_outputImage; cl_mem d_filter; d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, deviceDataSize, NULL, NULL); d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, deviceDataSize, NULL, NULL); d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, 49*sizeof(float),NULL, NULL); // Write input data to the device #ifdef NON_OPTIMIZED clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize, inputImage, 0, NULL, NULL); #else // READ_ALIGNED || READ4 size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {deviceWidth*sizeof(float), imageHeight, 1}; clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, inputImage, 0, NULL, NULL); #endif // Write the filter to the device clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, 49*sizeof(float), filter, 0, NULL, NULL); // Read in the program from file char* source = readSource("convolution.cl"); // Create the program cl_program program; // Create and compile the program program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, NULL); cl_int build_status; build_status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Create the kernel cl_kernel kernel; #if defined NON_OPTIMIZED || defined READ_ALIGNED // Only the host-side code differs for the aligned reads kernel = clCreateKernel(program, "convolution", NULL); #else // READ4 kernel = clCreateKernel(program, "convolution_read4", NULL); #endif // Selected work group size is 16x16 int wgWidth = WGX; int wgHeight = WGY; // When computing the total number of work items, the // padding work items do not need to be considered int totalWorkItemsX = roundUp(imageWidth-paddingPixels, wgWidth); int totalWorkItemsY = roundUp(imageHeight-paddingPixels, wgHeight); // Size of a work group size_t localSize[2] = {wgWidth, wgHeight}; // Size of the NDRange size_t globalSize[2] = {totalWorkItemsX, totalWorkItemsY}; // The amount of local data that is cached is the size of the // work groups plus the padding pixels #if defined NON_OPTIMIZED || defined READ_ALIGNED int localWidth = localSize[0] + paddingPixels; #else // READ4 // Round the local width up to 4 for the read4 kernel int localWidth = roundUp(localSize[0]+paddingPixels, 4); #endif int localHeight = localSize[1] + paddingPixels; // Compute the size of local memory (needed for dynamic // allocation) size_t localMemSize = (localWidth * localHeight * sizeof(float)); // Set the kernel arguments clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage); clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage); clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_filter); clSetKernelArg(kernel, 3, sizeof(int), &deviceHeight); clSetKernelArg(kernel, 4, sizeof(int), &deviceWidth); clSetKernelArg(kernel, 5, sizeof(int), &filterWidth); clSetKernelArg(kernel, 6, localMemSize, NULL); clSetKernelArg(kernel, 7, sizeof(int), &localHeight); clSetKernelArg(kernel, 8, sizeof(int), &localWidth); stoptime(start, "set up kernel"); start = clock(); // Execute the kernel clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, &timing_event); // Wait for kernel to complete clFinish(queue); stoptime(start, "run kernel"); clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); exec_time = time_end-time_start; printf("Profile execution time = %.3lf sec.\n", (double) exec_time/1000000000); // Read back the output image #ifdef NON_OPTIMIZED clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, deviceDataSize, outputImage, 0, NULL, NULL); #else // READ_ALIGNED || READ4 // Begin reading output from (3,3) on the device // (for 7x7 filter with radius 3) buffer_origin[0] = 3*sizeof(float); buffer_origin[1] = 3; buffer_origin[2] = 0; // Read data into (3,3) on the host host_origin[0] = 3*sizeof(float); host_origin[1] = 3; host_origin[2] = 0; // Region is image size minus padding pixels region[0] = (imageWidth-paddingPixels)*sizeof(float); region[1] = (imageHeight-paddingPixels); region[2] = 1; // Perform the read clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, outputImage, 0, NULL, NULL); #endif // Homegrown function to write the image to file storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile); // Free OpenCL objects clReleaseMemObject(d_inputImage); clReleaseMemObject(d_outputImage); clReleaseMemObject(d_filter); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; }