Example #1
0
// --------------------
extern "C" void
magma_zgetmatrix(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_const_ptr dA_src, size_t dA_offset, magma_int_t ldda,
    magmaDoubleComplex*          hB_dst,                   magma_int_t ldhb,
    magma_queue_t queue )
{
    if (m <= 0 || n <= 0)
       return;

    size_t buffer_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t host_orig[3]     = { 0, 0, 0 };
    size_t region[3]        = { m*sizeof(magmaDoubleComplex), n, 1 };
    cl_int err = clEnqueueReadBufferRect(
        queue, dA_src, CL_TRUE,  // blocking
        buffer_origin, host_orig, region,
        ldda*sizeof(magmaDoubleComplex), 0,
        ldhb*sizeof(magmaDoubleComplex), 0,
        hB_dst, 0, NULL, g_event );
    check_error( err );
}
Example #2
0
// --------------------
magma_err_t
magma_zgetmatrix(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_const_ptr dA_src, size_t dA_offset, magma_int_t ldda,
    magmaDoubleComplex*          hA_dst, size_t hA_offset, magma_int_t ldha,
    magma_queue_t queue )
{
    if (m<=0 || n <= 0)
      return MAGMA_SUCCESS;

    size_t buffer_origin[3] = { dA_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t host_orig[3]     = { hA_offset*sizeof(magmaDoubleComplex), 0, 0 };
    size_t region[3]        = { m*sizeof(magmaDoubleComplex), n, 1 };
    cl_int err = clEnqueueReadBufferRect(
        queue, dA_src, CL_TRUE,  // blocking
        buffer_origin, host_orig, region,
        ldda*sizeof(magmaDoubleComplex), 0,
        ldha*sizeof(magmaDoubleComplex), 0,
        hA_dst, 0, NULL, gevent );
    return err;
}
Example #3
0
// --------------------
extern "C" void
magma_cgetmatrix_async(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex_const_ptr dA_src, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex*          hB_dst,                   magma_int_t ldhb,
    magma_queue_t queue, magma_event_t *event )
{
    if (m <= 0 || n <= 0)
        return;

    size_t buffer_origin[3] = { dA_offset*sizeof(magmaFloatComplex), 0, 0 };
    size_t host_orig[3]     = { 0, 0, 0 };
    size_t region[3]        = { m*sizeof(magmaFloatComplex), n, 1 };
    cl_int err = clEnqueueReadBufferRect(
        queue, dA_src, CL_FALSE,  // non-blocking
        buffer_origin, host_orig, region,
        ldda*sizeof(magmaFloatComplex), 0,
        ldhb*sizeof(magmaFloatComplex), 0,
        hB_dst, 0, NULL, event );
    clFlush(queue);
    check_error( err );
}
Example #4
0
void DeviceMatrixCL3D_copyFromDevice(const DeviceMatrixCL3D& self, float* dst)
{
    if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) {
	
		const int mem_size = self.dim_y *self.dim_t * self.pitch_y;
	    	
        TheContext * tc = new TheContext();

	 //   printf("%d x %d\n",self.pitch_y,self.pitch_t);

		//printf("--->%d  x %d  x  %d\n",self.dim_x,self.dim_y,self.dim_t);
		
		size_t buffer_origin[3] = {0,0,0};
		size_t host_origin[3] = {0,0,0};	
        size_t region[3] = {self.dim_x * sizeof(float),
            self.dim_y,
            self.dim_t};	
		float prueba[5][2][3];
		//PyArray_DATA(retval.ptr());
        cl_int err =
		clEnqueueReadBufferRect(
			tc->getMyContext()->cqCommandQueue,
			self.dataMatrix, CL_TRUE,
			buffer_origin, host_origin, region,
			//self.pitch_y, self.dim_x * self.dim_y * sizeof(float),
			//self.pitch_y, 0,
			self.pitch_y, 0,
			self.dim_x * sizeof(float), 0,
			dst,
			0, NULL, NULL);
			//std::cout<<prueba[2][2][2]<<" "<<prueba[0][0][2]<<endl;
		
        if (err != 0){
            std::cout << "Error in copyFromDevice (CODE: " << err << ")" << std::endl;		
		}
	}
}
Example #5
0
 /* D2H */
 void copyPitched(void* dst,
                  cl_mem src,
                  cl_command_queue queue,
                  size_t w,
                  size_t h,
                  size_t pitch) const
 {
   const size_t offset[3] = {0, 0, 0};
   size_t region[3] = {w, h, 1};
   CHECK_CL(clEnqueueReadBufferRect( queue,
                                     src,
                                     Blocking<T_Async>::value,
                                     offset, // buffer origin
                                     offset, // host origin
                                     region,
                                     pitch, // buffer row pitch
                                     0, // buffer slice pitch
                                     0, // host row pitch
                                     0, // host slice pitch
                                     dst,
                                     0, // num_events_in_wait_list
                                     nullptr, // event_wait_list
                                     nullptr )); // event
 }
Example #6
0
/*!
    Reads the bytes defined by \a rect and \a bufferBytesPerLine
    from this buffer into the supplied \a data array, with a line
    pitch of \a hostBytesPerLine.  Returns true if the read
    was successful; false otherwise.

    This function will block until the request finishes.
    The request is executed on the active command queue for context().

    This function is only supported in OpenCL 1.1 and higher.

    \sa readRectAsync(), writeRect()
*/
bool QCLBuffer::readRect
(const QRect &rect, void *data,
 size_t bufferBytesPerLine, size_t hostBytesPerLine)
{
#ifdef QT_OPENCL_1_1
    size_t bufferOrigin[3] = {rect.x(), rect.y(), 0};
    size_t bufferRegion[3] = {rect.width(), rect.height(), 1};
    static size_t const hostOrigin[3] = {0, 0, 0};
    cl_int error = clEnqueueReadBufferRect
                   (context()->activeQueue(), memoryId(),
                    CL_TRUE, bufferOrigin, hostOrigin, bufferRegion,
                    bufferBytesPerLine, 0, hostBytesPerLine, 0,
                    data, 0, 0, 0);
    context()->reportError("QCLBuffer::readRect:", error);
    return error == CL_SUCCESS;
#else
    context()->reportError("QCLBuffer::readRect:", CL_INVALID_OPERATION);
    Q_UNUSED(rect);
    Q_UNUSED(data);
    Q_UNUSED(bufferBytesPerLine);
    Q_UNUSED(hostBytesPerLine);
    return false;
#endif
}
Example #7
0
    /// Enqueues a command to read a rectangular region from \p buffer to
    /// host memory.
    ///
    /// \see_opencl_ref{clEnqueueReadBufferRect}
    ///
    /// \opencl_version_warning{1,1}
    void enqueue_read_buffer_rect(const buffer &buffer,
                                  const size_t buffer_origin[3],
                                  const size_t host_origin[3],
                                  const size_t region[3],
                                  size_t buffer_row_pitch,
                                  size_t buffer_slice_pitch,
                                  size_t host_row_pitch,
                                  size_t host_slice_pitch,
                                  void *host_ptr,
                                  const wait_list &events = wait_list())
    {
        BOOST_ASSERT(m_queue != 0);
        BOOST_ASSERT(buffer.get_context() == this->get_context());
        BOOST_ASSERT(host_ptr != 0);

        cl_int ret = clEnqueueReadBufferRect(
            m_queue,
            buffer.get(),
            CL_TRUE,
            buffer_origin,
            host_origin,
            region,
            buffer_row_pitch,
            buffer_slice_pitch,
            host_row_pitch,
            host_slice_pitch,
            host_ptr,
            events.size(),
            events.get_event_ptr(),
            0
        );

        if(ret != CL_SUCCESS){
            BOOST_THROW_EXCEPTION(opencl_error(ret));
        }
    }
Example #8
0
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF(user_context, "CL: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf );

    // Acquire the context so we can use the command queue. This also avoids multiple
    // redundant calls to clEnqueueReadBuffer when multiple threads are trying to copy
    // the same buffer.
    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    if (buf->dev_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        halide_assert(user_context, buf->dev && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_dev_to_host_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
#ifdef ENABLE_OPENCL_11
                // OpenCL 1.1 supports stride-aware memory transfers up to 3D, so we
                // can deal with the 2 innermost strides with OpenCL.
                uint64_t off = z * c.stride_bytes[2] + w * c.stride_bytes[3];

                size_t offset[3] = { off, 0, 0 };
                size_t region[3] = { c.chunk_size, c.extent[0], c.extent[1] };

                DEBUG_PRINTF( user_context, "    clEnqueueReadBufferRect ((%d, %d), (%p -> %p) + %d, %dx%dx%d bytes, %dx%d)\n",
                              z, w,
                              (void *)c.src, c.dst, (int)off,
                              (int)region[0], (int)region[1], (int)region[2],
                              (int)c.stride_bytes[0], (int)c.stride_bytes[1]);

                cl_int err = clEnqueueReadBufferRect(ctx.cmd_queue, (cl_mem)c.src, CL_FALSE,
                                                     offset, offset, region,
                                                     c.stride_bytes[0], c.stride_bytes[1],
                                                     c.stride_bytes[0], c.stride_bytes[1],
                                                     (void *)c.dst,
                                                     0, NULL, NULL);

                if (err != CL_SUCCESS) {
                    halide_error_varargs(user_context, "CL: clEnqueueReadBufferRect failed (%d)\n", err);
                    return err;
                }
#else
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        void *src = (void *)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;

                        DEBUG_PRINTF( user_context, "    clEnqueueReadBuffer ((%d, %d, %d, %d), %lld bytes, %p -> %p)\n",
                                      x, y, z, w,
                                      (long long)size, (void *)src, dst );

                        cl_int err = clEnqueueReadBuffer(ctx.cmd_queue, (cl_mem)c.src,
                                                         CL_FALSE, off, size, dst, 0, NULL, NULL);
                        if (err != CL_SUCCESS) {
                            halide_error_varargs(user_context, "CL: clEnqueueReadBuffer failed (%d)\n", err);
                            return err;
                        }
                    }
                }
#endif
            }
        }
        // The writes above are all non-blocking, so empty the command
        // queue before we proceed so that other host code won't read
        // bad data.
        clFinish(ctx.cmd_queue);

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->dev_dirty = false;
    return 0;
}
int main() {

   /* OpenCL data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel;
   cl_int i, j, err;

   /* Data and buffers */
   float full_matrix[80], zero_matrix[80];
   const size_t buffer_origin[3] = {5*sizeof(float), 3, 0};
   const size_t host_origin[3] = {1*sizeof(float), 1, 0};
   const size_t region[3] = {4*sizeof(float), 4, 1};
   cl_mem matrix_buffer;

   /* Initialize data */
   for(i=0; i<80; i++) {
      full_matrix[i] = i*1.0f;
      zero_matrix[i] = 0.0;
   }   
   
   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Build the program and create the kernel */
   program = build_program(context, device, PROGRAM_FILE);
   kernel = clCreateKernel(program, KERNEL_FUNC, &err);
   if(err < 0) {
      perror("Couldn't create a kernel");
      exit(1);   
   };

   /* Create a buffer to hold 80 floats */
   matrix_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | 
      CL_MEM_COPY_HOST_PTR, sizeof(full_matrix), full_matrix, &err);
   if(err < 0) {
      perror("Couldn't create a buffer object");
      exit(1);   
   }

   /* Set buffer as argument to the kernel */
   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &matrix_buffer);
   if(err < 0) {
      perror("Couldn't set the buffer as the kernel argument");
      exit(1);   
   }

   /* Create a command queue */
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };   
   
   /* Enqueue kernel */
   err = clEnqueueTask(queue, kernel, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      exit(1);   
   }

   /* Enqueue command to write to buffer */
   err = clEnqueueWriteBuffer(queue, matrix_buffer, CL_TRUE, 0,
         sizeof(full_matrix), full_matrix, 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn't write to the buffer object");
      exit(1);   
   }

   /* Enqueue command to read rectangle of data */
   err = clEnqueueReadBufferRect(queue, matrix_buffer, CL_TRUE, 
         buffer_origin, host_origin, region, 10*sizeof(float), 0, 
         10*sizeof(float), 0, zero_matrix, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the rectangle from the buffer object");
      exit(1);   
   }

   /* Display updated buffer */
   for(i=0; i<8; i++) {
      for(j=0; j<10; j++) {
         printf("%6.1f", zero_matrix[j+i*10]);
      }
      printf("\n");
   }

   /* Deallocate resources */
   clReleaseMemObject(matrix_buffer);
   clReleaseKernel(kernel);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);

   return 0;
}
int main(int argc, char** argv) {

   /* OpenCL 1.1 data structures */
   cl_platform_id* platforms;
   cl_context context;

   /* OpenCL 1.1 scalar data types */
   cl_uint numOfPlatforms;
   cl_int  error;

   cl_int hostBuffer[NUM_BUFFER_ELEMENTS] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
   /* 
      Get the number of platforms 
      Remember that for each vendor's SDK installed on the computer,
      the number of available platform also increased. 
    */
   error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }

   platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms);
   printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);

   error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
   if(error != CL_SUCCESS ) {			
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }
   // Search for a CPU/GPU device through the installed platforms
   // Build a OpenCL program and do not run it.
   for(cl_uint i = 0; i < numOfPlatforms; i++ ) {

        cl_uint numOfDevices = 0;

        /* Determine how many devices are connected to your platform */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices);

        /* Load the information about your devices into the variable 'devices' */
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL);
        if (error != CL_SUCCESS ) { 
            perror("Unable to obtain any OpenCL compliant device info");
            exit(1);
        }
        printf("Number of detected OpenCL devices: %d\n", numOfDevices);

	    /* Create a context */
        cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 };
	    context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a valid OpenCL context");
	        exit(1);
	    }

	    /* For each device, create a buffer and partition that data among the devices for compute! */
	    cl_mem UDObj = clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR, 
	                                  sizeof(int) * NUM_BUFFER_ELEMENTS, hostBuffer, &error);
	    if(error != CL_SUCCESS) {
	        perror("Can't create a buffer");
	        exit(1);
	    }

        for(int i = 0; i < numOfDevices; ++i) {
	
	            /* Create a command queue */
	            cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error);
	            if (error != CL_SUCCESS) { 
	                perror("Unable to create command-queue");
	                exit(1);
	            }

                cl_int outputPtr[16] = {-1, -1, -1, -1,-1, -1, -1, -1,-1, -1, -1, -1,-1, -1, -1, -1};
	            for(int idx = 0; idx < 4; ++ idx) {	
	                size_t buffer_origin[3] = {idx*2*sizeof(int), idx, 0}; 
	                size_t host_origin[3] = {idx*2*sizeof(int), idx, 0}; 
	                size_t region[3] = {2*sizeof(int), 2, 1};
	
		            /* Enqueue the read-back from device to host */
		            error = clEnqueueReadBufferRect(
	                       cQ,
	                       UDObj,
		                   CL_TRUE,               // blocking read
	                       buffer_origin,         
	                       host_origin,
	                       region,
	                       0, // buffer_row_pitch
	                       0, // buffer_slice_pitch
	                       0, // host_row_pitch
	                       0, // host_slice_pitch
	                       outputPtr, 0, NULL, NULL);
	
	            }

            #ifdef DEBUG
	            for(int i = 0; i < 16; i++) printf("%d\n", outputPtr[i]); 
            #endif
            if (valuesOK(hostBuffer, outputPtr,16)) printf("Check passed!\n"); else printf("Check failed!\n");
	            /* Release the command queue */
	            clReleaseCommandQueue(cQ);

        /* Clean up */
        
    }// end of device loop and execution
	    clReleaseMemObject(UDObj);
        clReleaseContext(context);
   }// end of platform loop

}//end of main
Example #11
0
int main(int argc, char *argv[])
{
  int error, xsize, ysize, rgb_max;
  int *r, *b, *g;

  float *gray, *congray, *congray_cl;

  // identity kernel
  // float filter[] = {
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,1,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  // };

  // 45 degree motion blur
  float filter[] =
    {0,      0,      0,      0,      0, 0.0145,      0,
     0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
     0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
     0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
     0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
     0, 0.0145,      0,      0,      0,      0,      0};

  // mexican hat kernel
  // float filter[] = {
  //   0, 0,-1,-1,-1, 0, 0,
  //   0,-1,-3,-3,-3,-1, 0,
  //  -1,-3, 0, 7, 0,-3,-1,
  //  -1,-3, 7,24, 7,-3,-1,
  //  -1,-3, 0, 7, 0,-3,-1,
  //   0,-1,-3,-3,-3,-1, 0,
  //   0, 0,-1,-1,-1, 0, 0
  // };


  if(argc != 3)
  {
    fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]);
    abort();
  }

  const char* filename = argv[1];
  const int num_loops = atoi(argv[2]);


  // --------------------------------------------------------------------------
  // load image
  // --------------------------------------------------------------------------
  printf("Reading ``%s''\n", filename);
  ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b);
  printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize);

  // --------------------------------------------------------------------------
  // allocate CPU buffers
  // --------------------------------------------------------------------------
  posix_memalign((void**)&gray, 32, xsize*ysize*sizeof(float));
  if(!gray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray, 32, xsize*ysize*sizeof(float));
  if(!congray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray_cl, 32, xsize*ysize*sizeof(float));
  if(!congray_cl) { fprintf(stderr, "alloc gray"); abort(); }

  // --------------------------------------------------------------------------
  // convert image to grayscale
  // --------------------------------------------------------------------------
  for(int n = 0; n < xsize*ysize; ++n)
    gray[n] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max;

  // --------------------------------------------------------------------------
  // execute filter on cpu
  // --------------------------------------------------------------------------
  for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i)
  {
    for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j)
    {
      float sum = 0;
      for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k)
      {
        for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l)
        {
          sum += gray[(i+k)*xsize + (j+l)] *
            filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)];
        }
      }
      congray[i*xsize + j] = sum;
    }
  }

  // --------------------------------------------------------------------------
  // output cpu filtered image
  // --------------------------------------------------------------------------
  printf("Writing cpu filtered image\n");
  for(int n = 0; n < xsize*ysize; ++n)
    r[n] = g[n] = b[n] = (int)(congray[n] * rgb_max);
  error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // get an OpenCL context and queue
  // --------------------------------------------------------------------------
  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);
  print_device_info_from_queue(queue);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  char *knl_text = read_file("convolution.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution", NULL);
  free(knl_text);

#ifdef NON_OPTIMIZED
  int deviceWidth = xsize;
#else
  int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX;
#endif
  int deviceHeight = ysize;
  size_t deviceDataSize = deviceHeight*deviceWidth*sizeof(float);

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_int status;
  cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
      deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        deviceDataSize, gray, 0, NULL, NULL));
#else
  size_t buffer_origin[3] = {0,0,0};
  size_t host_origin[3] = {0,0,0};
  size_t region[3] = {deviceWidth*sizeof(float), ysize, 1};
  clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE,
                           buffer_origin, host_origin, region,
                           deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
                           gray, 0, NULL, NULL);
#endif

  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0,
        FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL));

  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  cl_int rows = ysize;
  cl_int cols = xsize;
  cl_int filterWidth = FILTER_WIDTH;
  cl_int paddingPixels = 2*HALF_FILTER_WIDTH;

  size_t local_size[] = { WGX, WGY };
  size_t global_size[] = {
    ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0],
    ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1],
  };

  cl_int localWidth = local_size[0] + paddingPixels;
  cl_int localHeight = local_size[1] + paddingPixels;
  size_t localMemSize = localWidth * localHeight * sizeof(float);

  CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray));
  CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray));
  CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter));
  CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows));
  CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols));
  CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth));
  CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL));
  CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight));
  CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth));

  // --------------------------------------------------------------------------
  // print kernel info
  // --------------------------------------------------------------------------
  print_kernel_info(queue, knl);

  CALL_CL_SAFE(clFinish(queue));
  timestamp_type tic, toc;
  get_timestamp(&tic);
  for(int loop = 0; loop < num_loops; ++loop)
  {
    CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL,
          global_size, local_size, 0, NULL, NULL));

    // Edit: Copy the blurred image to input buffer
#ifdef NON_OPTIMIZED
    CALL_CL_SAFE(clEnqueueCopyBuffer(queue, buf_congray, buf_gray, 0, 0,
          deviceDataSize, 0, NULL, NULL));
#else
    clEnqueueCopyBufferRect(queue, buf_congray, buf_gray,
                            buffer_origin, host_origin, region,
                            deviceWidth*sizeof(float), 0,
                            xsize*sizeof(float), 0,
                            0, NULL, NULL);
#endif
  }
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&toc);

  double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops;
  printf("%f s\n", elapsed);
  printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed);
  printf("%f GBit/s\n", 2*xsize*ysize*sizeof(float)/1e9/elapsed);
  printf("%f GFlop/s\n", (xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH)
	 *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed);

  // --------------------------------------------------------------------------
  // transfer back & check
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueReadBuffer(
        queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        xsize * ysize * sizeof(float), congray_cl,
        0, NULL, NULL));
#else
  buffer_origin[0] = 3*sizeof(float);
  buffer_origin[1] = 3;
  buffer_origin[2] = 0;

  host_origin[0] = 3*sizeof(float);
  host_origin[1] = 3;
  host_origin[2] = 0;

  region[0] = (xsize-paddingPixels)*sizeof(float);
  region[1] = (ysize-paddingPixels);
  region[2] = 1;

  clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE,
      buffer_origin, host_origin, region,
      deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
      congray_cl, 0, NULL, NULL);
#endif

  // --------------------------------------------------------------------------
  // output OpenCL filtered image
  // --------------------------------------------------------------------------
  printf("Writing OpenCL filtered image\n");

  // Edit: Keep pixel value in the interval [0, 255] to reduce boundary effect
  for(int n = 0; n < xsize*ysize; ++n) {
    int color = (int)(congray_cl[n] * rgb_max);

    if (color < 0) {
      color = 0;
    } else if (color > 255) {
      color = 255;
    }

    r[n] = g[n] = b[n] = color;
  }
  error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clReleaseMemObject(buf_congray));
  CALL_CL_SAFE(clReleaseMemObject(buf_gray));
  CALL_CL_SAFE(clReleaseMemObject(buf_filter));
  CALL_CL_SAFE(clReleaseKernel(knl));
  CALL_CL_SAFE(clReleaseCommandQueue(queue));
  CALL_CL_SAFE(clReleaseContext(ctx));
  free(gray);
  free(congray);
  free(congray_cl);
  free(r);
  free(b);
  free(g);
}
Example #12
0
END_TEST

START_TEST (test_read_write_rect)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_int result;
    cl_mem buf, buf_part;

    // Grid xyz = (5 x 7 x 2)
    unsigned char grid[70] = {
        0, 0, 0, 0, 0,
        0, 1, 1, 1, 0,
        1, 2, 2, 2, 1,
        1, 2, 3, 2, 1,
        1, 2, 2, 2, 1,
        0, 1, 1, 1, 0,
        0, 0, 0, 0, 0,

        0, 0, 1, 0, 0,
        0, 0, 2, 0, 0,
        0, 1, 3, 1, 0,
        0, 2, 3, 2, 0,
        1, 3, 3, 3, 1,
        2, 3, 3, 3, 2,
        3, 3, 3, 3, 3
    };

    // Middle of the "image" : 3 x 3 x 2 centered at (3, 3)
    unsigned char part[18] = {
        2, 2, 2,
        2, 3, 2,
        2, 2, 2,

        1, 3, 1,
        2, 3, 2,
        3, 3, 3
    };

    unsigned char buffer[70], buffer_part[18];
    size_t host_origin[3] = {0, 0, 0};
    size_t buf_origin[3] = {0, 0, 0};
    size_t region[3] = {5, 7, 2};

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to get the default device"
    );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if(
        result != CL_SUCCESS || ctx == 0,
        "unable to create a valid context"
    );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if(
        result != CL_SUCCESS || queue == 0,
        "cannot create a command queue"
    );

    buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                         sizeof(buffer), buffer, &result);
    fail_if(
        result != CL_SUCCESS,
        "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer"
    );

    buf_part = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                              sizeof(buffer_part), buffer_part, &result);
    fail_if(
        result != CL_SUCCESS,
        "cannot create a buffer for the part that will be read"
    );

    // Write grid into buffer
    result = clEnqueueWriteBufferRect(queue, buf, 1, buf_origin, host_origin,
                                      region, 0, 0, 0, 0, grid, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "cannot enqueue a blocking write buffer rect event with pitches guessed"
    );
    fail_if(
        std::memcmp(buffer, grid, sizeof(buffer)) != 0,
        "buffer doesn't contain the data"
    );

    // Read it back into a temporary region
    buf_origin[0] = 1;
    buf_origin[1] = 2;
    buf_origin[2] = 0;
    // host_origin remains (0, 0, 0)
    region[0] = 3;
    region[1] = 3;
    region[2] = 2;

    result = clEnqueueReadBufferRect(queue, buf, 1, buf_origin, host_origin,
                                     region, 5, 5*7, 0, 0, buffer_part, 0, 0, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to queue a blocking write buffer rect event with host pitches guessed"
    );
    fail_if(
        std::memcmp(buffer_part, part, sizeof(part)) != 0,
        "the part of the buffer was not correctly read"
    );

    // Clear the temporary region and re-read into it using buf_part
    std::memset(buffer_part, 0, sizeof(buffer_part));
    cl_event event;

    result = clEnqueueCopyBufferRect(queue, buf, buf_part, buf_origin,
                                     host_origin, region, 5, 5*7, 0, 0, 0, 0, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to queue a copy buffer rect event"
    );

    result = clWaitForEvents(1, &event);
    fail_if(
        result != CL_SUCCESS,
        "unable to wait for the event"
    );

    fail_if(
        std::memcmp(buffer_part, part, sizeof(part)) != 0,
        "the part of the buffer was not correctly read using a buffer"
    );

    clReleaseEvent(event);
    clReleaseMemObject(buf_part);
    clReleaseMemObject(buf);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}
Example #13
0
/** Thread that sends data from server to client in
 * 2D or 3D mode.
 * @param data struct dataTransfer casted variable.
 * @return NULL
 */
void *asyncDataSendRect_thread(void *data)
{
    unsigned int i,j,k,n;
    size_t buffsize = BUFF_SIZE*sizeof(char);
    struct dataSend* _data = (struct dataSend*)data;
    // We may wait manually for the events provided because
    // OpenCL can only waits their events, but ocalnd event
    // can be relevant. We will not check for errors,
    // assuming than events can be wrong, but is to late to
    // try to report a fail.
    if(_data->num_events_in_wait_list){
        oclandWaitForEvents(_data->num_events_in_wait_list, _data->event_wait_list);
    }
    // Call to OpenCL
    size_t host_origin[3] = {0, 0, 0};
    clEnqueueReadBufferRect(_data->command_queue,_data->mem,CL_FALSE,
                            _data->buffer_origin,host_origin,_data->region,
                            _data->buffer_row_pitch,_data->buffer_slice_pitch,
                            _data->host_row_pitch,_data->host_slice_pitch,
                            _data->ptr,0,NULL,&(_data->event->event));
    // Start sending data to client
    int *fd = &(_data->fd);
    Send(fd, &buffsize, sizeof(size_t), 0);
    // Compute the number of packages needed
    n = _data->host_row_pitch / buffsize;
    // Wait until data is copied here. We will not test
    // for errors, user can do it later
    clWaitForEvents(1,&(_data->event->event));
    // Send the rows
    size_t origin = 0;
    for(j=0;j<_data->region[1];j++){
        for(k=0;k<_data->region[2];k++){
            // Send package by pieces
            for(i=0;i<n;i++){
                Send(fd, _data->ptr + i*buffsize + origin, buffsize, 0);
            }
            if(_data->host_row_pitch % buffsize){
                // Remains some data to arrive
                Send(fd, _data->ptr + n*buffsize + origin, _data->host_row_pitch % buffsize, 0);
            }
            // Compute the new origin
            origin += _data->host_row_pitch;
        }
    }
    free(_data->buffer_origin); _data->buffer_origin = NULL;
    free(_data->region); _data->region = NULL;
    free(_data->ptr); _data->ptr = NULL;
    if(_data->event){
        _data->event->status = CL_COMPLETE;
    }
    if(_data->want_event != CL_TRUE){
        free(_data->event); _data->event = NULL;
    }
    if(_data->event_wait_list) free(_data->event_wait_list); _data->event_wait_list=NULL;
    // shutdown(fd, 2);
    // shutdown(_data->fd, 2); // Destroy the server to free the port
    close(fd);
    close(_data->fd);
    free(_data); _data=NULL;
    pthread_exit(NULL);
    return NULL;
}
Example #14
0
int main(int argc, char** argv) {

   // Set up the data on the host	
   clock_t start, start0;
   start0 = clock();
   start = clock();
   // Rows and columns in the input image
   int imageHeight;
   int imageWidth;

   const char* inputFile = "input.bmp";
   const char* outputFile = "output.bmp";



   // Homegrown function to read a BMP from file
   float* inputImage = readImage(inputFile, &imageWidth, 
      &imageHeight);

   // Size of the input and output images on the host
   int dataSize = imageHeight*imageWidth*sizeof(float);

   // Pad the number of columns 
#ifdef NON_OPTIMIZED
   int deviceWidth = imageWidth;
#else  // READ_ALIGNED || READ4
   int deviceWidth = roundUp(imageWidth, WGX);
#endif
   int deviceHeight = imageHeight;
   // Size of the input and output images on the device
   int deviceDataSize = imageHeight*deviceWidth*sizeof(float);

   // Output image on the host
   float* outputImage = NULL;
   outputImage = (float*)malloc(dataSize);
   int i, j;
   for(i = 0; i < imageHeight; i++) {
       for(j = 0; j < imageWidth; j++) {
           outputImage[i*imageWidth+j] = 0;
       }
   }

   // 45 degree motion blur
   float filter[49] = 
      {0,      0,      0,      0,      0, 0.0145,      0,
       0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
       0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
       0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
       0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
  0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
       0, 0.0145,      0,      0,      0,      0,      0};
 
   int filterWidth = 7;
   int paddingPixels = (int)(filterWidth/2) * 2; 
   stoptime(start, "set up input, output.");
   start = clock();
   // Set up the OpenCL environment

   // Discovery platform
   cl_platform_id platform;
   clGetPlatformIDs(1, &platform, NULL);

   // Discover device
   cl_device_id device;
   clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device,
      NULL);

    size_t time_res;
    clGetDeviceInfo(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION,
            sizeof(time_res), &time_res, NULL);
    printf("Device profiling timer resolution: %zu ns.\n", time_res);

   // Create context
   cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 
       (cl_context_properties)(platform), 0};
   cl_context context; 
   context = clCreateContext(props, 1, &device, NULL, NULL, 
      NULL);

   // Create command queue
   cl_ulong time_start, time_end, exec_time;
   cl_event timing_event;
   cl_command_queue queue;
   queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

   // Create memory buffers
   cl_mem d_inputImage;
   cl_mem d_outputImage;
   cl_mem d_filter;
   d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, 
       deviceDataSize, NULL, NULL);
   d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
       deviceDataSize, NULL, NULL);
   d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, 
       49*sizeof(float),NULL, NULL);
   
   // Write input data to the device
#ifdef NON_OPTIMIZED
   clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize,
       inputImage, 0, NULL, NULL);
#else // READ_ALIGNED || READ4
   size_t buffer_origin[3] = {0,0,0};
   size_t host_origin[3] = {0,0,0};
   size_t region[3] = {deviceWidth*sizeof(float), 
      imageHeight, 1};
   clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, 
      buffer_origin, host_origin, region, 
      deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0,
      inputImage, 0, NULL, NULL);
#endif
	
   // Write the filter to the device
   clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, 
      49*sizeof(float), filter, 0, NULL, NULL);
	
   // Read in the program from file
   char* source = readSource("convolution.cl");

   // Create the program
   cl_program program;
	
   // Create and compile the program
   program = clCreateProgramWithSource(context, 1, 
       (const char**)&source, NULL, NULL);
   cl_int build_status;
   build_status = clBuildProgram(program, 1, &device, NULL, NULL,
      NULL);
      
   // Create the kernel
   cl_kernel kernel;
#if defined NON_OPTIMIZED || defined READ_ALIGNED
   // Only the host-side code differs for the aligned reads
   kernel = clCreateKernel(program, "convolution", NULL);
#else // READ4
   kernel = clCreateKernel(program, "convolution_read4", NULL);
#endif
	
   // Selected work group size is 16x16
   int wgWidth = WGX;
   int wgHeight = WGY;

   // When computing the total number of work items, the 
   // padding work items do not need to be considered
   int totalWorkItemsX = roundUp(imageWidth-paddingPixels, 
      wgWidth);
   int totalWorkItemsY = roundUp(imageHeight-paddingPixels, 
      wgHeight);

   // Size of a work group
   size_t localSize[2] = {wgWidth, wgHeight};
   // Size of the NDRange
   size_t globalSize[2] = {totalWorkItemsX, totalWorkItemsY};

   // The amount of local data that is cached is the size of the
   // work groups plus the padding pixels
#if defined NON_OPTIMIZED || defined READ_ALIGNED
   int localWidth = localSize[0] + paddingPixels;
#else // READ4
   // Round the local width up to 4 for the read4 kernel
   int localWidth = roundUp(localSize[0]+paddingPixels, 4);
#endif
   int localHeight = localSize[1] + paddingPixels;

   // Compute the size of local memory (needed for dynamic 
   // allocation)
   size_t localMemSize = (localWidth * localHeight * 
      sizeof(float));

   // Set the kernel arguments
   clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
   clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
   clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_filter);
   clSetKernelArg(kernel, 3, sizeof(int), &deviceHeight);
   clSetKernelArg(kernel, 4, sizeof(int), &deviceWidth); 
   clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
   clSetKernelArg(kernel, 6, localMemSize, NULL);
   clSetKernelArg(kernel, 7, sizeof(int), &localHeight); 
   clSetKernelArg(kernel, 8, sizeof(int), &localWidth);

   stoptime(start, "set up kernel");
   start = clock();
   // Execute the kernel
   clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, 
      localSize, 0, NULL, &timing_event);

   // Wait for kernel to complete
   clFinish(queue);
   stoptime(start, "run kernel");
   clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
   clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
   exec_time = time_end-time_start;
   printf("Profile execution time = %.3lf sec.\n", (double) exec_time/1000000000);

   // Read back the output image
#ifdef NON_OPTIMIZED
   clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, 
      deviceDataSize, outputImage, 0, NULL, NULL);
#else // READ_ALIGNED || READ4
   // Begin reading output from (3,3) on the device 
   // (for 7x7 filter with radius 3)
   buffer_origin[0] = 3*sizeof(float);
   buffer_origin[1] = 3;
   buffer_origin[2] = 0;

   // Read data into (3,3) on the host
   host_origin[0] = 3*sizeof(float);
   host_origin[1] = 3;
   host_origin[2] = 0;
	
   // Region is image size minus padding pixels
   region[0] = (imageWidth-paddingPixels)*sizeof(float);
   region[1] = (imageHeight-paddingPixels);
   region[2] = 1;
	
	// Perform the read
   clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, 
      buffer_origin, host_origin, region, 
      deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, 
      outputImage, 0, NULL, NULL);
#endif
  
   // Homegrown function to write the image to file
   storeImage(outputImage, outputFile, imageHeight, 
      imageWidth, inputFile);
   
   // Free OpenCL objects
   clReleaseMemObject(d_inputImage);
   clReleaseMemObject(d_outputImage);
   clReleaseMemObject(d_filter);
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(queue);
   clReleaseContext(context);

   return 0;
}