示例#1
0
int
process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;
  dt_iop_nlmeans_global_data_t *gd = (dt_iop_nlmeans_global_data_t *)self->data;
  

  const int devid = piece->pipe->devid;
  const int width = roi_in->width;
  const int height = roi_in->height;

  cl_mem dev_U4 = NULL;

  cl_int err = -999;

  const int P = ceilf(d->radius * roi_in->scale / piece->iscale); // pixel filter size
  const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood
  const float sharpness = 100000.0f/(1.0f+d->strength);

  if(P < 1)
  {
    size_t origin[] = { 0, 0, 0};
    size_t region[] = { width, height, 1};
    err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region);
    if (err != CL_SUCCESS) goto error;
    return TRUE;
  }

  float max_L = 120.0f, max_C = 512.0f;
  float nL = 1.0f/max_L, nC = 1.0f/max_C;
  float nL2 = nL*nL, nC2 = nC*nC;
  //float weight[4] = { powf(d->luma, 0.6), powf(d->chroma, 0.6), powf(d->chroma, 0.6), 1.0f };
  float weight[4] = { d->luma, d->chroma, d->chroma, 1.0f };

  dev_U4 = dt_opencl_alloc_device(devid, roi_out->width, roi_out->height, sizeof(float));
  if (dev_U4 == NULL) goto error;

  // prepare local work group
  size_t maxsizes[3] = { 0 };        // the maximum dimensions for a work group
  size_t workgroupsize = 0;          // the maximum number of items in a work group
  unsigned long localmemsize = 0;    // the maximum amount of local memory we can use
  size_t kernelworkgroupsize = 0;    // the maximum amount of items in work group of the kernel
                                     // assuming this is the same for nlmeans_horiz and nlmeans_vert 
  
  // make sure blocksize is not too large
  int blocksize = BLOCKSIZE;
  if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS &&
     dt_opencl_get_kernel_work_group_size(devid, gd->kernel_nlmeans_horiz, &kernelworkgroupsize) == CL_SUCCESS)
  {
    // reduce blocksize step by step until it fits to limits
    while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize > kernelworkgroupsize
          || blocksize > workgroupsize || (blocksize+2*P)*sizeof(float) > localmemsize)
    {
      if(blocksize == 1) break;
      blocksize >>= 1;    
    }
  }
  else
  {
示例#2
0
int
process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_iop_sharpen_data_t *d = (dt_iop_sharpen_data_t *)piece->data;
  dt_iop_sharpen_global_data_t *gd = (dt_iop_sharpen_global_data_t *)self->data;
  cl_mem dev_m = NULL;
  cl_mem dev_tmp = NULL;
  cl_int err = -999;

  const int devid = piece->pipe->devid;
  const int width = roi_in->width;
  const int height = roi_in->height;
  const int rad = MIN(MAXR, ceilf(d->radius * roi_in->scale / piece->iscale));
  const int wd = 2*rad+1;
  float mat[wd];

  if(rad == 0)
  {
    size_t origin[] = {0, 0, 0};
    size_t region[] = {width, height, 1};
    err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region);
    if (err != CL_SUCCESS) goto error;
    return TRUE;
  }

  // init gaussian kernel
  float *m = mat + rad;
  const float sigma2 = (1.0f/(2.5*2.5))*(d->radius*roi_in->scale/piece->iscale)*(d->radius*roi_in->scale/piece->iscale);
  float weight = 0.0f;
  for(int l=-rad; l<=rad; l++) weight += m[l] = expf(- (l*l)/(2.f*sigma2));
  for(int l=-rad; l<=rad; l++) m[l] /= weight;


  // prepare local work group
  size_t maxsizes[3] = { 0 };        // the maximum dimensions for a work group
  size_t workgroupsize = 0;          // the maximum number of items in a work group
  unsigned long localmemsize = 0;    // the maximum amount of local memory we can use
  size_t kernelworkgroupsize = 0;    // the maximum amount of items in work group for this kernel

  // make sure blocksize is not too large
  int blocksize = BLOCKSIZE;
  if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS &&
      dt_opencl_get_kernel_work_group_size(devid, gd->kernel_sharpen_hblur, &kernelworkgroupsize) == CL_SUCCESS)
  {
    // reduce blocksize step by step until it fits to limits
    while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize > kernelworkgroupsize
          || blocksize > workgroupsize || (blocksize+2*rad)*sizeof(float) > localmemsize)
    {
      if(blocksize == 1) break;
      blocksize >>= 1;
    }
  }
  else
  {
示例#3
0
int
process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_develop_t *dev = self->dev;
  dt_iop_overexposed_global_data_t *gd = (dt_iop_overexposed_global_data_t *)self->data;

  cl_int err = -999;
  const int devid = piece->pipe->devid;

  const int width = roi_out->width;
  const int height = roi_out->height;

  const float lower  = dev->overexposed.lower / 100.0f;
  const float upper  = dev->overexposed.upper / 100.0f;
  const int colorscheme = dev->overexposed.colorscheme;

  const float *upper_color  = dt_iop_overexposed_colors[colorscheme][0];
  const float *lower_color = dt_iop_overexposed_colors[colorscheme][1];

  if(!dev->overexposed.enabled || !dev->gui_attached)
  {
    size_t origin[] = { 0, 0, 0};
    size_t region[] = { width, height, 1};
    err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region);
    if (err != CL_SUCCESS) goto error;
    return TRUE;
  }

  size_t sizes[2] = { ROUNDUPWD(width), ROUNDUPHT(height) };
  dt_opencl_set_kernel_arg(devid, gd->kernel_overexposed, 0, sizeof(cl_mem), &dev_in);
  dt_opencl_set_kernel_arg(devid, gd->kernel_overexposed, 1, sizeof(cl_mem), &dev_out);
  dt_opencl_set_kernel_arg(devid, gd->kernel_overexposed, 2, sizeof(int), &width);
  dt_opencl_set_kernel_arg(devid, gd->kernel_overexposed, 3, sizeof(int), &height);
  dt_opencl_set_kernel_arg(devid, gd->kernel_overexposed, 4, sizeof(float), &lower);
  dt_opencl_set_kernel_arg(devid, gd->kernel_overexposed, 5, sizeof(float), &upper);
  dt_opencl_set_kernel_arg(devid, gd->kernel_overexposed, 6, 4*sizeof(float), lower_color);
  dt_opencl_set_kernel_arg(devid, gd->kernel_overexposed, 7, 4*sizeof(float), upper_color);
  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_overexposed, sizes);
  if(err != CL_SUCCESS) goto error;
  return TRUE;

error:
  dt_print(DT_DEBUG_OPENCL, "[opencl_overexposed] couldn't enqueue kernel! %d\n", err);
  return FALSE;
}
示例#4
0
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out,
               const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  cl_int err = -999;
  const int devid = piece->pipe->devid;
  const int width = roi_in->width;
  const int height = roi_in->height;

  size_t origin[] = { 0, 0, 0 };
  size_t region[] = { width, height, 1 };
  err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region);
  if(err != CL_SUCCESS) goto error;

  return TRUE;

error:
  dt_print(DT_DEBUG_OPENCL, "[opencl_mask_manage] couldn't enqueue kernel! %d\n", err);
  return FALSE;
}
示例#5
0
int
process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_iop_borders_data_t *d = (dt_iop_borders_data_t *)piece->data;
  dt_iop_borders_global_data_t *gd = (dt_iop_borders_global_data_t *)self->data;

  cl_int err = -999;
  const int devid = piece->pipe->devid;

  const int width = roi_out->width;
  const int height = roi_out->height;

  const int bw = (piece->buf_out.width  - piece->buf_in.width ) * roi_in->scale;
  const int bh = (piece->buf_out.height - piece->buf_in.height) * roi_in->scale;
  const int bx = MAX(bw/2 - roi_out->x, 0);
  const int by = MAX(bh/2 - roi_out->y, 0);

  const float col[4] = {d->color[0], d->color[1], d->color[2], 1.0f};

  size_t sizes[2] = { ROUNDUPWD(width), ROUNDUPHT(height) };
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 0, sizeof(cl_mem), &dev_out);
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 1, sizeof(int), &width);
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 2, sizeof(int), &height);
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 3, 4*sizeof(float), &col);
  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_borders_fill, sizes);
  if(err != CL_SUCCESS) goto error;

  size_t iorigin[] = { 0, 0, 0};
  size_t oorigin[] = { bx, by, 0};
  size_t region[] = { roi_in->width, roi_in->height, 1};

  // copy original input from dev_in -> dev_out as starting point
  err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, iorigin, oorigin, region);
  if(err != CL_SUCCESS) goto error;

  return TRUE;

error:
  dt_print(DT_DEBUG_OPENCL, "[opencl_borders] couldn't enqueue kernel! %d\n", err);
  return FALSE;
}
示例#6
0
int
process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;
  dt_iop_nlmeans_global_data_t *gd = (dt_iop_nlmeans_global_data_t *)self->data;
  const int devid = piece->pipe->devid;
  const int width = roi_in->width;
  const int height = roi_in->height;

  cl_int err = -999;
  const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size
  const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood

  if(P <= 1)
  {
    size_t origin[] = { 0, 0, 0};
    size_t region[] = { width, height, 1};
    err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region);
    if (err != CL_SUCCESS) goto error;
    return TRUE;
  }
  float max_L = 100.0f, max_C = 256.0f;
  float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
  nL *= nL; nC *= nC;
  size_t sizes[] = { ROUNDUP(width, 4), ROUNDUP(height, 4), 1};
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 0, sizeof(cl_mem), (void *)&dev_in);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 1, sizeof(cl_mem), (void *)&dev_out);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 2, sizeof(int), (void *)&width);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 3, sizeof(int), (void *)&height);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 4, sizeof(int32_t), (void *)&P);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 5, sizeof(int32_t), (void *)&K);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 6, sizeof(float), (void *)&nL);
  dt_opencl_set_kernel_arg(devid, gd->kernel_nlmeans, 7, sizeof(float), (void *)&nC);
  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_nlmeans, sizes);
  if(err != CL_SUCCESS) goto error;
  return TRUE;

error:
  dt_print(DT_DEBUG_OPENCL, "[opencl_nlmeans] couldn't enqueue kernel! %d\n", err);
  return FALSE;
}
示例#7
0
int
process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_iop_borders_data_t *d = (dt_iop_borders_data_t *)piece->data;
  dt_iop_borders_global_data_t *gd = (dt_iop_borders_global_data_t *)self->data;

  cl_int err = -999;
  const int devid = piece->pipe->devid;

  const int width = roi_out->width;
  const int height = roi_out->height;

  const int border_tot_width  = (piece->buf_out.width  - piece->buf_in.width ) * roi_in->scale;
  const int border_tot_height = (piece->buf_out.height - piece->buf_in.height) * roi_in->scale;
  const int border_size_t = border_tot_height*d->pos_v;
  const int border_size_b = border_tot_height - border_size_t;
  const int border_size_l = border_tot_width*d->pos_h;
  const int border_size_r = border_tot_width - border_size_l;
  const int border_in_x = MAX(border_size_l - roi_out->x, 0);
  const int border_in_y = MAX(border_size_t - roi_out->y, 0);

  // ----- Filling border
  const float col[4] = {d->color[0], d->color[1], d->color[2], 1.0f};
  size_t sizes[2] = { ROUNDUPWD(width), ROUNDUPHT(height) };
  const int zero = 0;
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 0, sizeof(cl_mem), &dev_out);
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 1, sizeof(int), &zero);
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 2, sizeof(int), &zero);
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 3, sizeof(int), &width);
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 4, sizeof(int), &height);
  dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 5, 4*sizeof(float), &col);
  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_borders_fill, sizes);
  if(err != CL_SUCCESS) goto error;

  // ----- Frame line
  const int border_min_size = MIN(MIN(border_size_t, border_size_b), MIN(border_size_l, border_size_r));
  const int frame_size  = border_min_size * d->frame_size;
  if (frame_size != 0)
  {
    const float col_frame[4] = {d->frame_color[0], d->frame_color[1], d->frame_color[2], 1.0f};
    const int image_lx = border_size_l - roi_out->x;
    const int image_ty = border_size_t - roi_out->y;
    const int frame_space  = border_min_size - frame_size;
    const int frame_offset = frame_space * d->frame_offset;
    const int frame_tl_in_x   = MAX(border_in_x - frame_offset, 0);
    const int frame_tl_out_x  = MAX(frame_tl_in_x - frame_size, 0);
    const int frame_tl_in_y   = MAX(border_in_y - frame_offset, 0);
    const int frame_tl_out_y  = MAX(frame_tl_in_y - frame_size, 0);
    const int frame_in_width  = floor((piece->buf_in.width * roi_in->scale) + frame_offset*2);
    const int frame_in_height  = floor((piece->buf_in.height * roi_in->scale) + frame_offset*2);
    const int frame_out_width  = frame_in_width + frame_size*2;
    const int frame_out_height  = frame_in_height + frame_size*2;
    const int frame_br_in_x = CLAMP(image_lx - frame_offset + frame_in_width, 0, roi_out->width);
    const int frame_br_in_y = CLAMP(image_ty - frame_offset + frame_in_height, 0, roi_out->height);
    // ... if 100% frame_offset we ensure frame_line "stick" the out border
    const int frame_br_out_x = (d->frame_offset == 1.0f)
                               ? (roi_out->width)
                               : CLAMP(image_lx - frame_offset - frame_size + frame_out_width, 0, roi_out->width);
    const int frame_br_out_y = (d->frame_offset == 1.0f)
                               ? (roi_out->height)
                               : CLAMP(image_ty - frame_offset - frame_size + frame_out_height, 0, roi_out->height);

    const int roi_frame_in_width = frame_br_in_x - frame_tl_in_x;
    const int roi_frame_in_height = frame_br_in_y - frame_tl_in_y;
    const int roi_frame_out_width = frame_br_out_x - frame_tl_out_x;
    const int roi_frame_out_height = frame_br_out_y - frame_tl_out_y;

    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 0, sizeof(cl_mem), &dev_out);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 1, sizeof(int), &frame_tl_out_x);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 2, sizeof(int), &frame_tl_out_y);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 3, sizeof(int), &roi_frame_out_width);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 4, sizeof(int), &roi_frame_out_height);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 5, 4*sizeof(float), &col_frame);
    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_borders_fill, sizes);
    if(err != CL_SUCCESS) goto error;

    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 0, sizeof(cl_mem), &dev_out);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 1, sizeof(int), &frame_tl_in_x);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 2, sizeof(int), &frame_tl_in_y);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 3, sizeof(int), &roi_frame_in_width);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 4, sizeof(int), &roi_frame_in_height);
    dt_opencl_set_kernel_arg(devid, gd->kernel_borders_fill, 5, 4*sizeof(float), &col);
    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_borders_fill, sizes);
    if(err != CL_SUCCESS) goto error;
  }

  size_t iorigin[] = { 0, 0, 0};
  size_t oorigin[] = { border_in_x, border_in_y, 0};
  size_t region[] = { roi_in->width, roi_in->height, 1};

  // copy original input from dev_in -> dev_out as starting point
  err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, iorigin, oorigin, region);
  if(err != CL_SUCCESS) goto error;

  return TRUE;

error:
  dt_print(DT_DEBUG_OPENCL, "[opencl_borders] couldn't enqueue kernel! %d\n", err);
  return FALSE;
}
示例#8
0
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out,
               const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  dt_iop_colorout_data_t *d = (dt_iop_colorout_data_t *)piece->data;
  dt_iop_colorout_global_data_t *gd = (dt_iop_colorout_global_data_t *)self->data;
  cl_mem dev_m = NULL, dev_r = NULL, dev_g = NULL, dev_b = NULL, dev_coeffs = NULL;

  cl_int err = -999;
  const int devid = piece->pipe->devid;
  const int width = roi_in->width;
  const int height = roi_in->height;

  if(d->type == DT_COLORSPACE_LAB)
  {
    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { roi_in->width, roi_in->height, 1 };
    err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region);
    if(err != CL_SUCCESS) goto error;
    return TRUE;
  }

  size_t sizes[] = { ROUNDUPWD(width), ROUNDUPHT(height), 1 };

  dev_m = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 9, d->cmatrix);
  if(dev_m == NULL) goto error;
  dev_r = dt_opencl_copy_host_to_device(devid, d->lut[0], 256, 256, sizeof(float));
  if(dev_r == NULL) goto error;
  dev_g = dt_opencl_copy_host_to_device(devid, d->lut[1], 256, 256, sizeof(float));
  if(dev_g == NULL) goto error;
  dev_b = dt_opencl_copy_host_to_device(devid, d->lut[2], 256, 256, sizeof(float));
  if(dev_b == NULL) goto error;
  dev_coeffs
      = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 3 * 3, (float *)d->unbounded_coeffs);
  if(dev_coeffs == NULL) goto error;
  dt_opencl_set_kernel_arg(devid, gd->kernel_colorout, 0, sizeof(cl_mem), (void *)&dev_in);
  dt_opencl_set_kernel_arg(devid, gd->kernel_colorout, 1, sizeof(cl_mem), (void *)&dev_out);
  dt_opencl_set_kernel_arg(devid, gd->kernel_colorout, 2, sizeof(int), (void *)&width);
  dt_opencl_set_kernel_arg(devid, gd->kernel_colorout, 3, sizeof(int), (void *)&height);
  dt_opencl_set_kernel_arg(devid, gd->kernel_colorout, 4, sizeof(cl_mem), (void *)&dev_m);
  dt_opencl_set_kernel_arg(devid, gd->kernel_colorout, 5, sizeof(cl_mem), (void *)&dev_r);
  dt_opencl_set_kernel_arg(devid, gd->kernel_colorout, 6, sizeof(cl_mem), (void *)&dev_g);
  dt_opencl_set_kernel_arg(devid, gd->kernel_colorout, 7, sizeof(cl_mem), (void *)&dev_b);
  dt_opencl_set_kernel_arg(devid, gd->kernel_colorout, 8, sizeof(cl_mem), (void *)&dev_coeffs);
  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_colorout, sizes);
  if(err != CL_SUCCESS) goto error;
  dt_opencl_release_mem_object(dev_m);
  dt_opencl_release_mem_object(dev_r);
  dt_opencl_release_mem_object(dev_g);
  dt_opencl_release_mem_object(dev_b);
  dt_opencl_release_mem_object(dev_coeffs);
  return TRUE;

error:
  dt_opencl_release_mem_object(dev_m);
  dt_opencl_release_mem_object(dev_r);
  dt_opencl_release_mem_object(dev_g);
  dt_opencl_release_mem_object(dev_b);
  dt_opencl_release_mem_object(dev_coeffs);
  dt_print(DT_DEBUG_OPENCL, "[opencl_colorout] couldn't enqueue kernel! %d\n", err);
  return FALSE;
}