/*
 * OpenCL code path working on dt_iop_highpass_data_t (piece->data) and the
 * kernels stored in dt_iop_highpass_global_data_t (self->data).
 *
 * Steps visible in this fragment:
 *  - rad = MAX_RADIUS * min(100, d->sharpness + 1) / 100, then
 *    radius = MIN(MAX_RADIUS, ceilf(rad * roi_in->scale / piece->iscale)).
 *  - sigma = sqrt((radius * (radius + 1) * BOX_ITERATIONS + 2) / 3); the
 *    kernel half-width is wdh = ceilf(3 * sigma) and the kernel has
 *    wd = 2 * wdh + 1 taps.
 *  - The taps live in the variable-length array mat[]; m = mat + wdh points
 *    at the centre tap so that indices -wdh..wdh are valid. The taps are
 *    exp(-l^2 / (2 * sigma^2)) divided by their sum, so they add up to 1.
 *  - contrast_scale = d->contrast / 100 * 7.5.
 *  - blocksize starts at BLOCKSIZE and is halved until it does not exceed
 *    maxsizes[0], maxsizes[1], kernelworkgroupsize (queried for
 *    gd->kernel_highpass_hblur) and workgroupsize, and until
 *    (blocksize + 2 * wdh) floats fit into localmemsize. The loop also
 *    stops once blocksize has reached 1.
 *
 * NOTE(review): the definition is cut off right after the opening of the
 * else branch that handles a failed limit query; the remainder of the body
 * is not visible here.
 */
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { dt_iop_highpass_data_t *d = (dt_iop_highpass_data_t *)piece->data; dt_iop_highpass_global_data_t *gd = (dt_iop_highpass_global_data_t *)self->data; cl_int err = -999; cl_mem dev_tmp = NULL; cl_mem dev_m = NULL; const int devid = piece->pipe->devid; const int width = roi_in->width; const int height = roi_in->height; int rad = MAX_RADIUS * (fmin(100.0f, d->sharpness + 1) / 100.0f); const int radius = MIN(MAX_RADIUS, ceilf(rad * roi_in->scale / piece->iscale)); /* sigma-radius correlation to match opencl vs. non-opencl. identified by numerical experiments but * unproven. ask me if you need details. ulrich */ const float sigma = sqrt((radius * (radius + 1) * BOX_ITERATIONS + 2) / 3.0f); const int wdh = ceilf(3.0f * sigma); const int wd = 2 * wdh + 1; float mat[wd]; float *m = mat + wdh; float weight = 0.0f; // init gaussian kernel for(int l = -wdh; l <= wdh; l++) weight += m[l] = expf(-(l * l) / (2.f * sigma * sigma)); for(int l = -wdh; l <= wdh; l++) m[l] /= weight; // for(int l=-wdh; l<=wdh; l++) printf("%.6f ", (double)m[l]); // printf("\n"); float contrast_scale = ((d->contrast / 100.0f) * 7.5f); size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group size_t workgroupsize = 0; // the maximum number of items in a work group unsigned long localmemsize = 0; // the maximum amount of local memory we can use size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel // make sure blocksize is not too large int blocksize = BLOCKSIZE; if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS && dt_opencl_get_kernel_work_group_size(devid, gd->kernel_highpass_hblur, &kernelworkgroupsize) == CL_SUCCESS) { // reduce blocksize step by step until it fits to limits while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || 
blocksize > kernelworkgroupsize || blocksize > workgroupsize || (blocksize + 2 * wdh) * sizeof(float) > localmemsize) { if(blocksize == 1) break; blocksize >>= 1; } } else {
/*
 * OpenCL code path working on dt_iop_nlmeans_params_t (piece->data) and the
 * kernels stored in dt_iop_nlmeans_global_data_t (self->data).
 *
 * Steps visible in this fragment:
 *  - P = ceilf(3 * roi_in->scale / piece->iscale) is the pixel filter size
 *    and K = ceilf(7 * roi_in->scale / piece->iscale) the neighbourhood.
 *  - If P <= 1 the input image is copied to dev_out with
 *    dt_opencl_enqueue_copy_image() and the function returns TRUE; a failed
 *    copy jumps to the error label.
 *  - Normalisation factors nL = 1 / 120 and nC = 1 / 512 and their squares
 *    are set up, and the per-channel weights are
 *    { d->luma^0.6, d->chroma^0.6, d->chroma^0.6, 1 }.
 *  - dev_U4 is allocated with dt_opencl_alloc_device() at
 *    roi_out->width x roi_out->height with sizeof(float) per element; a
 *    NULL result jumps to the error label.
 *  - blocksize starts at BLOCKSIZE and is halved until it does not exceed
 *    maxsizes[0], maxsizes[1], kernelworkgroupsize (queried for
 *    gd->kernel_nlmeans_horiz only) and workgroupsize, and until
 *    (blocksize + 2 * P) floats fit into localmemsize. The loop also stops
 *    once blocksize has reached 1.
 *
 * NOTE(review): the definition is cut off right after the opening of the
 * else branch that handles a failed limit query; the error label and the
 * rest of the body are not visible here.
 */
int process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data; dt_iop_nlmeans_global_data_t *gd = (dt_iop_nlmeans_global_data_t *)self->data; const int devid = piece->pipe->devid; const int width = roi_in->width; const int height = roi_in->height; cl_mem dev_U4 = NULL; cl_int err = -999; const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood if(P <= 1) { size_t origin[] = { 0, 0, 0}; size_t region[] = { width, height, 1}; err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region); if (err != CL_SUCCESS) goto error; return TRUE; } float max_L = 120.0f, max_C = 512.0f; float nL = 1.0f/max_L, nC = 1.0f/max_C; float nL2 = nL*nL, nC2 = nC*nC; float weight[4] = { powf(d->luma, 0.6), powf(d->chroma, 0.6), powf(d->chroma, 0.6), 1.0f }; dev_U4 = dt_opencl_alloc_device(devid, roi_out->width, roi_out->height, sizeof(float)); if (dev_U4 == NULL) goto error; // prepare local work group size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group size_t workgroupsize = 0; // the maximum number of items in a work group unsigned long localmemsize = 0; // the maximum amount of local memory we can use size_t kernelworkgroupsize = 0; // the maximum amount of items in work group of the kernel // assuming this is the same for nlmeans_horiz and nlmeans_vert // make sure blocksize is not too large int blocksize = BLOCKSIZE; if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS && dt_opencl_get_kernel_work_group_size(devid, gd->kernel_nlmeans_horiz, &kernelworkgroupsize) == CL_SUCCESS) { // reduce blocksize step by step until it fits to limits while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize > kernelworkgroupsize || blocksize > 
workgroupsize || (blocksize+2*P)*sizeof(float) > localmemsize) { if(blocksize == 1) break; blocksize >>= 1; } } else {
/*
 * OpenCL code path working on dt_iop_sharpen_data_t (piece->data) and the
 * kernels stored in dt_iop_sharpen_global_data_t (self->data).
 *
 * Steps visible in this fragment:
 *  - rad = MIN(MAXR, ceilf(d->radius * roi_in->scale / piece->iscale)) and
 *    the kernel has wd = 2 * rad + 1 taps held in the variable-length
 *    array mat[].
 *  - If rad == 0 the input image is copied to dev_out with
 *    dt_opencl_enqueue_copy_image() and the function returns TRUE; a failed
 *    copy jumps to the error label.
 *  - m = mat + rad points at the centre tap so that indices -rad..rad are
 *    valid. With s = d->radius * roi_in->scale / piece->iscale the variance
 *    is sigma2 = s^2 / 2.5^2; the taps are exp(-l^2 / (2 * sigma2)) divided
 *    by their sum, so they add up to 1.
 *  - blocksize starts at BLOCKSIZE and is halved until it does not exceed
 *    maxsizes[0], maxsizes[1], kernelworkgroupsize (queried for
 *    gd->kernel_sharpen_hblur) and workgroupsize, and until
 *    (blocksize + 2 * rad) floats fit into localmemsize. The loop also
 *    stops once blocksize has reached 1.
 *
 * NOTE(review): the definition is cut off right after the opening of the
 * else branch that handles a failed limit query; the error label and the
 * rest of the body are not visible here.
 */
int process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { dt_iop_sharpen_data_t *d = (dt_iop_sharpen_data_t *)piece->data; dt_iop_sharpen_global_data_t *gd = (dt_iop_sharpen_global_data_t *)self->data; cl_mem dev_m = NULL; cl_mem dev_tmp = NULL; cl_int err = -999; const int devid = piece->pipe->devid; const int width = roi_in->width; const int height = roi_in->height; const int rad = MIN(MAXR, ceilf(d->radius * roi_in->scale / piece->iscale)); const int wd = 2*rad+1; float mat[wd]; if(rad == 0) { size_t origin[] = {0, 0, 0}; size_t region[] = {width, height, 1}; err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region); if (err != CL_SUCCESS) goto error; return TRUE; } // init gaussian kernel float *m = mat + rad; const float sigma2 = (1.0f/(2.5*2.5))*(d->radius*roi_in->scale/piece->iscale)*(d->radius*roi_in->scale/piece->iscale); float weight = 0.0f; for(int l=-rad; l<=rad; l++) weight += m[l] = expf(- (l*l)/(2.f*sigma2)); for(int l=-rad; l<=rad; l++) m[l] /= weight; // prepare local work group size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group size_t workgroupsize = 0; // the maximum number of items in a work group unsigned long localmemsize = 0; // the maximum amount of local memory we can use size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel // make sure blocksize is not too large int blocksize = BLOCKSIZE; if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS && dt_opencl_get_kernel_work_group_size(devid, gd->kernel_sharpen_hblur, &kernelworkgroupsize) == CL_SUCCESS) { // reduce blocksize step by step until it fits to limits while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize > kernelworkgroupsize || blocksize > workgroupsize || (blocksize+2*rad)*sizeof(float) > localmemsize) { if(blocksize == 1) break; 
blocksize >>= 1; } } else {
/*
 * OpenCL code path working on dt_iop_global_tonemap_data_t (piece->data),
 * the kernels stored in dt_iop_global_tonemap_global_data_t (self->data)
 * and the GUI data in self->gui_data.
 *
 * Steps visible in this fragment:
 *  - Returns FALSE straight away when piece->pipe->tiling is set and the
 *    operator is OPERATOR_DRAGO: per the in-code comment, drago needs the
 *    maximum L value of the whole image, and returning FALSE falls back to
 *    CPU processing.
 *  - width and height are taken from roi_out (not roi_in).
 *  - blocksize starts at BLOCKSIZE and is used as the edge of a square
 *    block: it is halved until blocksize does not exceed maxsizes[0] and
 *    maxsizes[1], blocksize * blocksize does not exceed kernelworkgroupsize
 *    (queried for gd->kernel_pixelmax_first) and workgroupsize, and
 *    blocksize * blocksize floats fit into localmemsize. The loop also
 *    stops once blocksize has reached 1.
 *
 * NOTE(review): the definition is cut off right after the opening of the
 * else branch that handles a failed limit query; the remainder of the body
 * is not visible here.
 */
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { dt_iop_global_tonemap_data_t *d = (dt_iop_global_tonemap_data_t *)piece->data; dt_iop_global_tonemap_global_data_t *gd = (dt_iop_global_tonemap_global_data_t *)self->data; dt_iop_global_tonemap_gui_data_t *g = (dt_iop_global_tonemap_gui_data_t *)self->gui_data; dt_bilateral_cl_t *b = NULL; // check if we are in a tiling context and want OPERATOR_DRAGO. This does not work as drago // needs the maximum L-value of the whole image. Let's return FALSE, which will then fall back // to cpu processing if(piece->pipe->tiling && d->operator== OPERATOR_DRAGO) return FALSE; cl_int err = -999; cl_mem dev_m = NULL; cl_mem dev_r = NULL; const int devid = piece->pipe->devid; int gtkernel = -1; const int width = roi_out->width; const int height = roi_out->height; float parameters[4] = { 0.0f }; // prepare local work group size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group size_t workgroupsize = 0; // the maximum number of items in a work group unsigned long localmemsize = 0; // the maximum amount of local memory we can use size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel // make sure blocksize is not too large int blocksize = BLOCKSIZE; if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS && dt_opencl_get_kernel_work_group_size(devid, gd->kernel_pixelmax_first, &kernelworkgroupsize) == CL_SUCCESS) { // reduce blocksize step by step until it fits to limits while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize * blocksize > kernelworkgroupsize || blocksize * blocksize > workgroupsize || blocksize * blocksize * sizeof(float) > localmemsize) { if(blocksize == 1) break; blocksize >>= 1; } } else {
/*
 * OpenCL code path working on dt_iop_bloom_data_t (piece->data) and the
 * kernels stored in dt_iop_bloom_global_data_t (self->data).
 *
 * Steps visible in this fragment:
 *  - dev_tmp[] holds NUM_BUCKETS buffer handles, all initialised to NULL;
 *    dev_tmp1 and dev_tmp2 are declared without an initial value.
 *  - threshold is taken unchanged from d->threshold.
 *  - rad = 256 * min(100, d->size + 1) / 100 (truncated to int), then
 *    radius = MIN(256, ceilf(rad * roi_in->scale / piece->iscale)).
 *  - scale = 1 / exp2f(-min(100, d->strength + 1) / 100).
 *  - blocksize starts at BLOCKSIZE and is halved until it does not exceed
 *    maxsizes[0], maxsizes[1], kernelworkgroupsize (queried for
 *    gd->kernel_bloom_hblur) and workgroupsize, and until
 *    (blocksize + 2 * radius) floats fit into localmemsize. The loop also
 *    stops once blocksize has reached 1.
 *
 * NOTE(review): the definition is cut off right after the opening of the
 * else branch that handles a failed limit query; the remainder of the body
 * is not visible here.
 */
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { const dt_iop_bloom_data_t *d = (dt_iop_bloom_data_t *)piece->data; const dt_iop_bloom_global_data_t *gd = (dt_iop_bloom_global_data_t *)self->data; cl_int err = -999; cl_mem dev_tmp[NUM_BUCKETS] = { NULL }; cl_mem dev_tmp1; cl_mem dev_tmp2; unsigned int state = 0; const int devid = piece->pipe->devid; const int width = roi_in->width; const int height = roi_in->height; const float threshold = d->threshold; const int rad = 256.0f * (fmin(100.0f, d->size + 1.0f) / 100.0f); const float _r = ceilf(rad * roi_in->scale / piece->iscale); const int radius = MIN(256.0f, _r); const float scale = 1.0f / exp2f(-1.0f * (fmin(100.0f, d->strength + 1.0f) / 100.0f)); size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group size_t workgroupsize = 0; // the maximum number of items in a work group unsigned long localmemsize = 0; // the maximum amount of local memory we can use size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel // make sure blocksize is not too large int blocksize = BLOCKSIZE; if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS && dt_opencl_get_kernel_work_group_size(devid, gd->kernel_bloom_hblur, &kernelworkgroupsize) == CL_SUCCESS) { // reduce blocksize step by step until it fits to limits while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize > kernelworkgroupsize || blocksize > workgroupsize || (blocksize + 2 * radius) * sizeof(float) > localmemsize) { if(blocksize == 1) break; blocksize >>= 1; } } else {
/*
 * OpenCL code path working on dt_iop_lowpass_data_t (piece->data) and the
 * kernels stored in dt_iop_lowpass_global_data_t (self->data).
 *
 * Steps visible in this fragment:
 *  - bpp = 4 * sizeof(float) is the byte count used per element in the
 *    local-memory estimate below.
 *  - blocksize starts at BLOCKSIZE and is used as the edge of a square
 *    block: it is halved until blocksize does not exceed maxsizes[0] and
 *    maxsizes[1], blocksize * blocksize does not exceed workgroupsize, and
 *    blocksize * (blocksize + 1) * bpp bytes fit into localmemsize. The
 *    loop also stops once blocksize has reached 1.
 *  - blockwd and blockht both start as blocksize. The per-kernel limit
 *    (queried for gd->kernel_gaussian_transpose) is applied afterwards and
 *    only to the height: if blockwd * blockht exceeds kernelworkgroupsize,
 *    blockht becomes kernelworkgroupsize / blockwd.
 *
 * NOTE(review): the definition is cut off right after the opening of the
 * else branch that handles a failed limit query; the remainder of the body
 * is not visible here.
 */
int process_cl (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { dt_iop_lowpass_data_t *d = (dt_iop_lowpass_data_t *)piece->data; dt_iop_lowpass_global_data_t *gd = (dt_iop_lowpass_global_data_t *)self->data; cl_int err = -999; const int devid = piece->pipe->devid; const int width = roi_in->width; const int height = roi_in->height; const int bpp = 4*sizeof(float); // check if we need to reduce blocksize size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group size_t workgroupsize = 0; // the maximum number of items in a work group unsigned long localmemsize = 0; // the maximum amount of local memory we can use size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel // make sure blocksize is not too large int blocksize = BLOCKSIZE; int blockwd; int blockht; if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS && dt_opencl_get_kernel_work_group_size(devid, gd->kernel_gaussian_transpose, &kernelworkgroupsize) == CL_SUCCESS) { // reduce blocksize step by step until it fits to limits while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize*blocksize > workgroupsize || blocksize*(blocksize+1)*bpp > localmemsize) { if(blocksize == 1) break; blocksize >>= 1; } blockwd = blockht = blocksize; if(blockwd * blockht > kernelworkgroupsize) blockht = kernelworkgroupsize / blockwd; } else {
/*
 * Allocates and fills a dt_gaussian_cl_t for the given device and image
 * geometry.
 *
 * Parameters (as annotated in the signature): devid, width and height of
 * the input image, channels per pixel, per-channel max and min clamping
 * values, the gaussian sigma and the order of the blur.
 *
 * Steps visible in this fragment:
 *  - channels must be 1 or 4: this is asserted, and additionally checked
 *    at run time so that NULL is returned when the assert is compiled out.
 *  - The struct is obtained with malloc(); NULL is returned if that fails.
 *  - g->global is taken from darktable.opencl->gaussian, the scalar
 *    arguments are stored, and dev_temp1 / dev_temp2 start out as NULL.
 *  - g->max and g->min are heap arrays of `channels` floats; if either
 *    allocation fails control jumps to the error label. Otherwise the
 *    caller's max[] and min[] values are copied in.
 *  - The transpose kernel is chosen by channel count
 *    (kernel_gaussian_transpose_1c or kernel_gaussian_transpose_4c).
 *  - blocksize starts at 64 and is used as the edge of a square block: it
 *    is halved until blocksize does not exceed maxsizes[0] and maxsizes[1],
 *    blocksize * blocksize does not exceed workgroupsize, and
 *    blocksize * (blocksize + 1) * channels floats fit into localmemsize.
 *    The loop also stops once blocksize has reached 1.
 *  - blockwd and blockht both start as blocksize. The per-kernel limit is
 *    applied afterwards and only to the height: if blockwd * blockht
 *    exceeds kernelworkgroupsize, blockht becomes
 *    kernelworkgroupsize / blockwd.
 *
 * NOTE(review): the definition is cut off right after the opening of the
 * else branch that handles a failed limit query; the error label, the
 * cleanup it performs and the success return are not visible here.
 */
dt_gaussian_cl_t * dt_gaussian_init_cl( const int devid, const int width, // width of input image const int height, // height of input image const int channels, // channels per pixel const float *max, // maximum allowed values per channel for clamping const float *min, // minimum allowed values per channel for clamping const float sigma, // gaussian sigma const int order) // order of gaussian blur { assert(channels == 1 || channels == 4); if(!(channels == 1 || channels == 4)) return NULL; dt_gaussian_cl_t *g = (dt_gaussian_cl_t *)malloc(sizeof(dt_gaussian_cl_t)); if(!g) return NULL; g->global = darktable.opencl->gaussian; g->devid = devid; g->width = width; g->height = height; g->channels = channels; g->sigma = sigma; g->order = order; g->dev_temp1 = NULL; g->dev_temp2 = NULL; g->max = (float *)malloc(channels * sizeof(float)); g->min = (float *)malloc(channels * sizeof(float)); if(!g->min || !g->max) goto error; for(int k=0; k < channels; k++) { g->max[k] = max[k]; g->min[k] = min[k]; } // check if we need to reduce blocksize size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group size_t workgroupsize = 0; // the maximum number of items in a work group unsigned long localmemsize = 0; // the maximum amount of local memory we can use size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel // make sure blocksize is not too large int kernel_gaussian_transpose = (channels == 1) ? 
g->global->kernel_gaussian_transpose_1c : g->global->kernel_gaussian_transpose_4c; size_t blocksize = 64; int blockwd; int blockht; if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS && dt_opencl_get_kernel_work_group_size(devid, kernel_gaussian_transpose, &kernelworkgroupsize) == CL_SUCCESS) { // reduce blocksize step by step until it fits to limits while(blocksize > maxsizes[0] || blocksize > maxsizes[1] || blocksize*blocksize > workgroupsize || blocksize*(blocksize+1)*channels*sizeof(float) > localmemsize) { if(blocksize == 1) break; blocksize >>= 1; } blockwd = blockht = blocksize; if(blockwd * blockht > kernelworkgroupsize) blockht = kernelworkgroupsize / blockwd; } else {
/*
 * OpenCL code path working on dt_iop_highlights_data_t (piece->data) and
 * the kernels stored in dt_iop_highlights_global_data_t (self->data).
 *
 * clip is d->clip multiplied by the smallest of
 * piece->pipe->dsc.processed_maximum[0..2]; filters is
 * piece->pipe->dsc.filters. One of four branches is then taken:
 *
 *  1. filters == 0 (non-raw, per the in-code comment):
 *     kernel_highlights_4f_clip with arguments dev_in, dev_out, width,
 *     height, d->mode, clip.
 *  2. d->mode == DT_IOP_HIGHLIGHTS_CLIP (raw, bayer and xtrans alike):
 *     kernel_highlights_1f_clip with arguments dev_in, dev_out, width,
 *     height, clip, roi_out->x, roi_out->y, filters.
 *  3. d->mode == DT_IOP_HIGHLIGHTS_LCH and filters != 9u (bayer):
 *     kernel_highlights_1f_lch_bayer with the same eight arguments as
 *     branch 2.
 *  4. d->mode == DT_IOP_HIGHLIGHTS_LCH and filters == 9u (xtrans):
 *     a work-group shape is searched for kernel_highlights_1f_lch_xtrans.
 *     blocksizex and blocksizey both start at 1 << 8; on every iteration
 *     the larger of the two is halved (blocksizey when they are equal)
 *     until both fit maxsizes[0] / maxsizes[1], their product fits
 *     workgroupsize and kernelworkgroupsize, and
 *     (blocksizex + 4) * (blocksizey + 4) floats fit into localmemsize.
 *     The loop also stops once both have reached 1.
 *
 * Branches 1-3 enqueue their kernel over
 * { ROUNDUPWD(width), ROUNDUPHT(height), 1 } with
 * dt_opencl_enqueue_kernel_2d() and jump to the error label on failure.
 *
 * NOTE(review): SOURCE ends after the block-size search in branch 4; how
 * the xtrans kernel is launched, what happens when the limit query fails,
 * and the error label are not visible here.
 */
int process_cl(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, cl_mem dev_out, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { dt_iop_highlights_data_t *d = (dt_iop_highlights_data_t *)piece->data; dt_iop_highlights_global_data_t *gd = (dt_iop_highlights_global_data_t *)self->data; cl_int err = -999; cl_mem dev_xtrans = NULL; const int devid = piece->pipe->devid; const int width = roi_in->width; const int height = roi_in->height; const float clip = d->clip * fminf(piece->pipe->dsc.processed_maximum[0], fminf(piece->pipe->dsc.processed_maximum[1], piece->pipe->dsc.processed_maximum[2])); const uint32_t filters = piece->pipe->dsc.filters; if(!filters) { // non-raw images use dedicated kernel which just clips size_t sizes[] = { ROUNDUPWD(width), ROUNDUPHT(height), 1 }; dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_4f_clip, 0, sizeof(cl_mem), (void *)&dev_in); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_4f_clip, 1, sizeof(cl_mem), (void *)&dev_out); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_4f_clip, 2, sizeof(int), (void *)&width); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_4f_clip, 3, sizeof(int), (void *)&height); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_4f_clip, 4, sizeof(int), (void *)&d->mode); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_4f_clip, 5, sizeof(float), (void *)&clip); err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_highlights_4f_clip, sizes); if(err != CL_SUCCESS) goto error; } else if(d->mode == DT_IOP_HIGHLIGHTS_CLIP) { // raw images with clip mode (both bayer and xtrans) size_t sizes[] = { ROUNDUPWD(width), ROUNDUPHT(height), 1 }; dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_clip, 0, sizeof(cl_mem), (void *)&dev_in); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_clip, 1, sizeof(cl_mem), (void *)&dev_out); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_clip, 2, sizeof(int), (void *)&width); 
dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_clip, 3, sizeof(int), (void *)&height); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_clip, 4, sizeof(float), (void *)&clip); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_clip, 5, sizeof(int), (void *)&roi_out->x); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_clip, 6, sizeof(int), (void *)&roi_out->y); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_clip, 7, sizeof(int), (void *)&filters); err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_highlights_1f_clip, sizes); if(err != CL_SUCCESS) goto error; } else if(d->mode == DT_IOP_HIGHLIGHTS_LCH && filters != 9u) { // bayer sensor raws with LCH mode size_t sizes[] = { ROUNDUPWD(width), ROUNDUPHT(height), 1 }; dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_lch_bayer, 0, sizeof(cl_mem), (void *)&dev_in); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_lch_bayer, 1, sizeof(cl_mem), (void *)&dev_out); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_lch_bayer, 2, sizeof(int), (void *)&width); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_lch_bayer, 3, sizeof(int), (void *)&height); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_lch_bayer, 4, sizeof(float), (void *)&clip); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_lch_bayer, 5, sizeof(int), (void *)&roi_out->x); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_lch_bayer, 6, sizeof(int), (void *)&roi_out->y); dt_opencl_set_kernel_arg(devid, gd->kernel_highlights_1f_lch_bayer, 7, sizeof(int), (void *)&filters); err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_highlights_1f_lch_bayer, sizes); if(err != CL_SUCCESS) goto error; } else if(d->mode == DT_IOP_HIGHLIGHTS_LCH && filters == 9u) { // xtrans sensor raws with LCH mode // we use local buffering for speed reasons; determine suited work group size size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group size_t workgroupsize = 0; // the 
maximum number of items in a work group unsigned long localmemsize = 0; // the maximum amount of local memory we can use size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel int blocksizex = 1 << 8; int blocksizey = 1 << 8; if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS && dt_opencl_get_kernel_work_group_size(devid, gd->kernel_highlights_1f_lch_xtrans, &kernelworkgroupsize) == CL_SUCCESS) { while(maxsizes[0] < blocksizex || maxsizes[1] < blocksizey || localmemsize < (blocksizex + 4) * (blocksizey + 4) * sizeof(float) || workgroupsize < blocksizex * blocksizey || kernelworkgroupsize < blocksizex * blocksizey) { if(blocksizex == 1 && blocksizey == 1) break; if(blocksizex > blocksizey) blocksizex >>= 1; else blocksizey >>= 1; } }