/**
 * Local contrast enhancement of one region of interest.
 *
 * Pass 1 builds a luminance map, (max(R,G,B) + min(R,G,B)) / 2 of the clipped
 * channel values. Pass 2 walks every row with a sliding window of radius `rad`,
 * maintains a (BINS + 1)-bin histogram of the window, clips it against a limit
 * derived from `data->slope`, redistributes the clipped counts, and maps the
 * pixel through the resulting cumulative distribution. The equalized value is
 * then used as the L component when the pixel is converted with
 * rgb2hsl()/hsl2rgb().
 *
 * The input and output buffers are both addressed with roi_out->width, while
 * the luminance map is addressed with roi_in->width.
 * NOTE(review): this presumably relies on roi_in and roi_out having identical
 * dimensions - confirm against the caller.
 *
 * If a scratch buffer cannot be allocated the input is copied to the output
 * unchanged.
 *
 * @param self    module instance (unused here)
 * @param piece   pixelpipe piece; provides data, colors and iscale
 * @param ivoid   input pixels, `piece->colors` floats per pixel
 * @param ovoid   output pixels, same layout as the input
 * @param roi_in  input region of interest
 * @param roi_out output region of interest
 */
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
             void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  dt_iop_rlce_data_t *data = (dt_iop_rlce_data_t *)piece->data;
  const int ch = piece->colors;

  // scratch: full luminance map plus one destination row per thread
  float *luminance = (float *)malloc(((size_t)roi_out->width * roi_out->height) * sizeof(float));
  const size_t destbuf_size = roi_out->width;
  float *const dest_buf = malloc(destbuf_size * sizeof(float) * dt_get_num_threads());
  if(luminance == NULL || dest_buf == NULL)
  {
    // out of memory: pass the image through untouched instead of crashing
    free(dest_buf);
    free(luminance);
    memcpy(ovoid, ivoid, (size_t)roi_out->width * roi_out->height * ch * sizeof(float));
    return;
  }

// PASS1: Get a luminance map of image...
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) shared(luminance)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    float *in = (float *)ivoid + (size_t)j * roi_out->width * ch;
    float *lm = luminance + (size_t)j * roi_out->width;
    for(int i = 0; i < roi_out->width; i++)
    {
      double pmax = CLIP(fmax(in[0], fmax(in[1], in[2]))); // Max value in RGB set
      double pmin = CLIP(fmin(in[0], fmin(in[1], in[2]))); // Min value in RGB set
      *lm = (pmax + pmin) / 2.0;                           // Pixel luminosity
      in += ch;
      lm++;
    }
  }

  // Params
  const int rad = data->radius * roi_in->scale / piece->iscale;

#define BINS (256)

  const float slope = data->slope;

// CLAHE
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) shared(luminance)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    int yMin = fmax(0, j - rad);
    int yMax = fmin(roi_in->height, j + rad + 1);
    int h = yMax - yMin;

    int xMin0 = fmax(0, 0 - rad);
    // Columns [0, rad) are pre-loaded here; column `rad` is added by the first
    // pixel of the row. Clamping against width (not width - 1) keeps the last
    // column in the histogram when the radius is at least as wide as the row,
    // because in that case no column is ever added later.
    int xMax0 = fmin(roi_in->width, rad);

    int hist[BINS + 1];
    int clippedhist[BINS + 1];

    float *dest = dest_buf + destbuf_size * dt_get_thread_num();

    /* initially fill histogram */
    memset(hist, 0, (BINS + 1) * sizeof(int));
    for(int yi = yMin; yi < yMax; ++yi)
      for(int xi = xMin0; xi < xMax0; ++xi)
        ++hist[ROUND_POSISTIVE(luminance[(size_t)yi * roi_in->width + xi] * (float)BINS)];

    // Destination row
    memset(dest, 0, roi_out->width * sizeof(float));
    float *ld = dest;

    for(int i = 0; i < roi_out->width; i++)
    {
      const float lum = luminance[(size_t)j * roi_in->width + i];
      int v = ROUND_POSISTIVE(lum * (float)BINS);

      int xMin = fmax(0, i - rad);
      int xMax = i + rad + 1;
      int w = fmin(roi_in->width, xMax) - xMin;
      int n = h * w;

      int limit = (int)(slope * n / BINS + 0.5f);

      /* remove left behind values from histogram */
      if(xMin > 0)
      {
        int xMin1 = xMin - 1;
        for(int yi = yMin; yi < yMax; ++yi)
          --hist[ROUND_POSISTIVE(luminance[(size_t)yi * roi_in->width + xMin1] * (float)BINS)];
      }

      /* add newly included values to histogram */
      if(xMax <= roi_in->width)
      {
        int xMax1 = xMax - 1;
        for(int yi = yMin; yi < yMax; ++yi)
          ++hist[ROUND_POSISTIVE(luminance[(size_t)yi * roi_in->width + xMax1] * (float)BINS)];
      }

      /* clip histogram and redistribute clipped entries */
      memcpy(clippedhist, hist, (BINS + 1) * sizeof(int));
      int ce = 0, ceb = 0;
      do
      {
        ceb = ce;
        ce = 0;
        for(int b = 0; b <= BINS; b++)
        {
          int d = clippedhist[b] - limit;
          if(d > 0)
          {
            ce += d;
            clippedhist[b] = limit;
          }
        }

        int d = (ce / (float)(BINS + 1));
        int m = ce % (BINS + 1);
        for(int b = 0; b <= BINS; b++) clippedhist[b] += d;

        if(m != 0)
        {
          int s = BINS / (float)m;
          for(int b = 0; b <= BINS; b += s) ++clippedhist[b];
        }
      } while(ce != ceb); // repeat until the clipped excess stops changing

      /* build cdf of clipped histogram */
      int hMin = BINS;
      for(int b = 0; b < hMin; b++)
        if(clippedhist[b] != 0) hMin = b;

      int cdf = 0;
      for(int b = hMin; b <= v; b++) cdf += clippedhist[b];

      int cdfMax = cdf;
      for(int b = v + 1; b <= BINS; b++) cdfMax += clippedhist[b];

      int cdfMin = clippedhist[hMin];

      // All mass in a single bin leaves nothing to equalize and would divide
      // by zero; keep the pixel's own luminance in that case.
      const int cdfRange = cdfMax - cdfMin;
      *ld = (cdfRange != 0) ? (cdf - cdfMin) / (float)cdfRange : lum;

      ld++;
    }

    // Apply row
    float *in = ((float *)ivoid) + (size_t)j * roi_out->width * ch;
    float *out = ((float *)ovoid) + (size_t)j * roi_out->width * ch;
    for(int r = 0; r < roi_out->width; r++)
    {
      float H, S, L;
      rgb2hsl(in, &H, &S, &L);
      // keep hue and saturation, replace lightness by the equalized value
      hsl2rgb(out, H, S, dest[r]);
      out += ch;
      in += ch;
    }
  }

  free(dest_buf);

  // Cleanup
  free(luminance);

#undef BINS
}
/**
 * Bloom: blur the thresholded first channel and screen-blend it back.
 *
 * Steps:
 *  1. copy the input to the output;
 *  2. collect `inp[0] * scale` for every pixel where that value exceeds
 *     `data->threshold` into a single-channel buffer (other pixels stay 0);
 *  3. run BOX_ITERATIONS rounds of a horizontal and a vertical running-mean
 *     box blur of half-width `hr` over that buffer;
 *  4. screen-blend the blurred buffer with channel 0 on a 0..100 scale and
 *     copy channels 1 and 2 through unchanged.
 *
 * If a scratch buffer cannot be allocated the function returns after step 1,
 * i.e. the output is an unmodified copy of the input.
 *
 * @param self    module instance (unused here)
 * @param piece   pixelpipe piece; provides data, colors, iscale and pipe
 * @param ivoid   input pixels, `piece->colors` floats per pixel
 * @param ovoid   output pixels, same layout as the input
 * @param roi_in  input region of interest (only its scale is read)
 * @param roi_out output region of interest (defines the buffer dimensions)
 */
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
             void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  dt_iop_bloom_data_t *data = (dt_iop_bloom_data_t *)piece->data;
  float *in = (float *)ivoid;
  float *out = (float *)ovoid;
  const int ch = piece->colors;

  // start from a verbatim copy so every early exit leaves valid output
  memcpy(out, in, (size_t)roi_out->width * roi_out->height * ch * sizeof(float));

  const int rad = 256.0f * (fmin(100.0f, data->size + 1.0f) / 100.0f);
  const float _r = ceilf(rad * roi_in->scale / piece->iscale);
  const int radius = MIN(256.0f, _r);

  const float scale = 1.0f / exp2f(-1.0f * (fmin(100.0f, data->strength + 1.0f) / 100.0f));

  const int range = 2 * radius + 1;
  const int hr = range / 2;

  // one scanline per thread, long enough for either a row or a column
  const size_t size = roi_out->width > roi_out->height ? roi_out->width : roi_out->height;

  /* gather light by threshold */
  float *blurlightness = calloc((size_t)roi_out->width * roi_out->height, sizeof(float));
  float *const scanline_buf = malloc(size * dt_get_num_threads() * sizeof(float));
  if(blurlightness == NULL || scanline_buf == NULL)
  {
    free(scanline_buf);
    free(blurlightness);
    return;
  }

/* get the thresholded lights into buffer */
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(data, blurlightness) schedule(static)
#endif
  for(size_t k = 0; k < (size_t)roi_out->width * roi_out->height; k++)
  {
    float *inp = ((float *)ivoid) + ch * k;
    const float L = inp[0] * scale;
    if(L > data->threshold) blurlightness[k] = L;
  }

  for(int iteration = 0; iteration < BOX_ITERATIONS; iteration++)
  {
/* horizontal blur into memchannel lightness */
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(blurlightness) schedule(static)
#endif
    for(int y = 0; y < roi_out->height; y++)
    {
      float *scanline = scanline_buf + size * dt_get_thread_num();
      float L = 0;
      int hits = 0;
      const size_t index = (size_t)y * roi_out->width;
      // x starts at -hr so the window is already filled when x reaches 0
      for(int x = -hr; x < roi_out->width; x++)
      {
        int op = x - hr - 1; // sample leaving the window
        int np = x + hr;     // sample entering the window
        if(op >= 0)
        {
          L -= blurlightness[index + op];
          hits--;
        }
        if(np < roi_out->width)
        {
          L += blurlightness[index + np];
          hits++;
        }
        if(x >= 0) scanline[x] = L / hits;
      }

      for(int x = 0; x < roi_out->width; x++) blurlightness[index + x] = scanline[x];
    }

/* vertical pass on blurlightness */
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(blurlightness) schedule(static)
#endif
    for(int x = 0; x < roi_out->width; x++)
    {
      float *scanline = scanline_buf + size * dt_get_thread_num();
      float L = 0;
      int hits = 0;
      for(int y = -hr; y < roi_out->height; y++)
      {
        const int op = y - hr - 1; // row leaving the window
        const int np = y + hr;     // row entering the window (never negative)
        // rows are addressed directly so no index ever has to wrap around
        if(op >= 0)
        {
          L -= blurlightness[(size_t)op * roi_out->width + x];
          hits--;
        }
        if(np < roi_out->height)
        {
          L += blurlightness[(size_t)np * roi_out->width + x];
          hits++;
        }
        if(y >= 0) scanline[y] = L / hits;
      }

      for(int y = 0; y < roi_out->height; y++)
        blurlightness[(size_t)y * roi_out->width + x] = scanline[y];
    }
  }
  free(scanline_buf);

/* screen blend lightness with original */
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(in, out, data, blurlightness) schedule(static)
#endif
  for(size_t k = 0; k < (size_t)roi_out->width * roi_out->height; k++)
  {
    float *inp = in + ch * k;
    float *outp = out + ch * k;
    outp[0] = 100.0f - (((100.0f - inp[0]) * (100.0f - blurlightness[k])) / 100.0f); // Screen blend
    outp[1] = inp[1];
    outp[2] = inp[2];
  }

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);

  free(blurlightness);
}
/**
 * Separable recursive blur of an interleaved float image.
 *
 * Filter coefficients come from compute_gauss_params(g->sigma, g->order, ...).
 * The image is filtered column by column from `in` into `g->buf`, then row by
 * row from `g->buf` into `out`. Each 1D pass is a forward recursion followed
 * by a backward recursion whose result is added to the forward one. Samples
 * are clamped to [g->min[k], g->max[k]] per channel before filtering.
 *
 * Every thread owns nine scratch vectors of `ch` floats each (xp, yb, yp, xc,
 * yc, xn, xa, yn, ya). They must not overlap, so each one is placed at a
 * multiple of `ch` inside the thread's 9 * ch slice.
 *
 * If the scratch allocation fails, `in` is copied to `out` unfiltered.
 *
 * @param g   blur state: width, height, channels, sigma, order, buf, min, max
 * @param in  input image, width * height * channels floats
 * @param out output image, same layout as the input
 */
void dt_gaussian_blur(dt_gaussian_t *g, const float *const in, float *const out)
{
  const int width = g->width;
  const int height = g->height;
  const int ch = g->channels;

  float a0, a1, a2, a3, b1, b2, coefp, coefn;

  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

  float *const temp = g->buf;

  float *const Labmax = g->max;
  float *const Labmin = g->min;

  float *const buf = malloc((size_t)9 * ch * dt_get_num_threads() * sizeof(float));
  if(buf == NULL)
  {
    // out of memory: hand back the unfiltered image rather than dereference NULL
    if(out != in)
    {
      const size_t total = (size_t)width * height * ch;
      for(size_t p = 0; p < total; p++) out[p] = in[p];
    }
    return;
  }

// vertical blur column by column
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(a0, a1, a2, a3, b1, b2, coefp, coefn) schedule(static)
#endif
  for(int i = 0; i < width; i++)
  {
    // this thread's slice, split into nine disjoint ch-sized vectors
    float *const tbuf = buf + (size_t)9 * ch * dt_get_thread_num();
    float *xp = tbuf + (size_t)0 * ch;
    float *yb = tbuf + (size_t)1 * ch;
    float *yp = tbuf + (size_t)2 * ch;
    float *xc = tbuf + (size_t)3 * ch;
    float *yc = tbuf + (size_t)4 * ch;
    float *xn = tbuf + (size_t)5 * ch;
    float *xa = tbuf + (size_t)6 * ch;
    float *yn = tbuf + (size_t)7 * ch;
    float *ya = tbuf + (size_t)8 * ch;

    // forward filter: seed the recursion with the first sample of the column
    for(int k = 0; k < ch; k++)
    {
      xp[k] = CLAMPF(in[(size_t)i * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int j = 0; j < height; j++)
    {
      size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(in[offset + k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        temp[offset + k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: seed with the last sample of the column
    for(int k = 0; k < ch; k++)
    {
      xn[k] = CLAMPF(in[((size_t)(height - 1) * width + i) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int j = height - 1; j > -1; j--)
    {
      size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(in[offset + k], Labmin[k], Labmax[k]);

        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        temp[offset + k] += yc[k];
      }
    }
  }

// horizontal blur line by line
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(a0, a1, a2, a3, b1, b2, coefp, coefn) schedule(static)
#endif
  for(int j = 0; j < height; j++)
  {
    float *const tbuf = buf + (size_t)9 * ch * dt_get_thread_num();
    float *xp = tbuf + (size_t)0 * ch;
    float *yb = tbuf + (size_t)1 * ch;
    float *yp = tbuf + (size_t)2 * ch;
    float *xc = tbuf + (size_t)3 * ch;
    float *yc = tbuf + (size_t)4 * ch;
    float *xn = tbuf + (size_t)5 * ch;
    float *xa = tbuf + (size_t)6 * ch;
    float *yn = tbuf + (size_t)7 * ch;
    float *ya = tbuf + (size_t)8 * ch;

    // forward filter: seed with the first sample of the row
    for(int k = 0; k < ch; k++)
    {
      xp[k] = CLAMPF(temp[(size_t)j * width * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int i = 0; i < width; i++)
    {
      size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(temp[offset + k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        out[offset + k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: seed with the last sample of the row
    for(int k = 0; k < ch; k++)
    {
      xn[k] = CLAMPF(temp[((size_t)(j + 1) * width - 1) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int i = width - 1; i > -1; i--)
    {
      size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(temp[offset + k], Labmin[k], Labmax[k]);

        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        out[offset + k] += yc[k];
      }
    }
  }

  free(buf);
}
/**
 * process, all real work is done here.
 *
 * Non-local-means style denoising on 4-float pixels. For every shift vector
 * (ki, kj) in [-K, K]^2 the per-column sum of squared, channel-weighted
 * differences between the image and its shifted copy over a vertical stripe of
 * up to 2P+1 rows is kept in a per-thread row buffer S. A horizontal sliding
 * sum over 2P+1 columns of S gives the patch distance `slide`; the shifted
 * pixel is accumulated into the output with weight gh(slide), and the weight
 * itself is accumulated in the fourth output channel. Finally each output
 * pixel is divided by its accumulated weight and blended with the input using
 * powf(d->luma, 0.6) / powf(d->chroma, 0.6).
 *
 * The input is copied through unchanged when
 *  - P <= 1 (nothing to do at this zoom level),
 *  - the row is narrower than the 2P+1 column window (the window sum would
 *    otherwise read past the per-thread row buffer), or
 *  - the row buffer cannot be allocated.
 *
 * NOTE(review): the buffer comes from dt_alloc_align() but is released with
 * free(); confirm that this pairing is valid on every supported platform.
 *
 * @param self    module instance (unused here)
 * @param piece   pixelpipe piece; provides data and iscale
 * @param ivoid   input pixels, 4 floats per pixel; loaded with _mm_load_ps
 * @param ovoid   output pixels, 4 floats per pixel; accessed with _mm_load_ps/_mm_store_ps
 * @param roi_in  input region of interest
 * @param roi_out output region of interest
 */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  // this is called for preview and full pipe separately, each with its own pixelpipe piece.
  // get our data struct:
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;

  // adjust to zoom size:
  const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size
  const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood
  if(P <= 1 || roi_out->width < 2*P+1)
  {
    // nothing to do from this distance, or the row cannot hold one full window:
    memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);
    return;
  }

  // adjust to Lab, make L more important
  float max_L = 120.0f, max_C = 512.0f;
  float nL = 1.0f/max_L, nC = 1.0f/max_C;
  const float norm2[4] = { nL*nL, nC*nC, nC*nC, 1.0f };

  float *Sa = dt_alloc_align(64, sizeof(float)*roi_out->width*dt_get_num_threads());
  if(Sa == NULL)
  {
    // out of memory: pass the image through instead of dereferencing NULL
    memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);
    return;
  }

  // we want to sum up weights in col[3], so need to init to 0:
  memset(ovoid, 0x0, sizeof(float)*roi_out->width*roi_out->height*4);

  // for each shift vector
  for(int kj=-K; kj<=K; kj++)
  {
    for(int ki=-K; ki<=K; ki++)
    {
      int inited_slide = 0;
      // don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else we will add up errors)
      // do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more memory
#ifdef _OPENMP
#  pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, ivoid, ovoid, Sa)
#endif
      for(int j=0; j<roi_out->height; j++)
      {
        // skip rows whose shifted counterpart lies outside the image
        if(j+kj < 0 || j+kj >= roi_out->height) continue;
        float *S = Sa + dt_get_thread_num() * roi_out->width;
        const float *ins = ((float *)ivoid) + 4*(roi_in->width *(j+kj) + ki);
        float *out = ((float *)ovoid) + 4*roi_out->width*j;

        // number of rows above (Pm) and below (PM) that fit into the image
        const int Pm = MIN(MIN(P, j+kj), j);
        const int PM = MIN(MIN(P, roi_out->height-1-j-kj), roi_out->height-1-j);
        // first line of every thread
        // TODO: also every once in a while to assert numerical precision!
        if(!inited_slide)
        {
          // sum up a line
          memset(S, 0x0, sizeof(float)*roi_out->width);
          for(int jj=-Pm; jj<=PM; jj++)
          {
            int i = MAX(0, -ki);
            float *s = S + i;
            const float *inp  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+jj);
            const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+jj+kj) + ki);
            const int last = roi_out->width + MIN(0, -ki);
            for(; i<last; i++, inp+=4, inps+=4, s++)
            {
              for(int k=0; k<3; k++)
                s[0] += (inp[k] - inps[k])*(inp[k] - inps[k]) * norm2[k];
            }
          }
          // only reuse this if we had a full stripe
          if(Pm == P && PM == P) inited_slide = 1;
        }

        // sliding window for this line:
        float *s = S;
        float slide = 0.0f;
        // sum up the first -P..P
        for(int i=0; i<2*P+1; i++) slide += s[i];
        for(int i=0; i<roi_out->width; i++)
        {
          if(i-P > 0 && i+P<roi_out->width)
            slide += s[P] - s[-P-1];
          if(i+ki >= 0 && i+ki < roi_out->width)
          {
            // accumulate the weighted shifted pixel; lane 3 collects the weight sum
            const __m128 iv = { ins[0], ins[1], ins[2], 1.0f };
            _mm_store_ps(out, _mm_load_ps(out) + iv * _mm_set1_ps(gh(slide)));
          }
          s ++;
          ins += 4;
          out += 4;
        }
        if(inited_slide && j+P+1+MAX(0,kj) < roi_out->height)
        {
          // sliding window in j direction: add row j+P+1, drop row j-P
          int i = MAX(0, -ki);
          float *s = S + i;
          const float *inp  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+P+1);
          const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+P+1+kj) + ki);
          const float *inm  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j-P);
          const float *inms = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j-P+kj) + ki);
          const int last = roi_out->width + MIN(0, -ki);
          // scalar prologue until s is 16-byte aligned for _mm_load_ps/_mm_store_ps
          for(; ((size_t)s & 0xf) != 0 && i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0; k<3; k++)
              stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
                       - (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
          /* Process most of the line 4 pixels at a time */
          for(; i<last-4; i+=4, inp+=16, inps+=16, inm+=16, inms+=16, s+=4)
          {
            __m128 sv = _mm_load_ps(s);
            const __m128 inp1 = _mm_load_ps(inp) - _mm_load_ps(inps);
            const __m128 inp2 = _mm_load_ps(inp+4) - _mm_load_ps(inps+4);
            const __m128 inp3 = _mm_load_ps(inp+8) - _mm_load_ps(inps+8);
            const __m128 inp4 = _mm_load_ps(inp+12) - _mm_load_ps(inps+12);

            // transpose so each vector holds one channel of four pixels
            const __m128 inp12lo = _mm_unpacklo_ps(inp1,inp2);
            const __m128 inp34lo = _mm_unpacklo_ps(inp3,inp4);
            const __m128 inp12hi = _mm_unpackhi_ps(inp1,inp2);
            const __m128 inp34hi = _mm_unpackhi_ps(inp3,inp4);

            const __m128 inpv0 = _mm_movelh_ps(inp12lo,inp34lo);
            sv += inpv0*inpv0 * _mm_set1_ps(norm2[0]);

            const __m128 inpv1 = _mm_movehl_ps(inp34lo,inp12lo);
            sv += inpv1*inpv1 * _mm_set1_ps(norm2[1]);

            const __m128 inpv2 = _mm_movelh_ps(inp12hi,inp34hi);
            sv += inpv2*inpv2 * _mm_set1_ps(norm2[2]);

            const __m128 inm1 = _mm_load_ps(inm) - _mm_load_ps(inms);
            const __m128 inm2 = _mm_load_ps(inm+4) - _mm_load_ps(inms+4);
            const __m128 inm3 = _mm_load_ps(inm+8) - _mm_load_ps(inms+8);
            const __m128 inm4 = _mm_load_ps(inm+12) - _mm_load_ps(inms+12);

            const __m128 inm12lo = _mm_unpacklo_ps(inm1,inm2);
            const __m128 inm34lo = _mm_unpacklo_ps(inm3,inm4);
            const __m128 inm12hi = _mm_unpackhi_ps(inm1,inm2);
            const __m128 inm34hi = _mm_unpackhi_ps(inm3,inm4);

            const __m128 inmv0 = _mm_movelh_ps(inm12lo,inm34lo);
            sv -= inmv0*inmv0 * _mm_set1_ps(norm2[0]);

            const __m128 inmv1 = _mm_movehl_ps(inm34lo,inm12lo);
            sv -= inmv1*inmv1 * _mm_set1_ps(norm2[1]);

            const __m128 inmv2 = _mm_movelh_ps(inm12hi,inm34hi);
            sv -= inmv2*inmv2 * _mm_set1_ps(norm2[2]);

            _mm_store_ps(s, sv);
          }
          // scalar epilogue for the remaining pixels
          for(; i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0; k<3; k++)
              stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
                       - (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
        }
        else inited_slide = 0;
      }
    }
  }

  // normalize and apply chroma/luma blending
  // bias a bit towards higher values for low input values:
  const __m128 weight = _mm_set_ps(1.0f, powf(d->chroma, 0.6), powf(d->chroma, 0.6), powf(d->luma, 0.6));
  const __m128 invert = _mm_sub_ps(_mm_set1_ps(1.0f), weight);
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) shared(ovoid,ivoid,roi_out,d)
#endif
  for(int j=0; j<roi_out->height; j++)
  {
    float *out = ((float *)ovoid) + 4*roi_out->width*j;
    float *in  = ((float *)ivoid) + 4*roi_out->width*j;
    for(int i=0; i<roi_out->width; i++)
    {
      // out[3] holds the accumulated weight of this pixel
      _mm_store_ps(out, _mm_add_ps(
                     _mm_mul_ps(_mm_load_ps(in), invert),
                     _mm_mul_ps(_mm_load_ps(out), _mm_div_ps(weight, _mm_set1_ps(out[3])))));
      out += 4;
      in += 4;
    }
  }
  // free shared tmp memory:
  free(Sa);
}
static void color_picker_helper_4ch_parallel(const dt_iop_buffer_dsc_t *dsc, const float *const pixel, const dt_iop_roi_t *roi, const int *const box, float *const picked_color, float *const picked_color_min, float *const picked_color_max, const dt_iop_colorspace_type_t cst_to) { const int width = roi->width; const size_t size = ((box[3] - box[1]) * (box[2] - box[0])); const float w = 1.0f / (float)size; const int numthreads = dt_get_num_threads(); float *const mean = malloc((size_t)3 * numthreads * sizeof(float)); float *const mmin = malloc((size_t)3 * numthreads * sizeof(float)); float *const mmax = malloc((size_t)3 * numthreads * sizeof(float)); for(int n = 0; n < 3 * numthreads; n++) { mean[n] = 0.0f; mmin[n] = INFINITY; mmax[n] = -INFINITY; } #ifdef _OPENMP #pragma omp parallel default(none) #endif { const int tnum = dt_get_thread_num(); float *const tmean = mean + 3 * tnum; float *const tmmin = mmin + 3 * tnum; float *const tmmax = mmax + 3 * tnum; #ifdef _OPENMP #pragma omp for schedule(static) collapse(2) #endif for(size_t j = box[1]; j < box[3]; j++) { for(size_t i = box[0]; i < box[2]; i++) { const size_t k = 4 * (width * j + i); float Lab[3] = { pixel[k], pixel[k + 1], pixel[k + 2] }; if(cst_to == iop_cs_LCh) dt_Lab_2_LCH(pixel + k, Lab); if(cst_to == iop_cs_HSL) dt_RGB_2_HSL(pixel + k, Lab); tmean[0] += w * Lab[0]; tmean[1] += w * Lab[1]; tmean[2] += w * Lab[2]; tmmin[0] = fminf(tmmin[0], Lab[0]); tmmin[1] = fminf(tmmin[1], Lab[1]); tmmin[2] = fminf(tmmin[2], Lab[2]); tmmax[0] = fmaxf(tmmax[0], Lab[0]); tmmax[1] = fmaxf(tmmax[1], Lab[1]); tmmax[2] = fmaxf(tmmax[2], Lab[2]); } } } for(int n = 0; n < numthreads; n++) { for(int k = 0; k < 3; k++) { picked_color[k] += mean[3 * n + k]; picked_color_min[k] = fminf(picked_color_min[k], mmin[3 * n + k]); picked_color_max[k] = fmaxf(picked_color_max[k], mmax[3 * n + k]); } } free(mmax); free(mmin); free(mean); }
/**
 * Accumulate per-color mean, minimum and maximum of a single-channel X-Trans
 * mosaic over the rectangle box = { x0, y0, x1, y1 }
 * (x in [box[0], box[2]), y in [box[1], box[3])).
 *
 * The color of each sample comes from FCxtrans(j, i, roi, dsc->xtrans). Every
 * thread gathers sums, extrema and sample counts into its own 3-entry slot;
 * the slots are merged serially afterwards and each sum is divided by the
 * number of samples of that color.
 *
 * Sums and extrema are folded into the existing contents of the output arrays
 * before normalisation, so the caller provides the starting values. A color
 * with no sample inside the box yields a mean of 0 (same convention as the
 * Bayer helper) instead of a division by zero. If a scratch buffer cannot be
 * allocated the outputs are left untouched.
 *
 * @param dsc              buffer descriptor; provides the xtrans layout
 * @param pixel            mosaic data, one float per pixel, row length roi->width
 * @param roi              region of interest; width is read and roi is passed to FCxtrans
 * @param box              picker rectangle, 4 ints as described above
 * @param picked_color     3 floats, receives the per-color mean
 * @param picked_color_min 3 floats, receives the per-color minimum
 * @param picked_color_max 3 floats, receives the per-color maximum
 */
static void color_picker_helper_xtrans_parallel(const dt_iop_buffer_dsc_t *const dsc,
                                                const float *const pixel, const dt_iop_roi_t *const roi,
                                                const int *const box, float *const picked_color,
                                                float *const picked_color_min,
                                                float *const picked_color_max)
{
  const int width = roi->width;
  const uint8_t(*const xtrans)[6] = (const uint8_t(*const)[6])dsc->xtrans;

  uint32_t weights[3] = { 0u, 0u, 0u };

  const int numthreads = dt_get_num_threads();

  float *const msum = malloc((size_t)3 * numthreads * sizeof(float));
  float *const mmin = malloc((size_t)3 * numthreads * sizeof(float));
  float *const mmax = malloc((size_t)3 * numthreads * sizeof(float));
  uint32_t *const cnt = malloc((size_t)3 * numthreads * sizeof(uint32_t));
  if(msum == NULL || mmin == NULL || mmax == NULL || cnt == NULL)
  {
    free(cnt);
    free(mmax);
    free(mmin);
    free(msum);
    return;
  }

  for(int n = 0; n < 3 * numthreads; n++)
  {
    msum[n] = 0.0f;
    mmin[n] = INFINITY;
    mmax[n] = -INFINITY;
    cnt[n] = 0u;
  }

#ifdef _OPENMP
#pragma omp parallel default(none)
#endif
  {
    const int tnum = dt_get_thread_num();

    float *const tsum = msum + 3 * tnum;
    float *const tmmin = mmin + 3 * tnum;
    float *const tmmax = mmax + 3 * tnum;
    uint32_t *const tcnt = cnt + 3 * tnum;

#ifdef _OPENMP
#pragma omp for schedule(static) collapse(2)
#endif
    for(size_t j = box[1]; j < box[3]; j++)
    {
      for(size_t i = box[0]; i < box[2]; i++)
      {
        const int c = FCxtrans(j, i, roi, xtrans);
        const size_t k = width * j + i;
        const float v = pixel[k];

        tsum[c] += v;
        tmmin[c] = fminf(tmmin[c], v);
        tmmax[c] = fmaxf(tmmax[c], v);
        tcnt[c]++;
      }
    }
  }

  // merge the per-thread partial results
  for(int n = 0; n < numthreads; n++)
  {
    for(int c = 0; c < 3; c++)
    {
      picked_color[c] += msum[3 * n + c];
      picked_color_min[c] = fminf(picked_color_min[c], mmin[3 * n + c]);
      picked_color_max[c] = fmaxf(picked_color_max[c], mmax[3 * n + c]);
      weights[c] += cnt[3 * n + c];
    }
  }

  free(cnt);
  free(mmax);
  free(mmin);
  free(msum);

  // and finally normalize data.
  // X-Trans RGB weighting averages to 2:5:2 for each 3x3 cell
  for(int c = 0; c < 3; c++)
  {
    // a small box may contain no sample of a given color
    picked_color[c] = weights[c] ? (picked_color[c] / (float)weights[c]) : 0.0f;
  }
}
static void color_picker_helper_bayer_parallel(const dt_iop_buffer_dsc_t *const dsc, const float *const pixel, const dt_iop_roi_t *const roi, const int *const box, float *const picked_color, float *const picked_color_min, float *const picked_color_max) { const int width = roi->width; const uint32_t filters = dsc->filters; uint32_t weights[4] = { 0u, 0u, 0u, 0u }; const int numthreads = dt_get_num_threads(); float *const msum = malloc((size_t)4 * numthreads * sizeof(float)); float *const mmin = malloc((size_t)4 * numthreads * sizeof(float)); float *const mmax = malloc((size_t)4 * numthreads * sizeof(float)); uint32_t *const cnt = malloc((size_t)4 * numthreads * sizeof(uint32_t)); for(int n = 0; n < 4 * numthreads; n++) { msum[n] = 0.0f; mmin[n] = INFINITY; mmax[n] = -INFINITY; cnt[n] = 0u; } #ifdef _OPENMP #pragma omp parallel default(none) #endif { const int tnum = dt_get_thread_num(); float *const tsum = msum + 4 * tnum; float *const tmmin = mmin + 4 * tnum; float *const tmmax = mmax + 4 * tnum; uint32_t *const tcnt = cnt + 4 * tnum; #ifdef _OPENMP #pragma omp for schedule(static) collapse(2) #endif for(size_t j = box[1]; j < box[3]; j++) { for(size_t i = box[0]; i < box[2]; i++) { const int c = FC(j + roi->y, i + roi->x, filters); const size_t k = width * j + i; const float v = pixel[k]; tsum[c] += v; tmmin[c] = fminf(tmmin[c], v); tmmax[c] = fmaxf(tmmax[c], v); tcnt[c]++; } } } for(int n = 0; n < numthreads; n++) { for(int c = 0; c < 4; c++) { picked_color[c] += msum[4 * n + c]; picked_color_min[c] = fminf(picked_color_min[c], mmin[4 * n + c]); picked_color_max[c] = fmaxf(picked_color_max[c], mmax[4 * n + c]); weights[c] += cnt[4 * n + c]; } } free(cnt); free(mmax); free(mmin); free(msum); // and finally normalize data. For bayer, there is twice as much green. for(int c = 0; c < 4; c++) { picked_color[c] = weights[c] ? (picked_color[c] / (float)weights[c]) : 0.0f; } }