Exemple #1
0
/**
 * Local contrast enhancement (CLAHE-style) for one region of interest.
 *
 * Pass 1 builds a per-pixel luminance map, (max + min) / 2 of the first three
 * channels, clipped by CLIP(). Pass 2 walks every row with a sliding window of
 * radius `rad`, keeps a running histogram of the luminance values inside the
 * window, clips that histogram at a limit derived from `slope`, redistributes
 * the clipped excess, and uses the pixel's position in the resulting CDF as its
 * new HSL lightness (hue and saturation are taken from the input pixel).
 *
 * The input is indexed with the output's width, i.e. both buffers are expected
 * to have identical dimensions.
 *
 * @param self     module instance (not used here)
 * @param piece    pixelpipe piece; supplies the parameters (`data`), the
 *                 channel count (`colors`) and `iscale`
 * @param ivoid    input buffer, read as float with `piece->colors` values per pixel
 * @param ovoid    output buffer, same layout as the input
 * @param roi_in   input region; its width, height and scale are used
 * @param roi_out  output region; its width and height drive the loops
 *
 * If a scratch buffer cannot be allocated the input is copied to the output
 * unchanged.
 */
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
             void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  dt_iop_rlce_data_t *data = (dt_iop_rlce_data_t *)piece->data;
  const int ch = piece->colors;

  // Allocate both scratch buffers up front so a failed allocation is handled in one place.
  const size_t npixels = (size_t)roi_out->width * roi_out->height;
  float *luminance = malloc(npixels * sizeof(float));
  const size_t destbuf_size = roi_out->width;
  float *const dest_buf = malloc(destbuf_size * sizeof(float) * dt_get_num_threads());
  if(!luminance || !dest_buf)
  {
    // Out of memory: pass the image through instead of dereferencing a null pointer.
    memcpy(ovoid, ivoid, npixels * ch * sizeof(float));
    free(dest_buf);
    free(luminance);
    return;
  }

  // PASS1: Get a luminance map of image...
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) shared(luminance)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    float *in = (float *)ivoid + (size_t)j * roi_out->width * ch;
    float *lm = luminance + (size_t)j * roi_out->width;
    for(int i = 0; i < roi_out->width; i++)
    {
      double pmax = CLIP(fmax(in[0], fmax(in[1], in[2]))); // Max value in RGB set
      double pmin = CLIP(fmin(in[0], fmin(in[1], in[2]))); // Min value in RGB set
      *lm = (pmax + pmin) / 2.0;                           // Pixel luminosity
      in += ch;
      lm++;
    }
  }


  // Params: window radius scaled to the current zoom level
  const int rad = data->radius * roi_in->scale / piece->iscale;

#define BINS (256)

  const float slope = data->slope;

// CLAHE
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) shared(luminance)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    // Vertical extent of the window for this row, clamped to the image.
    int yMin = fmax(0, j - rad);
    int yMax = fmin(roi_in->height, j + rad + 1);
    int h = yMax - yMin;

    // Columns [xMin0, xMax0) are pre-loaded; column `rad` is added by the first
    // iteration of the pixel loop below when it exists. When the radius covers
    // the whole row nothing is added later, so every column must be loaded here
    // (hence `width`, not `width - 1`, as the upper clamp).
    int xMin0 = fmax(0, 0 - rad);
    int xMax0 = fmin(roi_in->width, rad);

    // One extra bin because a luminance of exactly 1.0 rounds to index BINS.
    int hist[BINS + 1];
    int clippedhist[BINS + 1];

    // Each thread writes its result row into its own slice of dest_buf.
    float *dest = dest_buf + destbuf_size * dt_get_thread_num();

    /* initially fill histogram */
    memset(hist, 0, (BINS + 1) * sizeof(int));
    for(int yi = yMin; yi < yMax; ++yi)
      for(int xi = xMin0; xi < xMax0; ++xi)
        ++hist[ROUND_POSISTIVE(luminance[(size_t)yi * roi_in->width + xi] * (float)BINS)];

    // Destination row
    memset(dest, 0, roi_out->width * sizeof(float));
    float *ld = dest;

    for(int i = 0; i < roi_out->width; i++)
    {

      // Histogram bin of the pixel being processed.
      int v = ROUND_POSISTIVE(luminance[(size_t)j * roi_in->width + i] * (float)BINS);

      int xMin = fmax(0, i - rad);
      int xMax = i + rad + 1;
      int w = fmin(roi_in->width, xMax) - xMin;
      int n = h * w; // number of pixels inside the window

      // Maximum count allowed per bin before clipping.
      int limit = (int)(slope * n / BINS + 0.5f);

      /* remove left behind values from histogram */
      if(xMin > 0)
      {
        int xMin1 = xMin - 1;
        for(int yi = yMin; yi < yMax; ++yi)
          --hist[ROUND_POSISTIVE(luminance[(size_t)yi * roi_in->width + xMin1] * (float)BINS)];
      }

      /* add newly included values to histogram */
      if(xMax <= roi_in->width)
      {
        int xMax1 = xMax - 1;
        for(int yi = yMin; yi < yMax; ++yi)
          ++hist[ROUND_POSISTIVE(luminance[(size_t)yi * roi_in->width + xMax1] * (float)BINS)];
      }

      /* clip histogram and redistribute clipped entries */
      memcpy(clippedhist, hist, (BINS + 1) * sizeof(int));
      int ce = 0, ceb = 0;
      do
      {
        ceb = ce;
        ce = 0;
        // Cut every bin down to `limit` and collect the excess in `ce`.
        for(int b = 0; b <= BINS; b++)
        {
          int d = clippedhist[b] - limit;
          if(d > 0)
          {
            ce += d;
            clippedhist[b] = limit;
          }
        }

        // Spread the excess evenly over all bins ...
        int d = (ce / (float)(BINS + 1));
        int m = ce % (BINS + 1);
        for(int b = 0; b <= BINS; b++) clippedhist[b] += d;

        // ... and hand the remainder out at regular intervals.
        if(m != 0)
        {
          int s = BINS / (float)m;
          for(int b = 0; b <= BINS; b += s) ++clippedhist[b];
        }
      } while(ce != ceb); // repeat until the clipped excess no longer changes

      /* build cdf of clipped histogram */
      // hMin: first non-empty bin (BINS if bins 0..BINS-1 are all empty).
      int hMin = BINS;
      for(int b = 0; b < hMin; b++)
        if(clippedhist[b] != 0) hMin = b;

      int cdf = 0;
      for(int b = hMin; b <= v; b++) cdf += clippedhist[b];

      int cdfMax = cdf;
      for(int b = v + 1; b <= BINS; b++) cdfMax += clippedhist[b];

      int cdfMin = clippedhist[hMin];

      // With a single populated bin the CDF has no range and the quotient would
      // be 0/0; keep the pixel's own luminance in that case.
      const int cdfRange = cdfMax - cdfMin;
      *ld = (cdfRange != 0) ? (cdf - cdfMin) / (float)cdfRange : v / (float)BINS;

      ld++;
    }

    // Apply row: keep hue and saturation, replace lightness with the equalised value.
    float *in = ((float *)ivoid) + (size_t)j * roi_out->width * ch;
    float *out = ((float *)ovoid) + (size_t)j * roi_out->width * ch;
    for(int r = 0; r < roi_out->width; r++)
    {
      float H, S, L;
      rgb2hsl(in, &H, &S, &L);
      hsl2rgb(out, H, S, dest[r]);
      out += ch;
      in += ch;
    }
  }

  free(dest_buf);

  // Cleanup
  free(luminance);

#undef BINS
}
Exemple #2
0
/**
 * Bloom: collect the bright parts of channel 0, blur them with repeated box
 * filters and screen-blend the result back onto channel 0.
 *
 * Steps:
 *  1. copy the input to the output (so channels this function does not
 *     rewrite keep their input values),
 *  2. store `in[0] * scale` for every pixel where it exceeds `data->threshold`
 *     (all other entries stay 0),
 *  3. run BOX_ITERATIONS rounds of a horizontal followed by a vertical running
 *     mean of half-width `hr` over that buffer,
 *  4. screen-blend the blurred buffer with input channel 0 on a 0..100 scale;
 *     channels 1 and 2 are copied from the input.
 *
 * @param self     module instance (not used here)
 * @param piece    pixelpipe piece; supplies parameters, channel count, `iscale`
 *                 and the pipe's `mask_display` flag
 * @param ivoid    input buffer, float, `piece->colors` values per pixel
 * @param ovoid    output buffer, same layout
 * @param roi_in   input region; only its scale is used
 * @param roi_out  output region; its width and height size every buffer and loop
 *
 * If a scratch buffer cannot be allocated the output is left as a plain copy
 * of the input.
 */
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
             void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  dt_iop_bloom_data_t *data = (dt_iop_bloom_data_t *)piece->data;
  float *in = (float *)ivoid;
  float *out = (float *)ovoid;
  const int ch = piece->colors;
  const size_t npixels = (size_t)roi_out->width * roi_out->height;

  /* gather light by threshold */
  float *blurlightness = calloc(npixels, sizeof(float));
  memcpy(out, in, npixels * ch * sizeof(float));
  // Out of memory: the output already holds an unmodified copy of the input.
  if(!blurlightness) return;

  // Blur radius: size parameter mapped to 0..256, scaled to the current zoom, capped at 256.
  const int rad = 256.0f * (fmin(100.0f, data->size + 1.0f) / 100.0f);
  const float _r = ceilf(rad * roi_in->scale / piece->iscale);
  const int radius = MIN(256.0f, _r);

  // Gain applied to channel 0 before thresholding.
  const float scale = 1.0f / exp2f(-1.0f * (fmin(100.0f, data->strength + 1.0f) / 100.0f));

/* get the thresholded lights into buffer */
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(data, blurlightness) schedule(static)
#endif
  for(size_t k = 0; k < (size_t)roi_out->width * roi_out->height; k++)
  {
    float *inp = ((float *)ivoid) + ch * k;
    const float L = inp[0] * scale;
    if(L > data->threshold) blurlightness[k] = L;
  }


  /* horizontal blur into memchannel lightness */
  const int range = 2 * radius + 1;
  const int hr = range / 2;

  // One scanline per thread, long enough for either a row or a column.
  const size_t size = roi_out->width > roi_out->height ? roi_out->width : roi_out->height;
  float *const scanline_buf = malloc(size * dt_get_num_threads() * sizeof(float));
  if(!scanline_buf)
  {
    // Out of memory: keep the pass-through copy made above.
    free(blurlightness);
    return;
  }

  for(int iteration = 0; iteration < BOX_ITERATIONS; iteration++)
  {
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(blurlightness) schedule(static)
#endif
    for(int y = 0; y < roi_out->height; y++)
    {
      float *scanline = scanline_buf + size * dt_get_thread_num();
      float L = 0;
      int hits = 0;
      const size_t index = (size_t)y * roi_out->width;
      // Start hr pixels left of the row so the window is already filled at x == 0.
      for(int x = -hr; x < roi_out->width; x++)
      {
        int op = x - hr - 1; // sample leaving the window
        int np = x + hr;     // sample entering the window
        if(op >= 0)
        {
          L -= blurlightness[index + op];
          hits--;
        }
        if(np < roi_out->width)
        {
          L += blurlightness[index + np];
          hits++;
        }
        if(x >= 0) scanline[x] = L / hits;
      }

      for(int x = 0; x < roi_out->width; x++) blurlightness[index + x] = scanline[x];
    }

    /* vertical pass on blurlightness */
    const int opoffs = -(hr + 1) * roi_out->width;
    const int npoffs = (hr)*roi_out->width;


#ifdef _OPENMP
#pragma omp parallel for default(none) shared(blurlightness) schedule(static)
#endif
    for(int x = 0; x < roi_out->width; x++)
    {
      float *scanline = scanline_buf + size * dt_get_thread_num();
      float L = 0;
      int hits = 0;
      // `index` tracks x + y * width. It starts hr rows above the image, so it
      // wraps around as an unsigned value until y reaches 0; adding the offsets
      // below only happens when the resulting position is inside the buffer.
      size_t index = (size_t)x - hr * roi_out->width;
      for(int y = -hr; y < roi_out->height; y++)
      {
        int op = y - hr - 1;
        int np = y + hr;

        if(op >= 0)
        {
          L -= blurlightness[index + opoffs];
          hits--;
        }
        if(np < roi_out->height)
        {
          L += blurlightness[index + npoffs];
          hits++;
        }
        if(y >= 0) scanline[y] = L / hits;
        index += roi_out->width;
      }

      // Index in size_t like the rest of the function; the int product could overflow.
      for(int y = 0; y < roi_out->height; y++) blurlightness[(size_t)y * roi_out->width + x] = scanline[y];
    }
  }
  free(scanline_buf);

/* screen blend lightness with original */
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(in, out, data, blurlightness) schedule(static)
#endif
  for(size_t k = 0; k < (size_t)roi_out->width * roi_out->height; k++)
  {
    float *inp = in + ch * k;
    float *outp = out + ch * k;
    outp[0] = 100.0f - (((100.0f - inp[0]) * (100.0f - blurlightness[k])) / 100.0f); // Screen blend
    outp[1] = inp[1];
    outp[2] = inp[2];
  }

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);

  free(blurlightness);
}
Exemple #3
0
/**
 * Blur an interleaved float image with a recursive filter whose coefficients
 * come from compute_gauss_params(g->sigma, g->order, ...).
 *
 * The image is filtered column by column from `in` into the scratch buffer
 * `g->buf`, then row by row from that buffer into `out`. Each direction runs a
 * forward and a backward recursion and adds the two results. Every sample is
 * clamped to [g->min[k], g->max[k]] for its channel k before it enters a
 * filter.
 *
 * @param g    blur descriptor: width, height, channels, sigma, order, the
 *             per-channel clamp bounds `min`/`max`, and the scratch buffer `buf`
 * @param in   input image, width * height * channels floats
 * @param out  output image, same size
 *
 * If the per-thread state buffer cannot be allocated the function returns
 * without writing to `out`.
 */
void dt_gaussian_blur(dt_gaussian_t *g, const float *const in, float *const out)
{

  const int width = g->width;
  const int height = g->height;
  const int ch = g->channels;

  float a0, a1, a2, a3, b1, b2, coefp, coefn;

  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

  float *const temp = g->buf;

  float *const Labmax = g->max;
  float *const Labmin = g->min;

  // Filter state: nine vectors of `ch` floats for every thread.
  float *const buf = malloc((size_t)9 * ch * dt_get_num_threads() * sizeof(float));
  if(!buf) return;

// vertical blur column by column
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(a0, a1, a2, a3, b1, b2, coefp, coefn) schedule(static)
#endif
  for(int i = 0; i < width; i++)
  {
    const int threadnum = dt_get_thread_num();
    // Each state vector holds `ch` floats, so consecutive vectors must be `ch`
    // floats apart; a stride of one float would make them overlap for ch > 1.
    float *const state = buf + (size_t)9 * ch * threadnum;
    float *xp = state + 0 * ch;
    float *yb = state + 1 * ch;
    float *yp = state + 2 * ch;
    float *xc = state + 3 * ch;
    float *yc = state + 4 * ch;
    float *xn = state + 5 * ch;
    float *xa = state + 6 * ch;
    float *yn = state + 7 * ch;
    float *ya = state + 8 * ch;

    // forward filter: seed the recursion with the first sample of the column
    for(int k = 0; k < ch; k++)
    {
      xp[k] = CLAMPF(in[(size_t)i * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int j = 0; j < height; j++)
    {
      size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(in[offset + k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        temp[offset + k] = yc[k];

        // shift the history: current becomes previous
        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: seed with the last sample of the column
    for(int k = 0; k < ch; k++)
    {
      xn[k] = CLAMPF(in[((size_t)(height - 1) * width + i) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int j = height - 1; j > -1; j--)
    {
      size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(in[offset + k], Labmin[k], Labmax[k]);

        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        // add the backward response to the forward one stored above
        temp[offset + k] += yc[k];
      }
    }
  }

// horizontal blur line by line
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(a0, a1, a2, a3, b1, b2, coefp, coefn) schedule(static)
#endif
  for(int j = 0; j < height; j++)
  {
    const int threadnum = dt_get_thread_num();
    // Same layout as in the vertical pass: nine vectors, `ch` floats apart.
    float *const state = buf + (size_t)9 * ch * threadnum;
    float *xp = state + 0 * ch;
    float *yb = state + 1 * ch;
    float *yp = state + 2 * ch;
    float *xc = state + 3 * ch;
    float *yc = state + 4 * ch;
    float *xn = state + 5 * ch;
    float *xa = state + 6 * ch;
    float *yn = state + 7 * ch;
    float *ya = state + 8 * ch;


    // forward filter: seed with the first sample of the row
    for(int k = 0; k < ch; k++)
    {
      xp[k] = CLAMPF(temp[(size_t)j * width * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int i = 0; i < width; i++)
    {
      size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(temp[offset + k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        out[offset + k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: seed with the last sample of the row
    for(int k = 0; k < ch; k++)
    {
      xn[k] = CLAMPF(temp[((size_t)(j + 1) * width - 1) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int i = width - 1; i > -1; i--)
    {
      size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(temp[offset + k], Labmin[k], Labmax[k]);

        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        out[offset + k] += yc[k];
      }
    }
  }

  free(buf);
}
Exemple #4
0
/**
 * process, all real work is done here.
 *
 * For every shift vector (kj, ki) in [-K, K] x [-K, K] the function
 *  - builds, per thread, a row buffer S holding for each column the sum over a
 *    vertical stripe of rows of the normalised squared difference between the
 *    image and its copy shifted by (kj, ki),
 *  - slides a horizontal window over S to obtain a patch distance `slide`,
 *  - adds the shifted input pixel (with 1.0 in the fourth lane) weighted by
 *    gh(slide) to the output pixel.
 * Afterwards each output pixel is divided by its accumulated weight (fourth
 * lane) and blended with the input using weights derived from d->luma and
 * d->chroma.
 *
 * Buffers are addressed as 4 floats per pixel. Input and output are indexed
 * with roi_in->width and roi_out->width respectively.
 *
 * NOTE(review): gh() is defined elsewhere; its exact weighting curve is not
 * visible here.
 */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  // this is called for preview and full pipe separately, each with its own pixelpipe piece.
  // get our data struct:
  dt_iop_nlmeans_params_t *d = (dt_iop_nlmeans_params_t *)piece->data;

  // adjust to zoom size:
  const int P = ceilf(3 * roi_in->scale / piece->iscale); // pixel filter size
  const int K = ceilf(7 * roi_in->scale / piece->iscale); // nbhood
  if(P <= 1)
  {
    // nothing to do from this distance:
    memcpy (ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);
    return;
  }

  // adjust to Lab, make L more important
  // float max_L = 100.0f, max_C = 256.0f;
  // float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
  float max_L = 120.0f, max_C = 512.0f;
  float nL = 1.0f/max_L, nC = 1.0f/max_C;
  // per-channel factors applied to the squared differences (only entries 0..2 are read below)
  const float norm2[4] = { nL*nL, nC*nC, nC*nC, 1.0f };

  // one row of column sums per thread
  float *Sa = dt_alloc_align(64, sizeof(float)*roi_out->width*dt_get_num_threads());
  // we want to sum up weights in col[3], so need to init to 0:
  memset(ovoid, 0x0, sizeof(float)*roi_out->width*roi_out->height*4);

  // for each shift vector
  for(int kj=-K;kj<=K;kj++)
  {
    for(int ki=-K;ki<=K;ki++)
    {
      // 0 while S has to be rebuilt from scratch for the next row; each thread gets its own copy
      int inited_slide = 0;
      // don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else we will add up errors)
      // do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more memory
#ifdef _OPENMP
#  pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, roi_out, roi_in, ivoid, ovoid, Sa)
#endif
      for(int j=0; j<roi_out->height; j++)
      {
        // skip rows whose shifted counterpart lies outside the image
        if(j+kj < 0 || j+kj >= roi_out->height) continue;
        float *S = Sa + dt_get_thread_num() * roi_out->width;
        // shifted source row; only dereferenced where i+ki is inside the row (see below)
        const float *ins = ((float *)ivoid) + 4*(roi_in->width *(j+kj) + ki);
        float *out = ((float *)ovoid) + 4*roi_out->width*j;

        // number of rows above (Pm) and below (PM) row j that fit inside the image
        // for both the original and the shifted position, capped at P
        const int Pm = MIN(MIN(P, j+kj), j);
        const int PM = MIN(MIN(P, roi_out->height-1-j-kj), roi_out->height-1-j);
        // first line of every thread
        // TODO: also every once in a while to assert numerical precision!
        if(!inited_slide)
        {
          // sum up a line
          memset(S, 0x0, sizeof(float)*roi_out->width);
          for(int jj=-Pm;jj<=PM;jj++)
          {
            // restrict the columns to those whose shifted position i+ki is inside the row
            int i = MAX(0, -ki);
            float *s = S + i;
            const float *inp  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+jj);
            const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+jj+kj) + ki);
            const int last = roi_out->width + MIN(0, -ki);
            for(; i<last; i++, inp+=4, inps+=4, s++)
            {
              for(int k=0;k<3;k++)
                s[0] += (inp[k] - inps[k])*(inp[k] - inps[k]) * norm2[k];
            }
          }
          // only reuse this if we had a full stripe
          if(Pm == P && PM == P) inited_slide = 1;
        }

        // sliding window for this line:
        float *s = S;
        float slide = 0.0f;
        // sum up the first -P..P
        // NOTE(review): this reads S[0..2*P] unconditionally, which assumes
        // roi_out->width >= 2*P+1 — confirm for very small regions.
        for(int i=0;i<2*P+1;i++) slide += s[i];
        for(int i=0; i<roi_out->width; i++)
        {
          // advance the window only while it fits completely inside the row
          if(i-P > 0 && i+P<roi_out->width)
            slide += s[P] - s[-P-1];
          if(i+ki >= 0 && i+ki < roi_out->width)
          {
            // accumulate weighted colour in lanes 0..2 and the weight itself in lane 3
            const __m128 iv = { ins[0], ins[1], ins[2], 1.0f };
            _mm_store_ps(out, _mm_load_ps(out) + iv * _mm_set1_ps(gh(slide)));
          }
          s   ++;
          ins += 4;
          out += 4;
        }
        if(inited_slide && j+P+1+MAX(0,kj) < roi_out->height)
        {
          // sliding window in j direction:
          // update S for the next row by adding row j+P+1 and removing row j-P
          int i = MAX(0, -ki);
          float *s = S + i;
          const float *inp  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j+P+1);
          const float *inps = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j+P+1+kj) + ki);
          const float *inm  = ((float *)ivoid) + 4*i + 4* roi_in->width *(j-P);
          const float *inms = ((float *)ivoid) + 4*i + 4*(roi_in->width *(j-P+kj) + ki);
          const int last = roi_out->width + MIN(0, -ki);
          // scalar steps until s is 16-byte aligned, as required by _mm_load_ps/_mm_store_ps below
          for(; ((unsigned long)s & 0xf) != 0 && i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0;k<3;k++)
              stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
                    -  (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
          /* Process most of the line 4 pixels at a time */
          for(; i<last-4; i+=4, inp+=16, inps+=16, inm+=16, inms+=16, s+=4)
          {
            __m128 sv = _mm_load_ps(s);
            // differences of four consecutive pixels for the row entering the stripe
            const __m128 inp1 = _mm_load_ps(inp)    - _mm_load_ps(inps);
            const __m128 inp2 = _mm_load_ps(inp+4)  - _mm_load_ps(inps+4);
            const __m128 inp3 = _mm_load_ps(inp+8)  - _mm_load_ps(inps+8);
            const __m128 inp4 = _mm_load_ps(inp+12) - _mm_load_ps(inps+12);

            // transpose so that each vector below holds one channel of the four pixels
            const __m128 inp12lo = _mm_unpacklo_ps(inp1,inp2);
            const __m128 inp34lo = _mm_unpacklo_ps(inp3,inp4);
            const __m128 inp12hi = _mm_unpackhi_ps(inp1,inp2);
            const __m128 inp34hi = _mm_unpackhi_ps(inp3,inp4);

            // channel 0
            const __m128 inpv0 = _mm_movelh_ps(inp12lo,inp34lo);
            sv += inpv0*inpv0 * _mm_set1_ps(norm2[0]);

            // channel 1
            const __m128 inpv1 = _mm_movehl_ps(inp34lo,inp12lo);
            sv += inpv1*inpv1 * _mm_set1_ps(norm2[1]);

            // channel 2
            const __m128 inpv2 = _mm_movelh_ps(inp12hi,inp34hi);
            sv += inpv2*inpv2 * _mm_set1_ps(norm2[2]);

            // same again for the row leaving the stripe, subtracted instead of added
            const __m128 inm1 = _mm_load_ps(inm)    - _mm_load_ps(inms);
            const __m128 inm2 = _mm_load_ps(inm+4)  - _mm_load_ps(inms+4);
            const __m128 inm3 = _mm_load_ps(inm+8)  - _mm_load_ps(inms+8);
            const __m128 inm4 = _mm_load_ps(inm+12) - _mm_load_ps(inms+12);

            const __m128 inm12lo = _mm_unpacklo_ps(inm1,inm2);
            const __m128 inm34lo = _mm_unpacklo_ps(inm3,inm4);
            const __m128 inm12hi = _mm_unpackhi_ps(inm1,inm2);
            const __m128 inm34hi = _mm_unpackhi_ps(inm3,inm4);

            const __m128 inmv0 = _mm_movelh_ps(inm12lo,inm34lo);
            sv -= inmv0*inmv0 * _mm_set1_ps(norm2[0]);

            const __m128 inmv1 = _mm_movehl_ps(inm34lo,inm12lo);
            sv -= inmv1*inmv1 * _mm_set1_ps(norm2[1]);

            const __m128 inmv2 = _mm_movelh_ps(inm12hi,inm34hi);
            sv -= inmv2*inmv2 * _mm_set1_ps(norm2[2]);

            _mm_store_ps(s, sv);
          }
          // scalar tail for the remaining columns
          for(; i<last; i++, inp+=4, inps+=4, inm+=4, inms+=4, s++)
          {
            float stmp = s[0];
            for(int k=0;k<3;k++)
              stmp += ((inp[k] - inps[k])*(inp[k] - inps[k])
                    -  (inm[k] - inms[k])*(inm[k] - inms[k])) * norm2[k];
            s[0] = stmp;
          }
        }
        // no full stripe available for the next row: rebuild S from scratch there
        else inited_slide = 0;
      }
    }
  }
  // normalize and apply chroma/luma blending
  // bias a bit towards higher values for low input values:
  // lane 0 carries the luma weight, lanes 1 and 2 the chroma weight, lane 3 is 1
  const __m128 weight = _mm_set_ps(1.0f, powf(d->chroma, 0.6), powf(d->chroma, 0.6), powf(d->luma, 0.6));
  const __m128 invert = _mm_sub_ps(_mm_set1_ps(1.0f), weight);
#ifdef _OPENMP
  #pragma omp parallel for default(none) schedule(static) shared(ovoid,ivoid,roi_out,d)
#endif
  for(int j=0; j<roi_out->height; j++)
  {
    float *out = ((float *)ovoid) + 4*roi_out->width*j;
    float *in  = ((float *)ivoid) + 4*roi_out->width*j;
    for(int i=0; i<roi_out->width; i++)
    {
      // out = in * (1 - weight) + (accumulated / total weight) * weight
      _mm_store_ps(out, _mm_add_ps(
          _mm_mul_ps(_mm_load_ps(in),  invert),
          _mm_mul_ps(_mm_load_ps(out), _mm_div_ps(weight, _mm_set1_ps(out[3])))));
      out += 4;
      in  += 4;
    }
  }
  // free shared tmp memory:
  // NOTE(review): Sa comes from dt_alloc_align() but is released with free() —
  // confirm that this pairing is valid on every supported platform.
  free(Sa);
}
Exemple #5
0
/**
 * Accumulate mean, minimum and maximum of the first three channels over a
 * rectangular box of a 4-floats-per-pixel image, using all OpenMP threads.
 *
 * Each thread gathers its own partial mean/min/max; the partial results are
 * merged serially afterwards. When `cst_to` is iop_cs_LCh or iop_cs_HSL the
 * pixel is converted with dt_Lab_2_LCH() / dt_RGB_2_HSL() before it is
 * accumulated; otherwise the raw channel values are used.
 *
 * @param dsc               buffer descriptor (not used here)
 * @param pixel             image data, 4 floats per pixel, row length roi->width
 * @param roi               region of interest; only its width is read
 * @param box               { x0, y0, x1, y1 }: columns [box[0], box[2]) and
 *                          rows [box[1], box[3]) are sampled
 * @param picked_color      the box mean is ADDED to these three values
 * @param picked_color_min  lowered to the box minimum where that is smaller
 * @param picked_color_max  raised to the box maximum where that is larger
 * @param cst_to            selects the optional colour-space conversion
 *
 * The three result arrays are updated, not overwritten, so the caller has to
 * initialise them. If the scratch memory cannot be allocated they are left
 * untouched.
 */
static void color_picker_helper_4ch_parallel(const dt_iop_buffer_dsc_t *dsc, const float *const pixel,
                                             const dt_iop_roi_t *roi, const int *const box,
                                             float *const picked_color, float *const picked_color_min,
                                             float *const picked_color_max, const dt_iop_colorspace_type_t cst_to)
{
  const int width = roi->width;

  // number of pixels in the box
  const size_t size = ((box[3] - box[1]) * (box[2] - box[0]));

  // every sample is pre-scaled so that the per-thread sums add up to the mean
  const float w = 1.0f / (float)size;

  const int numthreads = dt_get_num_threads();

  // three accumulators (one per channel) for every thread
  float *const mean = malloc((size_t)3 * numthreads * sizeof(float));
  float *const mmin = malloc((size_t)3 * numthreads * sizeof(float));
  float *const mmax = malloc((size_t)3 * numthreads * sizeof(float));
  if(!mean || !mmin || !mmax)
  {
    // Out of memory: report nothing rather than dereferencing a null pointer.
    free(mmax);
    free(mmin);
    free(mean);
    return;
  }

  for(int n = 0; n < 3 * numthreads; n++)
  {
    mean[n] = 0.0f;
    mmin[n] = INFINITY;
    mmax[n] = -INFINITY;
  }

#ifdef _OPENMP
#pragma omp parallel default(none)
#endif
  {
    const int tnum = dt_get_thread_num();

    // this thread's private slice of the accumulators
    float *const tmean = mean + 3 * tnum;
    float *const tmmin = mmin + 3 * tnum;
    float *const tmmax = mmax + 3 * tnum;

#ifdef _OPENMP
#pragma omp for schedule(static) collapse(2)
#endif
    for(size_t j = box[1]; j < box[3]; j++)
    {
      for(size_t i = box[0]; i < box[2]; i++)
      {
        const size_t k = 4 * (width * j + i);
        float Lab[3] = { pixel[k], pixel[k + 1], pixel[k + 2] };
        if(cst_to == iop_cs_LCh) dt_Lab_2_LCH(pixel + k, Lab);
        if(cst_to == iop_cs_HSL) dt_RGB_2_HSL(pixel + k, Lab);
        tmean[0] += w * Lab[0];
        tmean[1] += w * Lab[1];
        tmean[2] += w * Lab[2];
        tmmin[0] = fminf(tmmin[0], Lab[0]);
        tmmin[1] = fminf(tmmin[1], Lab[1]);
        tmmin[2] = fminf(tmmin[2], Lab[2]);
        tmmax[0] = fmaxf(tmmax[0], Lab[0]);
        tmmax[1] = fmaxf(tmmax[1], Lab[1]);
        tmmax[2] = fmaxf(tmmax[2], Lab[2]);
      }
    }
  }

  // merge the per-thread partial results into the caller's arrays
  for(int n = 0; n < numthreads; n++)
  {
    for(int k = 0; k < 3; k++)
    {
      picked_color[k] += mean[3 * n + k];
      picked_color_min[k] = fminf(picked_color_min[k], mmin[3 * n + k]);
      picked_color_max[k] = fmaxf(picked_color_max[k], mmax[3 * n + k]);
    }
  }

  free(mmax);
  free(mmin);
  free(mean);
}
Exemple #6
0
/**
 * Accumulate per-colour mean, minimum and maximum over a rectangular box of a
 * single-channel X-Trans mosaic image, using all OpenMP threads.
 *
 * The colour index (0..2) of every sample is obtained from FCxtrans(). Each
 * thread keeps its own sum, min, max and sample count per colour; the partial
 * results are merged serially and the sums are divided by the sample counts.
 *
 * @param dsc               buffer descriptor; supplies the X-Trans layout
 * @param pixel             image data, 1 float per pixel, row length roi->width
 * @param roi               region of interest; its width is read here and the
 *                          whole struct is passed on to FCxtrans()
 * @param box               { x0, y0, x1, y1 }: columns [box[0], box[2]) and
 *                          rows [box[1], box[3]) are sampled
 * @param picked_color      receives (caller's value + colour sum) / colour count,
 *                          or 0 for a colour with no samples in the box
 * @param picked_color_min  lowered to the box minimum where that is smaller
 * @param picked_color_max  raised to the box maximum where that is larger
 *
 * The min/max arrays are updated, not overwritten, so the caller has to
 * initialise them. If the scratch memory cannot be allocated all three result
 * arrays are left untouched.
 */
static void color_picker_helper_xtrans_parallel(const dt_iop_buffer_dsc_t *const dsc, const float *const pixel,
                                                const dt_iop_roi_t *const roi, const int *const box,
                                                float *const picked_color, float *const picked_color_min,
                                                float *const picked_color_max)
{
  const int width = roi->width;
  const uint8_t(*const xtrans)[6] = (const uint8_t(*const)[6])dsc->xtrans;

  // number of samples seen per colour
  uint32_t weights[3] = { 0u, 0u, 0u };

  const int numthreads = dt_get_num_threads();

  // three accumulators (one per colour) for every thread
  float *const msum = malloc((size_t)3 * numthreads * sizeof(float));
  float *const mmin = malloc((size_t)3 * numthreads * sizeof(float));
  float *const mmax = malloc((size_t)3 * numthreads * sizeof(float));
  uint32_t *const cnt = malloc((size_t)3 * numthreads * sizeof(uint32_t));
  if(!msum || !mmin || !mmax || !cnt)
  {
    // Out of memory: report nothing rather than dereferencing a null pointer.
    free(cnt);
    free(mmax);
    free(mmin);
    free(msum);
    return;
  }

  for(int n = 0; n < 3 * numthreads; n++)
  {
    msum[n] = 0.0f;
    mmin[n] = INFINITY;
    mmax[n] = -INFINITY;
    cnt[n] = 0u;
  }

#ifdef _OPENMP
#pragma omp parallel default(none)
#endif
  {
    const int tnum = dt_get_thread_num();

    // this thread's private slice of the accumulators
    float *const tsum = msum + 3 * tnum;
    float *const tmmin = mmin + 3 * tnum;
    float *const tmmax = mmax + 3 * tnum;
    uint32_t *const tcnt = cnt + 3 * tnum;

#ifdef _OPENMP
#pragma omp for schedule(static) collapse(2)
#endif
    for(size_t j = box[1]; j < box[3]; j++)
    {
      for(size_t i = box[0]; i < box[2]; i++)
      {
        const int c = FCxtrans(j, i, roi, xtrans);
        const size_t k = width * j + i;

        const float v = pixel[k];

        tsum[c] += v;
        tmmin[c] = fminf(tmmin[c], v);
        tmmax[c] = fmaxf(tmmax[c], v);
        tcnt[c]++;
      }
    }
  }

  // merge the per-thread partial results
  for(int n = 0; n < numthreads; n++)
  {
    for(int c = 0; c < 3; c++)
    {
      picked_color[c] += msum[3 * n + c];
      picked_color_min[c] = fminf(picked_color_min[c], mmin[3 * n + c]);
      picked_color_max[c] = fmaxf(picked_color_max[c], mmax[3 * n + c]);
      weights[c] += cnt[3 * n + c];
    }
  }

  free(cnt);
  free(mmax);
  free(mmin);
  free(msum);

  // and finally normalize data.
  // X-Trans RGB weighting averages to 2:5:2 for each 3x3 cell
  // A colour that never occurs in the box (empty or very small box) has a zero
  // count; report 0 for it instead of dividing by zero, as the Bayer helper does.
  for(int c = 0; c < 3; c++)
  {
    picked_color[c] = weights[c] ? (picked_color[c] / (float)weights[c]) : 0.0f;
  }
}
Exemple #7
0
/**
 * Accumulate per-colour mean, minimum and maximum over a rectangular box of a
 * single-channel Bayer mosaic image, using all OpenMP threads.
 *
 * The colour index (0..3) of every sample is obtained from
 * FC(row + roi->y, column + roi->x, filters). Each thread keeps its own sum,
 * min, max and sample count per colour; the partial results are merged
 * serially and the sums are divided by the sample counts.
 *
 * @param dsc               buffer descriptor; supplies the `filters` pattern
 * @param pixel             image data, 1 float per pixel, row length roi->width
 * @param roi               region of interest; width, x and y are read
 * @param box               { x0, y0, x1, y1 }: columns [box[0], box[2]) and
 *                          rows [box[1], box[3]) are sampled
 * @param picked_color      receives (caller's value + colour sum) / colour count,
 *                          or 0 for a colour with no samples in the box
 * @param picked_color_min  lowered to the box minimum where that is smaller
 * @param picked_color_max  raised to the box maximum where that is larger
 *
 * The min/max arrays are updated, not overwritten, so the caller has to
 * initialise them. If the scratch memory cannot be allocated all three result
 * arrays are left untouched.
 */
static void color_picker_helper_bayer_parallel(const dt_iop_buffer_dsc_t *const dsc, const float *const pixel,
                                               const dt_iop_roi_t *const roi, const int *const box,
                                               float *const picked_color, float *const picked_color_min,
                                               float *const picked_color_max)
{
  const int width = roi->width;
  const uint32_t filters = dsc->filters;

  // number of samples seen per colour
  uint32_t weights[4] = { 0u, 0u, 0u, 0u };

  const int numthreads = dt_get_num_threads();

  // four accumulators (one per colour index) for every thread
  float *const msum = malloc((size_t)4 * numthreads * sizeof(float));
  float *const mmin = malloc((size_t)4 * numthreads * sizeof(float));
  float *const mmax = malloc((size_t)4 * numthreads * sizeof(float));
  uint32_t *const cnt = malloc((size_t)4 * numthreads * sizeof(uint32_t));
  if(!msum || !mmin || !mmax || !cnt)
  {
    // Out of memory: report nothing rather than dereferencing a null pointer.
    free(cnt);
    free(mmax);
    free(mmin);
    free(msum);
    return;
  }

  for(int n = 0; n < 4 * numthreads; n++)
  {
    msum[n] = 0.0f;
    mmin[n] = INFINITY;
    mmax[n] = -INFINITY;
    cnt[n] = 0u;
  }

#ifdef _OPENMP
#pragma omp parallel default(none)
#endif
  {
    const int tnum = dt_get_thread_num();

    // this thread's private slice of the accumulators
    float *const tsum = msum + 4 * tnum;
    float *const tmmin = mmin + 4 * tnum;
    float *const tmmax = mmax + 4 * tnum;
    uint32_t *const tcnt = cnt + 4 * tnum;

#ifdef _OPENMP
#pragma omp for schedule(static) collapse(2)
#endif
    for(size_t j = box[1]; j < box[3]; j++)
    {
      for(size_t i = box[0]; i < box[2]; i++)
      {
        // the filter pattern is addressed in full-image coordinates, hence the roi offset
        const int c = FC(j + roi->y, i + roi->x, filters);
        const size_t k = width * j + i;

        const float v = pixel[k];

        tsum[c] += v;
        tmmin[c] = fminf(tmmin[c], v);
        tmmax[c] = fmaxf(tmmax[c], v);
        tcnt[c]++;
      }
    }
  }

  // merge the per-thread partial results
  for(int n = 0; n < numthreads; n++)
  {
    for(int c = 0; c < 4; c++)
    {
      picked_color[c] += msum[4 * n + c];
      picked_color_min[c] = fminf(picked_color_min[c], mmin[4 * n + c]);
      picked_color_max[c] = fmaxf(picked_color_max[c], mmax[4 * n + c]);
      weights[c] += cnt[4 * n + c];
    }
  }

  free(cnt);
  free(mmax);
  free(mmin);
  free(msum);

  // and finally normalize data. For bayer, there is twice as much green.
  // A colour index without samples yields 0 instead of a division by zero.
  for(int c = 0; c < 4; c++)
  {
    picked_color[c] = weights[c] ? (picked_color[c] / (float)weights[c]) : 0.0f;
  }
}