Exemplo n.º 1
0
/*
 * Blur a row-major, channel-interleaved float image with a separable
 * second-order recursive filter.  The filter coefficients come from
 * compute_gauss_params(), driven by g->sigma and g->order.
 *
 * Pass 1 walks every column of `in` and writes its result to g->buf.
 * Pass 2 walks every row of g->buf and writes its result to `out`.
 * Each pass is a forward sweep followed by a backward sweep whose output is
 * added on top of the forward one.  Every sample is clamped to
 * [g->min[k], g->max[k]] for its channel k at the moment it is read.
 *
 *   g    filter description: width, height, channels, sigma, order, the
 *        per-channel min/max arrays and the scratch plane buf, which is
 *        indexed like the image (width * height * channels floats)
 *   in   source image, width * height * channels floats
 *   out  destination image, same layout as `in`
 *
 * Returns immediately, leaving `out` untouched, when any dimension is <= 0.
 */
void
dt_gaussian_blur(
    dt_gaussian_t *g,
    float    *in,
    float    *out)
{

  const int width = g->width;
  const int height = g->height;
  const int ch = g->channels;

  // An empty image has nothing to filter.  Leaving here also keeps the
  // boundary reads below in range and the per-channel arrays non-empty.
  if(width <= 0 || height <= 0 || ch <= 0) return;

  float a0, a1, a2, a3, b1, b2, coefp, coefn;

  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

  float *temp = g->buf;

  float *Labmax = g->max;
  float *Labmin = g->min;

  // vertical blur column by column
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(in,out,temp,Labmin,Labmax,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int i=0; i<width; i++)
  {
    // per-channel filter state: x* are inputs, y* are outputs;
    // p/b = previous and the one before it (forward), n/a = the same for
    // the backward sweep, c = current sample
    float xp[ch];
    float yb[ch];
    float yp[ch];
    float xc[ch];
    float yc[ch];
    float xn[ch];
    float xa[ch];
    float yn[ch];
    float ya[ch];

    // forward filter: prime the history from the first row of this column
    for(int k=0; k<ch; k++)
    {
      xp[k] = CLAMPF(in[(size_t)i * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int j=0; j<height; j++)
    {
      // size_t arithmetic: the int product overflows on very large images
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k=0; k<ch; k++)
      {
        xc[k] = CLAMPF(in[offset+k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        temp[offset+k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: prime the history from the last row of this column
    for(int k=0; k<ch; k++)
    {
      xn[k] = CLAMPF(in[((size_t)(height - 1) * width + i) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int j=height - 1; j > -1; j--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k=0; k<ch; k++)
      {
        xc[k] = CLAMPF(in[offset+k], Labmin[k], Labmax[k]);

        // uses only samples already visited by this sweep; the current
        // sample enters the history afterwards
        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        // accumulate on top of the forward result
        temp[offset+k] += yc[k];
      }
    }
  }

  // horizontal blur line by line
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(out,temp,Labmin,Labmax,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int j=0; j<height; j++)
  {
    float xp[ch];
    float yb[ch];
    float yp[ch];
    float xc[ch];
    float yc[ch];
    float xn[ch];
    float xa[ch];
    float yn[ch];
    float ya[ch];

    // forward filter: prime the history from the first pixel of this row
    for(int k=0; k<ch; k++)
    {
      xp[k] = CLAMPF(temp[(size_t)j * width * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int i=0; i<width; i++)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k=0; k<ch; k++)
      {
        xc[k] = CLAMPF(temp[offset+k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        out[offset+k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: prime the history from the last pixel of this row
    for(int k=0; k<ch; k++)
    {
      xn[k] = CLAMPF(temp[((size_t)(j + 1) * width - 1) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int i=width - 1; i > -1; i--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k=0; k<ch; k++)
      {
        xc[k] = CLAMPF(temp[offset+k], Labmin[k], Labmax[k]);

        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        // accumulate on top of the forward result
        out[offset+k] += yc[k];
      }
    }
  }
}
Exemplo n.º 2
0
/*
 * SSE version of the separable recursive blur for images with exactly four
 * interleaved float channels (asserted on g->channels).  One __m128 holds the
 * four channels of a pixel, so every filter step handles a whole pixel.
 *
 * Pass 1 walks every column of `in` and writes its result to g->buf.
 * Pass 2 walks every row of g->buf and writes its result to `out`.
 * Each pass is a forward sweep followed by a backward sweep whose output is
 * added on top of the forward one.  Every pixel is clamped to
 * [g->min, g->max] (first four entries of each) at the moment it is loaded.
 *
 *   g    filter description: width, height, channels, sigma, order,
 *        min/max and the scratch plane buf (width * height * 4 floats)
 *   in   source image, width * height * 4 floats
 *   out  destination image, same layout as `in`
 *
 * `in`, `out` and g->buf are accessed with _mm_load_ps / _mm_store_ps, the
 * aligned load/store intrinsics, so all three must be 16-byte aligned.
 *
 * Returns immediately, leaving `out` untouched, when width or height is <= 0.
 */
void
dt_gaussian_blur_4c(
    dt_gaussian_t *g,
    float    *in,
    float    *out)
{

  const int width = g->width;
  const int height = g->height;
  const int ch = 4;

  assert(g->channels == 4);

  // An empty image has nothing to filter.  Leaving here also keeps the
  // boundary loads below in range.
  if(width <= 0 || height <= 0) return;

  float a0, a1, a2, a3, b1, b2, coefp, coefn;

  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

  const __m128 Labmax = _mm_set_ps(g->max[3], g->max[2], g->max[1], g->max[0]);
  const __m128 Labmin = _mm_set_ps(g->min[3], g->min[2], g->min[1], g->min[0]);

  float *temp = g->buf;


  // vertical blur column by column
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(in,out,temp,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int i=0; i<width; i++)
  {
    // broadcast the scalar coefficients once per column, not once per pixel
    const __m128 va0 = _mm_set_ps1(a0);
    const __m128 va1 = _mm_set_ps1(a1);
    const __m128 va2 = _mm_set_ps1(a2);
    const __m128 va3 = _mm_set_ps1(a3);
    const __m128 vb1 = _mm_set_ps1(b1);
    const __m128 vb2 = _mm_set_ps1(b2);

    // forward filter: prime the history from the first row of this column
    __m128 xp = MMCLAMPPS(_mm_load_ps(in + (size_t)i * ch), Labmin, Labmax);
    __m128 yb = _mm_mul_ps(_mm_set_ps1(coefp), xp);
    __m128 yp = yb;

    for(int j=0; j<height; j++)
    {
      // size_t arithmetic: the int product overflows on very large images
      const size_t offset = ((size_t)j * width + i) * ch;

      const __m128 xc = MMCLAMPPS(_mm_load_ps(in+offset), Labmin, Labmax);

      // yc = a0*xc + (a1*xp - (b1*yp + b2*yb))
      const __m128 yc = _mm_add_ps(_mm_mul_ps(xc, va0),
                        _mm_sub_ps(_mm_mul_ps(xp, va1),
                        _mm_add_ps(_mm_mul_ps(yp, vb1), _mm_mul_ps(yb, vb2))));

      _mm_store_ps(temp+offset, yc);

      xp = xc;
      yb = yp;
      yp = yc;
    }

    // backward filter: prime the history from the last row of this column
    __m128 xn = MMCLAMPPS(_mm_load_ps(in + ((size_t)(height - 1) * width + i) * ch), Labmin, Labmax);
    __m128 xa = xn;
    __m128 yn = _mm_mul_ps(_mm_set_ps1(coefn), xn);
    __m128 ya = yn;

    for(int j=height - 1; j > -1; j--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      const __m128 xc = MMCLAMPPS(_mm_load_ps(in+offset), Labmin, Labmax);

      // yc = a2*xn + (a3*xa - (b1*yn + b2*ya)); the current pixel only
      // enters the history afterwards
      const __m128 yc = _mm_add_ps(_mm_mul_ps(xn, va2),
                        _mm_sub_ps(_mm_mul_ps(xa, va3),
                        _mm_add_ps(_mm_mul_ps(yn, vb1), _mm_mul_ps(ya, vb2))));

      xa = xn;
      xn = xc;
      ya = yn;
      yn = yc;

      // accumulate on top of the forward result
      _mm_store_ps(temp+offset, _mm_add_ps(_mm_load_ps(temp+offset), yc));
    }
  }

  // horizontal blur line by line
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(out,temp,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int j=0; j<height; j++)
  {
    // broadcast the scalar coefficients once per row, not once per pixel
    const __m128 va0 = _mm_set_ps1(a0);
    const __m128 va1 = _mm_set_ps1(a1);
    const __m128 va2 = _mm_set_ps1(a2);
    const __m128 va3 = _mm_set_ps1(a3);
    const __m128 vb1 = _mm_set_ps1(b1);
    const __m128 vb2 = _mm_set_ps1(b2);

    // forward filter: prime the history from the first pixel of this row
    __m128 xp = MMCLAMPPS(_mm_load_ps(temp + (size_t)j * width * ch), Labmin, Labmax);
    __m128 yb = _mm_mul_ps(_mm_set_ps1(coefp), xp);
    __m128 yp = yb;

    for(int i=0; i<width; i++)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      const __m128 xc = MMCLAMPPS(_mm_load_ps(temp+offset), Labmin, Labmax);

      const __m128 yc = _mm_add_ps(_mm_mul_ps(xc, va0),
                        _mm_sub_ps(_mm_mul_ps(xp, va1),
                        _mm_add_ps(_mm_mul_ps(yp, vb1), _mm_mul_ps(yb, vb2))));

      _mm_store_ps(out+offset, yc);

      xp = xc;
      yb = yp;
      yp = yc;
    }

    // backward filter: prime the history from the last pixel of this row
    __m128 xn = MMCLAMPPS(_mm_load_ps(temp + ((size_t)(j + 1) * width - 1) * ch), Labmin, Labmax);
    __m128 xa = xn;
    __m128 yn = _mm_mul_ps(_mm_set_ps1(coefn), xn);
    __m128 ya = yn;

    for(int i=width - 1; i > -1; i--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      const __m128 xc = MMCLAMPPS(_mm_load_ps(temp+offset), Labmin, Labmax);

      const __m128 yc = _mm_add_ps(_mm_mul_ps(xn, va2),
                        _mm_sub_ps(_mm_mul_ps(xa, va3),
                        _mm_add_ps(_mm_mul_ps(yn, vb1), _mm_mul_ps(ya, vb2))));

      xa = xn;
      xn = xc;
      ya = yn;
      yn = yc;

      // accumulate on top of the forward result
      _mm_store_ps(out+offset, _mm_add_ps(_mm_load_ps(out+offset), yc));
    }
  }
}
Exemplo n.º 3
0
/*
 * Blur a row-major, channel-interleaved float image with a separable
 * second-order recursive filter.  The filter coefficients come from
 * compute_gauss_params(), driven by g->sigma and g->order.
 *
 * Pass 1 walks every column of `in` and writes its result to g->buf.
 * Pass 2 walks every row of g->buf and writes its result to `out`.
 * Each pass is a forward sweep followed by a backward sweep whose output is
 * added on top of the forward one.  Every sample is clamped to
 * [g->min[k], g->max[k]] for its channel k at the moment it is read.
 *
 * The per-channel filter state lives in one heap block that is carved into
 * dt_get_num_threads() regions of 9 * channels floats; each loop iteration
 * uses the region selected by dt_get_thread_num().
 *
 *   g    filter description: width, height, channels, sigma, order, the
 *        per-channel min/max arrays and the scratch plane buf, which is
 *        indexed like the image (width * height * channels floats)
 *   in   source image, width * height * channels floats
 *   out  destination image, same layout as `in`
 *
 * Returns without writing `out` when any dimension is <= 0 or when the
 * state block cannot be allocated; the function has no way to report this.
 */
void dt_gaussian_blur(dt_gaussian_t *g, const float *const in, float *const out)
{

  const int width = g->width;
  const int height = g->height;
  const int ch = g->channels;

  // An empty image has nothing to filter.  Leaving here also keeps the
  // boundary reads below in range and the allocation size meaningful.
  if(width <= 0 || height <= 0 || ch <= 0) return;

  float a0, a1, a2, a3, b1, b2, coefp, coefn;

  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

  float *const temp = g->buf;

  float *const Labmax = g->max;
  float *const Labmin = g->min;

  // nine state vectors of `ch` floats for every thread
  float *const buf = malloc((size_t)9 * ch * dt_get_num_threads() * sizeof(float));
  if(buf == NULL) return;

// vertical blur column by column
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(a0, a1, a2, a3, b1, b2, coefp, coefn) schedule(static)
#endif
  for(int i = 0; i < width; i++)
  {
    // This thread's region.  The nine vectors are spaced `ch` floats apart
    // so that vector n occupies [n * ch, (n + 1) * ch); spacing them one
    // float apart would make them overlap for ch > 1.
    float *const state = buf + (size_t)9 * ch * dt_get_thread_num();
    float *const xp = state + (size_t)0 * ch;
    float *const yb = state + (size_t)1 * ch;
    float *const yp = state + (size_t)2 * ch;
    float *const xc = state + (size_t)3 * ch;
    float *const yc = state + (size_t)4 * ch;
    float *const xn = state + (size_t)5 * ch;
    float *const xa = state + (size_t)6 * ch;
    float *const yn = state + (size_t)7 * ch;
    float *const ya = state + (size_t)8 * ch;

    // forward filter: prime the history from the first row of this column
    for(int k = 0; k < ch; k++)
    {
      xp[k] = CLAMPF(in[(size_t)i * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int j = 0; j < height; j++)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(in[offset + k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        temp[offset + k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: prime the history from the last row of this column
    for(int k = 0; k < ch; k++)
    {
      xn[k] = CLAMPF(in[((size_t)(height - 1) * width + i) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int j = height - 1; j > -1; j--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(in[offset + k], Labmin[k], Labmax[k]);

        // uses only samples already visited by this sweep; the current
        // sample enters the history afterwards
        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        // accumulate on top of the forward result
        temp[offset + k] += yc[k];
      }
    }
  }

// horizontal blur line by line
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(a0, a1, a2, a3, b1, b2, coefp, coefn) schedule(static)
#endif
  for(int j = 0; j < height; j++)
  {
    // same non-overlapping layout as in the vertical pass
    float *const state = buf + (size_t)9 * ch * dt_get_thread_num();
    float *const xp = state + (size_t)0 * ch;
    float *const yb = state + (size_t)1 * ch;
    float *const yp = state + (size_t)2 * ch;
    float *const xc = state + (size_t)3 * ch;
    float *const yc = state + (size_t)4 * ch;
    float *const xn = state + (size_t)5 * ch;
    float *const xa = state + (size_t)6 * ch;
    float *const yn = state + (size_t)7 * ch;
    float *const ya = state + (size_t)8 * ch;

    // forward filter: prime the history from the first pixel of this row
    for(int k = 0; k < ch; k++)
    {
      xp[k] = CLAMPF(temp[(size_t)j * width * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int i = 0; i < width; i++)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(temp[offset + k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        out[offset + k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: prime the history from the last pixel of this row
    for(int k = 0; k < ch; k++)
    {
      xn[k] = CLAMPF(temp[((size_t)(j + 1) * width - 1) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int i = width - 1; i > -1; i--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(temp[offset + k], Labmin[k], Labmax[k]);

        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        // accumulate on top of the forward result
        out[offset + k] += yc[k];
      }
    }
  }

  free(buf);
}