// Blur an interleaved float image with a recursive gaussian filter.
//
// The image is g->width x g->height pixels with g->channels floats per pixel,
// stored row by row. Filter coefficients are obtained from
// compute_gauss_params(g->sigma, g->order, ...). Every sample is passed through
// CLAMPF with the per-channel bounds g->min[k] / g->max[k] before it enters
// the filter.
//
// g   : blur descriptor; g->buf is used as intermediate storage and must hold
//       at least width * height * channels floats
// in  : source image
// out : destination image, same layout and size as in
//
// Two passes are run: the columns of `in` are filtered into g->buf, then the
// rows of g->buf are filtered into `out`. Each pass runs a forward and a
// backward recursion over the line and stores the sum of both.
//
// Nothing is done when width, height or channels is not positive.
void dt_gaussian_blur(
  dt_gaussian_t *g,
  float *in,
  float *out)
{
  const int width = g->width;
  const int height = g->height;
  const int ch = g->channels;

  // An empty image has no boundary sample to seed the filter with, and a
  // non-positive channel count would give the arrays below an invalid size.
  if(width <= 0 || height <= 0 || ch <= 0) return;

  float a0, a1, a2, a3, b1, b2, coefp, coefn;

  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

  float *temp = g->buf;

  float *Labmax = g->max;
  float *Labmin = g->min;

  // vertical blur column by column
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(in,out,temp,Labmin,Labmax,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int i=0; i<width; i++)
  {
    // Per-channel filter state:
    //   xp / yp : previous input / output,  yb : output before yp
    //   xc / yc : current input / output
    //   xn / yn : next input / output,      xa / ya : the ones after those
    float xp[ch];
    float yb[ch];
    float yp[ch];
    float xc[ch];
    float yc[ch];
    float xn[ch];
    float xa[ch];
    float yn[ch];
    float ya[ch];

    // forward filter: seed the history from the first sample of the column
    for(int k=0; k<ch; k++)
    {
      xp[k] = CLAMPF(in[(size_t)i*ch+k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int j=0; j<height; j++)
    {
      // offsets are computed in size_t so that large images cannot overflow int
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k=0; k<ch; k++)
      {
        xc[k] = CLAMPF(in[offset+k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        temp[offset+k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: seed the history from the last sample of the column
    for(int k=0; k<ch; k++)
    {
      xn[k] = CLAMPF(in[((size_t)(height - 1) * width + i)*ch+k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int j=height - 1; j > -1; j--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k=0; k<ch; k++)
      {
        xc[k] = CLAMPF(in[offset+k], Labmin[k], Labmax[k]);

        // the backward output depends only on samples already passed
        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        // add to the forward result stored above
        temp[offset+k] += yc[k];
      }
    }
  }

  // horizontal blur line by line
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(out,temp,Labmin,Labmax,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int j=0; j<height; j++)
  {
    // same per-channel filter state as in the vertical pass
    float xp[ch];
    float yb[ch];
    float yp[ch];
    float xc[ch];
    float yc[ch];
    float xn[ch];
    float xa[ch];
    float yn[ch];
    float ya[ch];

    // forward filter: seed the history from the first sample of the row
    for(int k=0; k<ch; k++)
    {
      xp[k] = CLAMPF(temp[(size_t)j*width*ch+k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int i=0; i<width; i++)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k=0; k<ch; k++)
      {
        xc[k] = CLAMPF(temp[offset+k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        out[offset+k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: seed the history from the last sample of the row
    for(int k=0; k<ch; k++)
    {
      xn[k] = CLAMPF(temp[((size_t)(j + 1)*width - 1)*ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int i=width - 1; i > -1; i--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k=0; k<ch; k++)
      {
        xc[k] = CLAMPF(temp[offset+k], Labmin[k], Labmax[k]);

        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        // add to the forward result stored above
        out[offset+k] += yc[k];
      }
    }
  }
}
// SSE variant of the recursive gaussian blur for images with exactly four
// floats per pixel; all four channels of a pixel are filtered in one __m128.
//
// The image is g->width x g->height pixels, stored row by row. Filter
// coefficients are obtained from compute_gauss_params(g->sigma, g->order, ...).
// Every pixel is passed through MMCLAMPPS with the bounds taken from
// g->min[0..3] / g->max[0..3] before it enters the filter.
//
// g   : blur descriptor; g->channels must be 4 (asserted). g->buf is used as
//       intermediate storage and must hold at least width * height * 4 floats
// in  : source image
// out : destination image, same layout and size as in
//
// in, out and g->buf are accessed with _mm_load_ps / _mm_store_ps, i.e. the
// aligned load/store intrinsics, so all three must be 16-byte aligned.
//
// Two passes are run: the columns of `in` are filtered into g->buf, then the
// rows of g->buf are filtered into `out`. Each pass runs a forward and a
// backward recursion over the line and stores the sum of both.
//
// Nothing is done when width or height is not positive.
void dt_gaussian_blur_4c(
  dt_gaussian_t *g,
  float *in,
  float *out)
{
  const int width = g->width;
  const int height = g->height;
  const int ch = 4;

  assert(g->channels == 4);

  // an empty image has no boundary pixel to seed the filter with
  if(width <= 0 || height <= 0) return;

  float a0, a1, a2, a3, b1, b2, coefp, coefn;

  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

  const __m128 Labmax = _mm_set_ps(g->max[3], g->max[2], g->max[1], g->max[0]);
  const __m128 Labmin = _mm_set_ps(g->min[3], g->min[2], g->min[1], g->min[0]);

  float *temp = g->buf;

  // vertical blur column by column
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(in,out,temp,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int i=0; i<width; i++)
  {
    // scalar coefficients broadcast to all four lanes
    const __m128 va0 = _mm_set_ps1(a0);
    const __m128 va1 = _mm_set_ps1(a1);
    const __m128 va2 = _mm_set_ps1(a2);
    const __m128 va3 = _mm_set_ps1(a3);
    const __m128 vb1 = _mm_set_ps1(b1);
    const __m128 vb2 = _mm_set_ps1(b2);

    // forward filter: seed the history from the first pixel of the column
    // (xp / yp: previous input / output, yb: output before yp)
    __m128 xp = MMCLAMPPS(_mm_load_ps(in + (size_t)i * ch), Labmin, Labmax);
    __m128 yb = _mm_mul_ps(_mm_set_ps1(coefp), xp);
    __m128 yp = yb;

    for(int j=0; j<height; j++)
    {
      // offsets are computed in size_t so that large images cannot overflow int
      const size_t offset = ((size_t)j * width + i) * ch;

      const __m128 xc = MMCLAMPPS(_mm_load_ps(in + offset), Labmin, Labmax);

      // yc = a0*xc + (a1*xp - (b1*yp + b2*yb))
      const __m128 yc = _mm_add_ps(_mm_mul_ps(xc, va0),
                                   _mm_sub_ps(_mm_mul_ps(xp, va1),
                                              _mm_add_ps(_mm_mul_ps(yp, vb1),
                                                         _mm_mul_ps(yb, vb2))));

      _mm_store_ps(temp + offset, yc);

      xp = xc;
      yb = yp;
      yp = yc;
    }

    // backward filter: seed the history from the last pixel of the column
    // (xn / yn: next input / output, xa / ya: the ones after those)
    __m128 xn = MMCLAMPPS(_mm_load_ps(in + ((size_t)(height - 1) * width + i) * ch), Labmin, Labmax);
    __m128 xa = xn;
    __m128 yn = _mm_mul_ps(_mm_set_ps1(coefn), xn);
    __m128 ya = yn;

    for(int j=height - 1; j > -1; j--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      const __m128 xc = MMCLAMPPS(_mm_load_ps(in + offset), Labmin, Labmax);

      // yc = a2*xn + (a3*xa - (b1*yn + b2*ya))
      const __m128 yc = _mm_add_ps(_mm_mul_ps(xn, va2),
                                   _mm_sub_ps(_mm_mul_ps(xa, va3),
                                              _mm_add_ps(_mm_mul_ps(yn, vb1),
                                                         _mm_mul_ps(ya, vb2))));

      xa = xn;
      xn = xc;
      ya = yn;
      yn = yc;

      // add to the forward result stored above
      _mm_store_ps(temp + offset, _mm_add_ps(_mm_load_ps(temp + offset), yc));
    }
  }

  // horizontal blur line by line
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(out,temp,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int j=0; j<height; j++)
  {
    // scalar coefficients broadcast to all four lanes
    const __m128 va0 = _mm_set_ps1(a0);
    const __m128 va1 = _mm_set_ps1(a1);
    const __m128 va2 = _mm_set_ps1(a2);
    const __m128 va3 = _mm_set_ps1(a3);
    const __m128 vb1 = _mm_set_ps1(b1);
    const __m128 vb2 = _mm_set_ps1(b2);

    // forward filter: seed the history from the first pixel of the row
    __m128 xp = MMCLAMPPS(_mm_load_ps(temp + (size_t)j * width * ch), Labmin, Labmax);
    __m128 yb = _mm_mul_ps(_mm_set_ps1(coefp), xp);
    __m128 yp = yb;

    for(int i=0; i<width; i++)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      const __m128 xc = MMCLAMPPS(_mm_load_ps(temp + offset), Labmin, Labmax);

      // yc = a0*xc + (a1*xp - (b1*yp + b2*yb))
      const __m128 yc = _mm_add_ps(_mm_mul_ps(xc, va0),
                                   _mm_sub_ps(_mm_mul_ps(xp, va1),
                                              _mm_add_ps(_mm_mul_ps(yp, vb1),
                                                         _mm_mul_ps(yb, vb2))));

      _mm_store_ps(out + offset, yc);

      xp = xc;
      yb = yp;
      yp = yc;
    }

    // backward filter: seed the history from the last pixel of the row
    __m128 xn = MMCLAMPPS(_mm_load_ps(temp + ((size_t)(j + 1) * width - 1) * ch), Labmin, Labmax);
    __m128 xa = xn;
    __m128 yn = _mm_mul_ps(_mm_set_ps1(coefn), xn);
    __m128 ya = yn;

    for(int i=width - 1; i > -1; i--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      const __m128 xc = MMCLAMPPS(_mm_load_ps(temp + offset), Labmin, Labmax);

      // yc = a2*xn + (a3*xa - (b1*yn + b2*ya))
      const __m128 yc = _mm_add_ps(_mm_mul_ps(xn, va2),
                                   _mm_sub_ps(_mm_mul_ps(xa, va3),
                                              _mm_add_ps(_mm_mul_ps(yn, vb1),
                                                         _mm_mul_ps(ya, vb2))));

      xa = xn;
      xn = xc;
      ya = yn;
      yn = yc;

      // add to the forward result stored above
      _mm_store_ps(out + offset, _mm_add_ps(_mm_load_ps(out + offset), yc));
    }
  }
}
// Blur an interleaved float image with a recursive gaussian filter.
//
// The image is g->width x g->height pixels with g->channels floats per pixel,
// stored row by row. Filter coefficients are obtained from
// compute_gauss_params(g->sigma, g->order, ...). Every sample is passed through
// CLAMPF with the per-channel bounds g->min[k] / g->max[k] before it enters
// the filter.
//
// g   : blur descriptor; g->buf is used as intermediate storage and must hold
//       at least width * height * channels floats
// in  : source image, not modified
// out : destination image, same layout and size as in
//
// Two passes are run: the columns of `in` are filtered into g->buf, then the
// rows of g->buf are filtered into `out`. Each pass runs a forward and a
// backward recursion over the line and stores the sum of both.
//
// A scratch buffer of 9 * channels floats per thread is allocated with malloc
// for the filter state and freed before returning. If width, height or
// channels is not positive, or if that allocation fails, the function returns
// without touching `out`.
void dt_gaussian_blur(dt_gaussian_t *g, const float *const in, float *const out)
{
  const int width = g->width;
  const int height = g->height;
  const int ch = g->channels;

  // an empty image has no boundary sample to seed the filter with
  if(width <= 0 || height <= 0 || ch <= 0) return;

  float a0, a1, a2, a3, b1, b2, coefp, coefn;

  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

  float *const temp = g->buf;

  float *const Labmax = g->max;
  float *const Labmin = g->min;

  // nine state arrays of ch floats for each thread
  float *const buf = malloc((size_t)9 * ch * dt_get_num_threads() * sizeof(float));
  if(buf == NULL) return;

  // NOTE(review): the const-qualified locals used inside the parallel loops
  // (width, height, ch, in, out, temp, buf, Labmin, Labmax) are not named in
  // the data-sharing clauses -- confirm default(none) accepts this with the
  // project's OpenMP toolchain.

  // vertical blur column by column
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(a0, a1, a2, a3, b1, b2, coefp, coefn) schedule(static)
#endif
  for(int i = 0; i < width; i++)
  {
    const int threadnum = dt_get_thread_num();

    // This thread's scratch area, split into nine non-overlapping slices of
    // ch floats each:
    //   xp / yp : previous input / output,  yb : output before yp
    //   xc / yc : current input / output
    //   xn / yn : next input / output,      xa / ya : the ones after those
    float *const scratch = buf + (size_t)9 * ch * threadnum;
    float *xp = scratch + 0 * ch;
    float *yb = scratch + 1 * ch;
    float *yp = scratch + 2 * ch;
    float *xc = scratch + 3 * ch;
    float *yc = scratch + 4 * ch;
    float *xn = scratch + 5 * ch;
    float *xa = scratch + 6 * ch;
    float *yn = scratch + 7 * ch;
    float *ya = scratch + 8 * ch;

    // forward filter: seed the history from the first sample of the column
    for(int k = 0; k < ch; k++)
    {
      xp[k] = CLAMPF(in[(size_t)i * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int j = 0; j < height; j++)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(in[offset + k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        temp[offset + k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: seed the history from the last sample of the column
    for(int k = 0; k < ch; k++)
    {
      xn[k] = CLAMPF(in[((size_t)(height - 1) * width + i) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int j = height - 1; j > -1; j--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(in[offset + k], Labmin[k], Labmax[k]);

        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        // add to the forward result stored above
        temp[offset + k] += yc[k];
      }
    }
  }

  // horizontal blur line by line
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(a0, a1, a2, a3, b1, b2, coefp, coefn) schedule(static)
#endif
  for(int j = 0; j < height; j++)
  {
    const int threadnum = dt_get_thread_num();

    // same nine non-overlapping per-thread slices as in the vertical pass
    float *const scratch = buf + (size_t)9 * ch * threadnum;
    float *xp = scratch + 0 * ch;
    float *yb = scratch + 1 * ch;
    float *yp = scratch + 2 * ch;
    float *xc = scratch + 3 * ch;
    float *yc = scratch + 4 * ch;
    float *xn = scratch + 5 * ch;
    float *xa = scratch + 6 * ch;
    float *yn = scratch + 7 * ch;
    float *ya = scratch + 8 * ch;

    // forward filter: seed the history from the first sample of the row
    for(int k = 0; k < ch; k++)
    {
      xp[k] = CLAMPF(temp[(size_t)j * width * ch + k], Labmin[k], Labmax[k]);
      yb[k] = xp[k] * coefp;
      yp[k] = yb[k];
      xc[k] = yc[k] = xn[k] = xa[k] = yn[k] = ya[k] = 0.0f;
    }

    for(int i = 0; i < width; i++)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(temp[offset + k], Labmin[k], Labmax[k]);
        yc[k] = (a0 * xc[k]) + (a1 * xp[k]) - (b1 * yp[k]) - (b2 * yb[k]);

        out[offset + k] = yc[k];

        xp[k] = xc[k];
        yb[k] = yp[k];
        yp[k] = yc[k];
      }
    }

    // backward filter: seed the history from the last sample of the row
    for(int k = 0; k < ch; k++)
    {
      xn[k] = CLAMPF(temp[((size_t)(j + 1) * width - 1) * ch + k], Labmin[k], Labmax[k]);
      xa[k] = xn[k];
      yn[k] = xn[k] * coefn;
      ya[k] = yn[k];
    }

    for(int i = width - 1; i > -1; i--)
    {
      const size_t offset = ((size_t)j * width + i) * ch;

      for(int k = 0; k < ch; k++)
      {
        xc[k] = CLAMPF(temp[offset + k], Labmin[k], Labmax[k]);

        yc[k] = (a2 * xn[k]) + (a3 * xa[k]) - (b1 * yn[k]) - (b2 * ya[k]);

        xa[k] = xn[k];
        xn[k] = xc[k];
        ya[k] = yn[k];
        yn[k] = yc[k];

        // add to the forward result stored above
        out[offset + k] += yc[k];
      }
    }
  }

  free(buf);
}