lab[rowx-top][col-left+1].vec = cielabv(rix0); _mm_store_si128(&rix[-1].vec,rixr.vec); _mm_store_si128(&rix[0].vec,rix0.vec); rix[width+1].h.g = _mm_extract_epi32(rix_dr,0); rix[width+1].v.g = _mm_extract_epi32(rix_dr,1); } } else { int c1 = FC(rowx+1,left+2), c2 = FC(rowx,left+1); pix = (union rgbpix*)image + row*width+left; rix = &rgb[row-top][0]; val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2; rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g); val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2; rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g); for (col=left+1; col < width-3; col+=2) { pix = (union rgbpix*)image + rowx*width+col; union hvrgbpix rix0, rixr; rix = &rgb[rowx-top][col-left]; signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1]; signed pix_ur = pix[-width+1].c[c1]; rix0.h.c[c2] = rix0.v.c[c2] = pix[0].c[c2]; signed pix_lr = pix[0].c[c2] + pix[2].c[c2]; rixr.h.g = rixr.v.g = pix[1].g; pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1]+1;
void ahd_interpolate_tile(int top, char * buffer) { int row, col, tr, tc, c, val; const int dir[4] = { -1, 1, -width, width }; __m128i ldiff[2], abdiff[2]; union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer; union hvrgbpix *rix; union rgbpix * pix; union hvrgbpix (*lab)[width]; short (*lix)[8]; char (*h**o)[width][2]; lab = (union hvrgbpix (*)[width])(buffer + 16*width*TS); h**o = (char (*)[width][2])(buffer + 32*width*TS); const int left=2; if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) { fprintf(stderr, "unaligned buffers defeat speed!\n"); abort(); } /* Interpolate gren horz&vert, red and blue, and convert to CIELab: */ //do the first two rows of green first. //then one green, and rgb through the tile.. this because R/B needs down-right green value for (row=top; row < top+2 && row < height-2; row++) { col = left + (FC(row,left) & 1); for (c = FC(row,col); col < width-2; col+=2) { pix = (union rgbpix*)image + row*width+col; val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2 - pix[-2].c[c] - pix[2].c[c]) >> 2; rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g); val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2 - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2; rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g); } } for (; row < top+TS && row < height-2; row++) { int rowx = row-1; if (FC(rowx,left+1)==1) { int c1 = FC(rowx+1,left+1), c2 = FC(rowx,left+2); pix = (union rgbpix*)image + row*width+left+1; rix = &rgb[row-top][1]; val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2; rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g); val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2; rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g); for (col=left+1; col < width-3; col+=2) { pix = (union rgbpix*)image + rowx*width+col+1; union hvrgbpix rixr, rix0; rix = &rgb[rowx-top][col-left]+1; signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1]; 
signed pix_ul = pix[-width-1].c[c1]; rixr.vec = _mm_set1_epi16(pix[-1].g); signed pix_lr = pix[-2].c[c2] + pix[0].c[c2]; rix0.h.c[c2] = rix0.v.c[c2] = pix[0].c[c2]; pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1; signed pix_dl = pix[width-1].c[c1]; //fully loaded __m128i rix_dr = _mm_setr_epi32(pix[width].g, pix[width-1].c[c1], pix[1].g, pix[-width+1].c[c1]); rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+1].c[c1], pix[width+3].c[c1], pix[width+1].c[c1], 0)); rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+2].g, 0, pix[2*width+1].g, pix[3*width+1].c[c1])); rix_dr = _mm_mullo_epi32(rix_dr,_mm_setr_epi32(2,1,2,1)); //half loaded rix_dr = _mm_hsub_epi32(rix_dr,_mm_setzero_si128()); rix_dr = _mm_srai_epi32(rix_dr,2); __m128i a = _mm_setr_epi32(pix[width].g,pix[1].g,0,0); __m128i b = _mm_setr_epi32(pix[width+2].g,pix[2*width+1].g,0,0); __m128i m = _mm_min_epi32(a,b); __m128i M = _mm_max_epi32(a,b); rix_dr = _mm_min_epi32(rix_dr,M); rix_dr = _mm_max_epi32(rix_dr,m); signed pix_udr = pix_ul + pix_dl; signed rix0_ul = rix[-width-1].h.g; signed rix1_ul = rix[-width-1].v.g; __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0); signed rix0_rr = rix[-2].h.g; signed rix1_rr = rix[-2].v.g; rix0.h.g = rix[0].h.g; rix0.v.g = rix[0].v.g; signed rix0_dl = rix[width-1].h.g; signed rix1_dl = rix[width-1].v.g; // fully loaded __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr); rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl, rix0.h.g, rix0.v.g)); __m128i v2 = _mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr); v2 = _mm_sub_epi32(v2, rix_udr); v2 = _mm_srai_epi32(v2,1); v2 = _mm_add_epi32(v2,_mm_cvtepu16_epi32(rixr.vec)); v2 = _mm_max_epi32(v2, _mm_setzero_si128()); v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff)); rixr.h.c[c2] = _mm_extract_epi32(v2,2); rixr.v.c[c2] = _mm_extract_epi32(v2,3); rixr.h.c[c1] = _mm_extract_epi32(v2,0); rixr.v.c[c1] = _mm_extract_epi32(v2,1); // following only uses 64 bit __m128i 
v1 = _mm_set1_epi32(pix_diag); v1 = _mm_sub_epi32(v1, rix_ur); v1 = _mm_sub_epi32(v1, rix_dr); v1 = _mm_sub_epi32(v1, rix_udr); v1 = _mm_srai_epi32(v1,2); v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0)); v1 = _mm_max_epi32(v1, _mm_setzero_si128()); v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff)); rix0.h.c[c1] = _mm_extract_epi32(v1,0); rix0.v.c[c1] = _mm_extract_epi32(v1,1); lab[rowx-top][col-left].vec = cielabv(rixr); lab[rowx-top][col-left+1].vec = cielabv(rix0); _mm_store_si128(&rix[-1].vec,rixr.vec); _mm_store_si128(&rix[0].vec,rix0.vec); rix[width+1].h.g = _mm_extract_epi32(rix_dr,0); rix[width+1].v.g = _mm_extract_epi32(rix_dr,1); } } else {
// LMMSE demosaic of the global Bayer mosaic `image` (height x width, four
// ushort channels per pixel).
//
// gamma_apply: when non-zero, CFA samples are mapped through a 65536-entry
//   look-up table (linear segment below 0.0031308, power 1/2.4 above) before
//   interpolation, and the inverse curve is applied when results are written
//   back.  When zero, samples are simply scaled to [0,1] by 1/65535.
//
// Work buffer: (height+2*ba) x (width+2*ba) cells of 6 floats each, with
// ba = 10 border pixels on every side (zero-filled by calloc).  Per-cell slot
// usage as the algorithm progresses:
//   [4]      CFA sample, later interpolated G-R(B), later median scratch
//   [0],[1]  horizontal / vertical G-R(B) estimates, later R and G
//   [2],[3]  low-pass filtered [0] / [1], later B and the R-G difference
//   [5]      B-G difference
// Only the channels that are NOT the native CFA colour of a pixel are written
// back to `image`.
//
// Relies on names defined elsewhere in the file: image, height, width, FC,
// merror, ULIM, LIM, CLIP, PIX_SORT, verbose, _ and CLASS.
void CLASS lmmse_interpolate(int gamma_apply)
{
  ushort (*pix)[4];
  int row, col, c, d, w1, w2, w3, w4, ii, ba, rr1, cc1, rr, cc, pass;
  float h0, h1, h2, h3, h4, hs;
  // NOTE(review): `temp` is not referenced directly below; presumably the
  // PIX_SORT macro (defined elsewhere) uses it as swap scratch -- keep it.
  float p1, p2, p3, p4, p5, p6, p7, p8, p9, temp;
  float Y, v0, mu, vx, vn, xh, vh, xv, vv;
  float (*rix)[6], (*qix)[6];
  float *glut = NULL;          // only set and used when gamma_apply != 0
  char *buffer;
  size_t qix_bytes;            // byte size of the 6-float-per-pixel work area
  clock_t t1, t2;
  double dt;
#ifdef DCRAW_VERBOSE
  if (verbose) fprintf(stderr,_("LMMSE interpolation...\n"));
#endif
  t1 = clock();

  // allocate work with boundary
  ba = 10;
  rr1 = height + 2*ba;
  cc1 = width + 2*ba;
  // Do the size arithmetic in size_t.  The former int expression rr1*cc1*24
  // (used as the byte offset of the gamma table) exceeds INT_MAX at roughly
  // 89 million padded pixels, which is signed overflow and yields a pointer
  // outside the buffer.  Using sizeof also drops the hard-coded 24.
  qix_bytes = (size_t)rr1 * (size_t)cc1 * 6 * sizeof(float);
  if (gamma_apply)
    buffer = (char *)calloc(qix_bytes + 65536*sizeof(float),1);
  else
    buffer = (char *)calloc(qix_bytes,1);
  merror(buffer,"lmmse_interpolate()");
  qix = (float (*)[6])buffer;
  if (gamma_apply) {
    // The look-up table lives directly after the work area.
    glut = (float *)(buffer + qix_bytes);
    for (ii=0; ii < 65536; ii++) {
      v0 = (float)ii / 65535.0;
      if (v0 <= 0.0031308)
        glut[ii] = v0*12.92;
      else
        glut[ii] = 1.055*pow((double)v0,1./2.4) - 0.055;
    }
  }

  // indices: row strides (in cells) for 1..4 rows of the padded buffer
  w1 = cc1;
  w2 = 2*w1;
  w3 = 3*w1;
  w4 = 4*w1;

  // define low pass filter (sigma=2, L=4): 9-tap Gaussian, normalised to sum 1
  h0 = 1.0;
  h1 = exp( -1.0/8.0);
  h2 = exp( -4.0/8.0);
  h3 = exp( -9.0/8.0);
  h4 = exp(-16.0/8.0);
  hs = h0 + 2.0*(h1 + h2 + h3 + h4);
  h0 /= hs;
  h1 /= hs;
  h2 /= hs;
  h3 /= hs;
  h4 /= hs;

  // copy CFA values into slot [4]; cells outside the image are set to 0
  for (rr=0; rr < rr1; rr++)
    for (cc=0, row=rr-ba; cc < cc1; cc++) {
      col = cc - ba;
      rix = qix + rr*cc1 + cc;
      if ((row >= 0) & (row < height) & (col >= 0) & (col < width)) {
        if (gamma_apply)
          rix[0][4] = glut[image[row*width+col][FC(row,col)]];
        else
          rix[0][4] = (double)image[row*width+col][FC(row,col)]/65535.0;
      }
      else
        rix[0][4] = 0;
    }

  // G-R(B): directional colour-difference estimates in slots [0] (horizontal)
  // and [1] (vertical)
  for (rr=2; rr < rr1-2; rr++) {
    // G-R(B) at R(B) location
    for (cc=2+(FC(rr,2)&1); cc < cc1-2; cc+=2) {
      rix = qix + rr*cc1 + cc;
      // v0 = 0.25R + 0.25B, Y = 0.25R + 0.5G + 0.25B
      // (the original comment read "0.5B"; presumably 0.5G, since the term
      //  added to v0 below is the green estimate)
      v0 = 0.0625*(rix[-w1-1][4]+rix[-w1+1][4]+rix[w1-1][4]+rix[w1+1][4]) + 0.25*rix[0][4];
      // horizontal
      rix[0][0] = -0.25*(rix[ -2][4] + rix[ 2][4]) + 0.5*(rix[ -1][4] + rix[0][4] + rix[ 1][4]);
      Y = v0 + 0.5*rix[0][0];
      if (rix[0][4] > 1.75*Y)
        rix[0][0] = ULIM(rix[0][0],rix[ -1][4],rix[ 1][4]);
      else
        rix[0][0] = LIM(rix[0][0],0.0,1.0);
      rix[0][0] -= rix[0][4];
      // vertical
      rix[0][1] = -0.25*(rix[-w2][4] + rix[w2][4]) + 0.5*(rix[-w1][4] + rix[0][4] + rix[w1][4]);
      Y = v0 + 0.5*rix[0][1];
      if (rix[0][4] > 1.75*Y)
        rix[0][1] = ULIM(rix[0][1],rix[-w1][4],rix[w1][4]);
      else
        rix[0][1] = LIM(rix[0][1],0.0,1.0);
      rix[0][1] -= rix[0][4];
    }
    // G-R(B) at G location
    for (cc=2+(FC(rr,3)&1); cc < cc1-2; cc+=2) {
      rix = qix + rr*cc1 + cc;
      rix[0][0] = 0.25*(rix[ -2][4] + rix[ 2][4]) - 0.5*(rix[ -1][4] + rix[0][4] + rix[ 1][4]);
      rix[0][1] = 0.25*(rix[-w2][4] + rix[w2][4]) - 0.5*(rix[-w1][4] + rix[0][4] + rix[w1][4]);
      rix[0][0] = LIM(rix[0][0],-1.0,0.0) + rix[0][4];
      rix[0][1] = LIM(rix[0][1],-1.0,0.0) + rix[0][4];
    }
  }

  // apply low pass filter on differential colors:
  // [2] = horizontal filter of [0], [3] = vertical filter of [1]
  for (rr=4; rr < rr1-4; rr++)
    for (cc=4; cc < cc1-4; cc++) {
      rix = qix + rr*cc1 + cc;
      rix[0][2] = h0*rix[0][0] + h1*(rix[ -1][0] + rix[ 1][0]) + h2*(rix[ -2][0] + rix[ 2][0]) + h3*(rix[ -3][0] + rix[ 3][0]) + h4*(rix[ -4][0] + rix[ 4][0]);
      rix[0][3] = h0*rix[0][1] + h1*(rix[-w1][1] + rix[w1][1]) + h2*(rix[-w2][1] + rix[w2][1]) + h3*(rix[-w3][1] + rix[w3][1]) + h4*(rix[-w4][1] + rix[w4][1]);
    }

  // interpolate G-R(B) at R(B): for each direction, vx is the spread of the
  // filtered signal over a 9-sample window and vn the energy of
  // (raw - filtered); the two directional estimates are then blended.
  for (rr=4; rr < rr1-4; rr++)
    for (cc=4+(FC(rr,4)&1); cc < cc1-4; cc+=2) {
      rix = qix + rr*cc1 + cc;
      // horizontal
      mu = (rix[-4][2] + rix[-3][2] + rix[-2][2] + rix[-1][2] + rix[0][2]+ rix[ 1][2] + rix[ 2][2] + rix[ 3][2] + rix[ 4][2]) / 9.0;
      p1 = rix[-4][2] - mu;
      p2 = rix[-3][2] - mu;
      p3 = rix[-2][2] - mu;
      p4 = rix[-1][2] - mu;
      p5 = rix[ 0][2] - mu;
      p6 = rix[ 1][2] - mu;
      p7 = rix[ 2][2] - mu;
      p8 = rix[ 3][2] - mu;
      p9 = rix[ 4][2] - mu;
      // the 1e-7 term keeps the divisions below away from 0/0
      vx = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
      p1 = rix[-4][0] - rix[-4][2];
      p2 = rix[-3][0] - rix[-3][2];
      p3 = rix[-2][0] - rix[-2][2];
      p4 = rix[-1][0] - rix[-1][2];
      p5 = rix[ 0][0] - rix[ 0][2];
      p6 = rix[ 1][0] - rix[ 1][2];
      p7 = rix[ 2][0] - rix[ 2][2];
      p8 = rix[ 3][0] - rix[ 3][2];
      p9 = rix[ 4][0] - rix[ 4][2];
      vn = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
      xh = (rix[0][0]*vx + rix[0][2]*vn)/(vx + vn);
      vh = vx*vn/(vx + vn);
      // vertical
      mu = (rix[-w4][3] + rix[-w3][3] + rix[-w2][3] + rix[-w1][3] + rix[0][3]+ rix[ w1][3] + rix[ w2][3] + rix[ w3][3] + rix[ w4][3]) / 9.0;
      p1 = rix[-w4][3] - mu;
      p2 = rix[-w3][3] - mu;
      p3 = rix[-w2][3] - mu;
      p4 = rix[-w1][3] - mu;
      p5 = rix[  0][3] - mu;
      p6 = rix[ w1][3] - mu;
      p7 = rix[ w2][3] - mu;
      p8 = rix[ w3][3] - mu;
      p9 = rix[ w4][3] - mu;
      vx = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
      p1 = rix[-w4][1] - rix[-w4][3];
      p2 = rix[-w3][1] - rix[-w3][3];
      p3 = rix[-w2][1] - rix[-w2][3];
      p4 = rix[-w1][1] - rix[-w1][3];
      p5 = rix[  0][1] - rix[  0][3];
      p6 = rix[ w1][1] - rix[ w1][3];
      p7 = rix[ w2][1] - rix[ w2][3];
      p8 = rix[ w3][1] - rix[ w3][3];
      p9 = rix[ w4][1] - rix[ w4][3];
      vn = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
      xv = (rix[0][1]*vx + rix[0][3]*vn)/(vx + vn);
      vv = vx*vn/(vx + vn);
      // interpolated G-R(B): each direction is weighted by the other's variance
      rix[0][4] = (xh*vv + xv*vh)/(vh + vv);
    }

  // copy CFA values into their own colour slot; at non-green cells the green
  // slot becomes native value + interpolated G-R(B)
  for (rr=0; rr < rr1; rr++)
    for (cc=0, row=rr-ba; cc < cc1; cc++) {
      col=cc-ba;
      rix = qix + rr*cc1 + cc;
      c = FC(rr,cc);
      if ((row >= 0) & (row < height) & (col >= 0) & (col < width)) {
        if (gamma_apply)
          rix[0][c] = glut[image[row*width+col][c]];
        else
          rix[0][c] = (double)image[row*width+col][c]/65535.0;
      }
      else
        rix[0][c] = 0;
      if (c != 1) rix[0][1] = rix[0][c] + rix[0][4];
    }

  // bilinear interpolation for R/B
  // interpolate R/B at G location
  for (rr=1; rr < rr1-1; rr++)
    for (cc=1+(FC(rr,2)&1), c=FC(rr,cc+1); cc < cc1-1; cc+=2) {
      rix = qix + rr*cc1 + cc;
      // colour of the left/right neighbours
      rix[0][c] = rix[0][1] + 0.5*(rix[ -1][c] - rix[ -1][1] + rix[ 1][c] - rix[ 1][1]);
      c = 2 - c;
      // the other colour, taken from the up/down neighbours
      rix[0][c] = rix[0][1] + 0.5*(rix[-w1][c] - rix[-w1][1] + rix[w1][c] - rix[w1][1]);
      c = 2 - c;   // restore c for the next cell of this row
    }
  // interpolate R/B at B/R location
  for (rr=1; rr < rr1-1; rr++)
    for (cc=1+(FC(rr,1)&1), c=2-FC(rr,cc); cc < cc1-1; cc+=2) {
      rix = qix + rr*cc1 + cc;
      rix[0][c] = rix[0][1] + 0.25*(rix[-w1][c] - rix[-w1][1] + rix[ -1][c] - rix[ -1][1]+ rix[ 1][c] - rix[ 1][1] + rix[ w1][c] - rix[ w1][1]);
    }

  // median filter: three passes over the R-G ([3]) and B-G ([5]) planes
  for (pass=1; pass <= 3; pass++) {
    for (c=0; c < 3; c+=2) {
      // Compute median(R-G) and median(B-G)
      d = c + 3;
      for (ii=0; ii < rr1*cc1; ii++)
        qix[ii][d] = qix[ii][c] - qix[ii][1];
      // Apply 3x3 median filter; results go to slot [4] so the input plane
      // stays intact while it is being read
      for (rr=1; rr < rr1-1; rr++)
        for (cc=1; cc < cc1-1; cc++) {
          rix = qix + rr*cc1 + cc;
          // Assign 3x3 differential color values
          p1 = rix[-w1-1][d];
          p2 = rix[-w1  ][d];
          p3 = rix[-w1+1][d];
          p4 = rix[   -1][d];
          p5 = rix[    0][d];
          p6 = rix[    1][d];
          p7 = rix[ w1-1][d];
          p8 = rix[ w1  ][d];
          p9 = rix[ w1+1][d];
          // Sort for median of 9 values
          PIX_SORT(p2,p3);
          PIX_SORT(p5,p6);
          PIX_SORT(p8,p9);
          PIX_SORT(p1,p2);
          PIX_SORT(p4,p5);
          PIX_SORT(p7,p8);
          PIX_SORT(p2,p3);
          PIX_SORT(p5,p6);
          PIX_SORT(p8,p9);
          PIX_SORT(p1,p4);
          PIX_SORT(p6,p9);
          PIX_SORT(p5,p8);
          PIX_SORT(p4,p7);
          PIX_SORT(p2,p5);
          PIX_SORT(p3,p6);
          PIX_SORT(p5,p8);
          PIX_SORT(p5,p3);
          PIX_SORT(p7,p5);
          PIX_SORT(p5,p3);
          rix[0][4] = p5;
        }
      for (ii=0; ii < rr1*cc1; ii++)
        qix[ii][d] = qix[ii][4];
    }
    // red/blue at GREEN pixel locations
    for (rr=0; rr < rr1; rr++)
      for (cc=(FC(rr,1)&1), c=FC(rr,cc+1); cc < cc1; cc+=2) {
        rix = qix + rr*cc1 + cc;
        rix[0][0] = rix[0][1] + rix[0][3];
        rix[0][2] = rix[0][1] + rix[0][5];
      }
    // red/blue and green at BLUE/RED pixel locations
    for (rr=0; rr < rr1; rr++)
      for (cc=(FC(rr,0)&1), c=2-FC(rr,cc), d=c+3; cc < cc1; cc+=2) {
        rix = qix + rr*cc1 + cc;
        rix[0][c] = rix[0][1] + rix[0][d];
        rix[0][1] = 0.5*(rix[0][0] - rix[0][3] + rix[0][2] - rix[0][5]);
      }
  }

  // copy result back to image matrix (the native CFA channel is left untouched)
  for (row=0; row < height; row++)
    for (col=0, rr=row+ba; col < width; col++) {
      cc = col+ba;
      pix = image + row*width + col;
      rix = qix + rr*cc1 + cc;
      c = FC(row,col);
      if (gamma_apply) {
        for (ii=0; ii < 3; ii++)
          if (ii != c) {
            // invert the transfer curve applied through glut
            v0 = rix[0][ii];
            if (v0 <= 0.04045)
              v0 /= 12.92;
            else
              v0 = pow((v0 + 0.055)/1.055,2.4);
            pix[0][ii] = CLIP((int)(65535.0*v0 + 0.5));
          }
      }
      else
        for (ii=0; ii < 3; ii++)
          if (ii != c)
            pix[0][ii] = CLIP((int)(65535.0*rix[0][ii] + 0.5));
    }

  // Done
  free(buffer);
  t2 = clock();
  dt = ((double)(t2-t1)) / CLOCKS_PER_SEC;
#ifdef DCRAW_VERBOSE
  if (verbose) fprintf(stderr,_("\telapsed time = %5.3fs\n"),dt);
#endif
}