lab[rowx-top][col-left+1].vec = cielabv(rix0);

                _mm_store_si128(&rix[-1].vec,rixr.vec);
                _mm_store_si128(&rix[0].vec,rix0.vec);

                rix[width+1].h.g = _mm_extract_epi32(rix_dr,0);
                rix[width+1].v.g = _mm_extract_epi32(rix_dr,1);
            }
        } else {
            int c1 = FC(rowx+1,left+2),
                c2 = FC(rowx,left+1);

            pix = (union rgbpix*)image + row*width+left;
            rix = &rgb[row-top][0];
            val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2;
            rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2;
            rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g);
            for (col=left+1; col < width-3; col+=2) {
                pix = (union rgbpix*)image + rowx*width+col;

                union hvrgbpix rix0, rixr;

                rix = &rgb[rowx-top][col-left];

                signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1];
                signed pix_ur = pix[-width+1].c[c1];
                rix0.h.c[c2] = rix0.v.c[c2] = pix[0].c[c2];
                signed pix_lr = pix[0].c[c2] + pix[2].c[c2];
                rixr.h.g = rixr.v.g = pix[1].g;
                pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1]+1;
void ahd_interpolate_tile(int top, char * buffer)
{
    int row, col, tr, tc, c, val;
    const int dir[4] = { -1, 1, -width, width };
    __m128i ldiff[2], abdiff[2];
    union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer;
    union hvrgbpix *rix;
    union rgbpix * pix;
    union hvrgbpix (*lab)[width];
    short (*lix)[8];
    char (*h**o)[width][2];
    lab  = (union hvrgbpix (*)[width])(buffer + 16*width*TS);
    h**o = (char  (*)[width][2])(buffer + 32*width*TS);

    const int left=2;

    if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) {
        fprintf(stderr, "unaligned buffers defeat speed!\n"); abort();
    }

    /*  Interpolate green horz&vert, red and blue, and convert to CIELab:  */
    //do the first two rows of green first.
    //then one green, and rgb through the tile.. this because R/B needs down-right green value
    for (row=top; row < top+2 && row < height-2; row++) {
        col = left + (FC(row,left) & 1);
        for (c = FC(row,col); col < width-2; col+=2) {
            pix = (union rgbpix*)image + row*width+col;
            val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2 - pix[-2].c[c] - pix[2].c[c]) >> 2;
            rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2 - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2;
            rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g);
        }
    }

    for (; row < top+TS && row < height-2; row++) {
        int rowx = row-1;

        if (FC(rowx,left+1)==1) {
            int c1 = FC(rowx+1,left+1),
                c2 = FC(rowx,left+2);

            pix = (union rgbpix*)image + row*width+left+1;
            rix = &rgb[row-top][1];

            val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2;
            rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2;
            rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g);
            for (col=left+1; col < width-3; col+=2) {
                pix = (union rgbpix*)image + rowx*width+col+1;

                union hvrgbpix rixr, rix0;

                rix = &rgb[rowx-top][col-left]+1;

                signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1];
                signed pix_ul = pix[-width-1].c[c1];
                rixr.vec = _mm_set1_epi16(pix[-1].g);
                signed pix_lr = pix[-2].c[c2] + pix[0].c[c2];
                rix0.h.c[c2] = rix0.v.c[c2]  = pix[0].c[c2];
                pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1;
                signed pix_dl = pix[width-1].c[c1];

                //fully loaded
                __m128i rix_dr =               _mm_setr_epi32(pix[width].g,       pix[width-1].c[c1], pix[1].g, pix[-width+1].c[c1]);
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+1].c[c1],  pix[width+3].c[c1], pix[width+1].c[c1], 0));
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+2].g,      0,                  pix[2*width+1].g, pix[3*width+1].c[c1]));
                rix_dr = _mm_mullo_epi32(rix_dr,_mm_setr_epi32(2,1,2,1));
                //half loaded
                rix_dr = _mm_hsub_epi32(rix_dr,_mm_setzero_si128());
                rix_dr = _mm_srai_epi32(rix_dr,2);
                __m128i a = _mm_setr_epi32(pix[width].g,pix[1].g,0,0);
                __m128i b = _mm_setr_epi32(pix[width+2].g,pix[2*width+1].g,0,0);
                __m128i m = _mm_min_epi32(a,b);
                __m128i M = _mm_max_epi32(a,b);
                rix_dr = _mm_min_epi32(rix_dr,M);
                rix_dr = _mm_max_epi32(rix_dr,m);

                signed pix_udr = pix_ul + pix_dl;

                signed rix0_ul = rix[-width-1].h.g;
                signed rix1_ul = rix[-width-1].v.g;
                __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0);
                signed rix0_rr = rix[-2].h.g;
                signed rix1_rr = rix[-2].v.g;

                rix0.h.g = rix[0].h.g;
                rix0.v.g = rix[0].v.g;
                signed rix0_dl = rix[width-1].h.g;
                signed rix1_dl = rix[width-1].v.g;

                // fully loaded
                __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr);
                rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl, rix0.h.g, rix0.v.g));
                __m128i v2 = _mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr);
                v2 = _mm_sub_epi32(v2, rix_udr);
                v2 = _mm_srai_epi32(v2,1);
                v2 = _mm_add_epi32(v2,_mm_cvtepu16_epi32(rixr.vec));
                v2 = _mm_max_epi32(v2, _mm_setzero_si128());
                v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff));
                rixr.h.c[c2] = _mm_extract_epi32(v2,2);
                rixr.v.c[c2] = _mm_extract_epi32(v2,3);
                rixr.h.c[c1] = _mm_extract_epi32(v2,0);
                rixr.v.c[c1] = _mm_extract_epi32(v2,1);

                // following only uses 64 bit
                __m128i v1 = _mm_set1_epi32(pix_diag);
                v1 = _mm_sub_epi32(v1, rix_ur);
                v1 = _mm_sub_epi32(v1, rix_dr);
                v1 = _mm_sub_epi32(v1, rix_udr);
                v1 = _mm_srai_epi32(v1,2);
                v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0));
                v1 = _mm_max_epi32(v1, _mm_setzero_si128());
                v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff));
                rix0.h.c[c1] = _mm_extract_epi32(v1,0);
                rix0.v.c[c1] = _mm_extract_epi32(v1,1);


                lab[rowx-top][col-left].vec = cielabv(rixr);
                lab[rowx-top][col-left+1].vec = cielabv(rix0);

                _mm_store_si128(&rix[-1].vec,rixr.vec);
                _mm_store_si128(&rix[0].vec,rix0.vec);

                rix[width+1].h.g = _mm_extract_epi32(rix_dr,0);
                rix[width+1].v.g = _mm_extract_epi32(rix_dr,1);
            }
        } else {
Example #3
0
/*
 * LMMSE demosaic of the global `image` (width x height, one CFA sample per
 * pixel selected by FC()).  Fills in the two colour channels missing at
 * every pixel; the channel that was actually sampled is left untouched.
 *
 * gamma_apply: when non-zero, CFA samples are passed through a 65536-entry
 *              gamma lookup table (12.92*v below 0.0031308, otherwise
 *              1.055*v^(1/2.4) - 0.055) before processing, and the inverse
 *              curve is applied when results are written back.
 *
 * Work buffer: (height+2*ba) x (width+2*ba) cells of 6 floats, zero padded
 * by ba pixels on every side.  Slot usage changes over the passes:
 *   [4]      CFA sample, later interpolated G-R(B), later median scratch
 *   [0],[1]  horizontal / vertical G-R(B) estimates, later R and G
 *   [2],[3]  low-passed [0] / [1], later B and median(R-G)
 *   [5]      median(B-G)
 *
 * Note: `temp` looks unused but is presumably the swap variable of the
 * PIX_SORT macro (defined elsewhere) - do not remove without checking.
 */
void CLASS lmmse_interpolate(int gamma_apply)
{
  ushort (*pix)[4];
  int row, col, c, d, w1, w2, w3, w4, ii, ba, rr1, cc1, rr, cc, pass;
  float h0, h1, h2, h3, h4, hs;
  float p1, p2, p3, p4, p5, p6, p7, p8, p9, temp;
  float Y, v0, mu, vx, vn, xh, vh, xv, vv;
  float (*rix)[6], (*qix)[6];
  float (*glut);
  char  *buffer;
  size_t npix, nfloat;
  clock_t t1, t2;
  double dt;
#ifdef DCRAW_VERBOSE
  if (verbose) fprintf(stderr,_("LMMSE interpolation...\n"));
#endif
  t1 = clock();
  // allocate work with boundary
  ba = 10;
  rr1 = height + 2*ba;
  cc1 = width + 2*ba;
  // Buffer sizes are computed in size_t: the former int expression
  // rr1*cc1*24 overflows (undefined behaviour) above ~89 Mpixel.
  npix = (size_t)rr1 * (size_t)cc1;
  nfloat = npix*6;
  if (gamma_apply)
    nfloat += 65536;		// room for the gamma table behind the pixel cells
  // calloc(count,size) zero-fills and rejects count*size overflow
  buffer = (char *)calloc(nfloat,sizeof(float));
  merror(buffer,"lmmse_interpolate()");
  qix = (float (*)[6])buffer;
  if (gamma_apply) {
    // gamma table lives directly after the npix*6 work floats
    glut = (float *)buffer + npix*6;
    for (ii=0; ii < 65536; ii++) {
      v0 = (float)ii / 65535.0;
      if (v0 <= 0.0031308)
	glut[ii] = v0*12.92;
      else
	glut[ii] = 1.055*pow((double)v0,1./2.4) - 0.055; } }
  // indices
  w1 = cc1;
  w2 = 2*w1;
  w3 = 3*w1;
  w4 = 4*w1;
  // define low pass filter (sigma=2, L=4)
  h0 = 1.0;
  h1 = exp( -1.0/8.0);
  h2 = exp( -4.0/8.0);
  h3 = exp( -9.0/8.0);
  h4 = exp(-16.0/8.0);
  hs = h0 + 2.0*(h1 + h2 + h3 + h4);
  h0 /= hs;
  h1 /= hs;
  h2 /= hs;
  h3 /= hs;
  h4 /= hs;
  // copy CFA values into slot [4]; the padding border is set to 0
  for (rr=0; rr < rr1; rr++)
    for (cc=0, row=rr-ba; cc < cc1; cc++) {
      col = cc - ba;
      rix = qix + rr*cc1 + cc;
      if ((row >= 0) & (row < height) & (col >= 0) & (col < width))
	if (gamma_apply)
	  rix[0][4] = glut[image[row*width+col][FC(row,col)]];
	else
	  rix[0][4] = (double)image[row*width+col][FC(row,col)]/65535.0;
      else
	rix[0][4] = 0; }
  // G-R(B)
  for (rr=2; rr < rr1-2; rr++) {
    // G-R(B) at R(B) location
    for (cc=2+(FC(rr,2)&1); cc < cc1-2; cc+=2) {
      rix = qix + rr*cc1 + cc;
      // v0 = 0.25R + 0.25B, Y = 0.25R + 0.5G + 0.25B
      v0 = 0.0625*(rix[-w1-1][4]+rix[-w1+1][4]+rix[w1-1][4]+rix[w1+1][4]) +
	0.25*rix[0][4];
      // horizontal
      rix[0][0] = -0.25*(rix[ -2][4] + rix[ 2][4])
	+ 0.5*(rix[ -1][4] + rix[0][4] + rix[ 1][4]);
      Y = v0 + 0.5*rix[0][0];
      if (rix[0][4] > 1.75*Y)
	rix[0][0] = ULIM(rix[0][0],rix[ -1][4],rix[ 1][4]);
      else
	rix[0][0] = LIM(rix[0][0],0.0,1.0);
      rix[0][0] -= rix[0][4];
      // vertical
      rix[0][1] = -0.25*(rix[-w2][4] + rix[w2][4])
	+ 0.5*(rix[-w1][4] + rix[0][4] + rix[w1][4]);
      Y = v0 + 0.5*rix[0][1];
      if (rix[0][4] > 1.75*Y)
	rix[0][1] = ULIM(rix[0][1],rix[-w1][4],rix[w1][4]);
      else
	rix[0][1] = LIM(rix[0][1],0.0,1.0);
      rix[0][1] -= rix[0][4]; }
    // G-R(B) at G location
    for (cc=2+(FC(rr,3)&1); cc < cc1-2; cc+=2) {
      rix = qix + rr*cc1 + cc;
      rix[0][0] = 0.25*(rix[ -2][4] + rix[ 2][4])
	- 0.5*(rix[ -1][4] + rix[0][4] + rix[ 1][4]);
      rix[0][1] = 0.25*(rix[-w2][4] + rix[w2][4])
	- 0.5*(rix[-w1][4] + rix[0][4] + rix[w1][4]);
      rix[0][0] = LIM(rix[0][0],-1.0,0.0) + rix[0][4];
      rix[0][1] = LIM(rix[0][1],-1.0,0.0) + rix[0][4];
    } }
  // apply low pass filter on differential colors
  // ([0] filtered along the row -> [2], [1] filtered along the column -> [3])
  for (rr=4; rr < rr1-4; rr++)
    for (cc=4; cc < cc1-4; cc++) {
      rix = qix + rr*cc1 + cc;
      rix[0][2] = h0*rix[0][0] +
	h1*(rix[ -1][0] + rix[ 1][0]) + h2*(rix[ -2][0] + rix[ 2][0]) +
	h3*(rix[ -3][0] + rix[ 3][0]) + h4*(rix[ -4][0] + rix[ 4][0]);
      rix[0][3] = h0*rix[0][1] +
	h1*(rix[-w1][1] + rix[w1][1]) + h2*(rix[-w2][1] + rix[w2][1]) +
	h3*(rix[-w3][1] + rix[w3][1]) + h4*(rix[-w4][1] + rix[w4][1]); }
  // interpolate G-R(B) at R(B)
  for (rr=4; rr < rr1-4; rr++)
    for (cc=4+(FC(rr,4)&1); cc < cc1-4; cc+=2) {
      rix = qix + rr*cc1 + cc;
      // horizontal
      // vx: spread of the filtered signal over a 9-tap window (1e-7 avoids /0)
      mu = (rix[-4][2] + rix[-3][2] + rix[-2][2] + rix[-1][2] + rix[0][2]+
	    rix[ 1][2] + rix[ 2][2] + rix[ 3][2] + rix[ 4][2]) / 9.0;
      p1 = rix[-4][2] - mu;
      p2 = rix[-3][2] - mu;
      p3 = rix[-2][2] - mu;
      p4 = rix[-1][2] - mu;
      p5 = rix[ 0][2] - mu;
      p6 = rix[ 1][2] - mu;
      p7 = rix[ 2][2] - mu;
      p8 = rix[ 3][2] - mu;
      p9 = rix[ 4][2] - mu;
      vx = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
      // vn: energy of (raw - filtered) over the same window
      p1 = rix[-4][0] - rix[-4][2];
      p2 = rix[-3][0] - rix[-3][2];
      p3 = rix[-2][0] - rix[-2][2];
      p4 = rix[-1][0] - rix[-1][2];
      p5 = rix[ 0][0] - rix[ 0][2];
      p6 = rix[ 1][0] - rix[ 1][2];
      p7 = rix[ 2][0] - rix[ 2][2];
      p8 = rix[ 3][0] - rix[ 3][2];
      p9 = rix[ 4][0] - rix[ 4][2];
      vn = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
      xh = (rix[0][0]*vx + rix[0][2]*vn)/(vx + vn);
      vh = vx*vn/(vx + vn);
      // vertical
      mu = (rix[-w4][3] + rix[-w3][3] + rix[-w2][3] + rix[-w1][3] + rix[0][3]+
	    rix[ w1][3] + rix[ w2][3] + rix[ w3][3] + rix[ w4][3]) / 9.0;
      p1 = rix[-w4][3] - mu;
      p2 = rix[-w3][3] - mu;
      p3 = rix[-w2][3] - mu;
      p4 = rix[-w1][3] - mu;
      p5 = rix[  0][3] - mu;
      p6 = rix[ w1][3] - mu;
      p7 = rix[ w2][3] - mu;
      p8 = rix[ w3][3] - mu;
      p9 = rix[ w4][3] - mu;
      vx = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
      p1 = rix[-w4][1] - rix[-w4][3];
      p2 = rix[-w3][1] - rix[-w3][3];
      p3 = rix[-w2][1] - rix[-w2][3];
      p4 = rix[-w1][1] - rix[-w1][3];
      p5 = rix[  0][1] - rix[  0][3];
      p6 = rix[ w1][1] - rix[ w1][3];
      p7 = rix[ w2][1] - rix[ w2][3];
      p8 = rix[ w3][1] - rix[ w3][3];
      p9 = rix[ w4][1] - rix[ w4][3];
      vn = 1e-7+p1*p1+p2*p2+p3*p3+p4*p4+p5*p5+p6*p6+p7*p7+p8*p8+p9*p9;
      xv = (rix[0][1]*vx + rix[0][3]*vn)/(vx + vn);
      vv = vx*vn/(vx + vn);
      // interpolated G-R(B): blend of the two directional estimates,
      // each weighted by the other direction's variance
      rix[0][4] = (xh*vv + xv*vh)/(vh + vv); }
  // copy CFA values into their own colour slot; green at R/B sites is
  // CFA value + interpolated G-R(B)
  for (rr=0; rr < rr1; rr++)
    for (cc=0, row=rr-ba; cc < cc1; cc++) {
      col=cc-ba;
      rix = qix + rr*cc1 + cc;
      c = FC(rr,cc);
      if ((row >= 0) & (row < height) & (col >= 0) & (col < width))
	if (gamma_apply)
	  rix[0][c] = glut[image[row*width+col][c]];
	else
	  rix[0][c] = (double)image[row*width+col][c]/65535.0;
      else
	rix[0][c] = 0;
      if (c != 1) rix[0][1] = rix[0][c] + rix[0][4]; }
  // bilinear interpolation for R/B
  // interpolate R/B at G location
  for (rr=1; rr < rr1-1; rr++)
    for (cc=1+(FC(rr,2)&1), c=FC(rr,cc+1); cc < cc1-1; cc+=2) {
      rix = qix + rr*cc1 + cc;
      rix[0][c] = rix[0][1]
	+ 0.5*(rix[ -1][c] - rix[ -1][1] + rix[ 1][c] - rix[ 1][1]);
      c = 2 - c;
      rix[0][c] = rix[0][1]
	+ 0.5*(rix[-w1][c] - rix[-w1][1] + rix[w1][c] - rix[w1][1]);
      c = 2 - c; }
  // interpolate R/B at B/R location
  for (rr=1; rr < rr1-1; rr++)
    for (cc=1+(FC(rr,1)&1), c=2-FC(rr,cc); cc < cc1-1; cc+=2) {
      rix = qix + rr*cc1 + cc;
      rix[0][c] = rix[0][1]
	+ 0.25*(rix[-w1][c] - rix[-w1][1] + rix[ -1][c] - rix[ -1][1]+
		rix[  1][c] - rix[  1][1] + rix[ w1][c] - rix[ w1][1]); }
  // median filter
  for (pass=1; pass <= 3; pass++) {
    for (c=0; c < 3; c+=2) {
      // Compute median(R-G) and median(B-G)
      d = c + 3;
      for (ii=0; ii < rr1*cc1; ii++) qix[ii][d] = qix[ii][c] - qix[ii][1];
      // Apply 3x3 median filter (result goes to scratch slot [4])
      for (rr=1; rr < rr1-1; rr++)
	for (cc=1; cc < cc1-1; cc++) {
	  rix = qix + rr*cc1 + cc;
	  // Assign 3x3 differential color values
	  p1 = rix[-w1-1][d]; p2 = rix[-w1][d]; p3 = rix[-w1+1][d];
	  p4 = rix[   -1][d]; p5 = rix[  0][d]; p6 = rix[    1][d];
	  p7 = rix[ w1-1][d]; p8 = rix[ w1][d]; p9 = rix[ w1+1][d];
	  // Sort for median of 9 values
	  PIX_SORT(p2,p3); PIX_SORT(p5,p6); PIX_SORT(p8,p9);
	  PIX_SORT(p1,p2); PIX_SORT(p4,p5); PIX_SORT(p7,p8);
	  PIX_SORT(p2,p3); PIX_SORT(p5,p6); PIX_SORT(p8,p9);
	  PIX_SORT(p1,p4); PIX_SORT(p6,p9); PIX_SORT(p5,p8);
	  PIX_SORT(p4,p7); PIX_SORT(p2,p5); PIX_SORT(p3,p6);
	  PIX_SORT(p5,p8); PIX_SORT(p5,p3); PIX_SORT(p7,p5);
	  PIX_SORT(p5,p3);
	  rix[0][4] = p5; }
      for (ii=0; ii < rr1*cc1; ii++) qix[ii][d] = qix[ii][4]; }
    // red/blue at GREEN pixel locations
    for (rr=0; rr < rr1; rr++)
      for (cc=(FC(rr,1)&1), c=FC(rr,cc+1); cc < cc1; cc+=2) {
	rix = qix + rr*cc1 + cc;
	rix[0][0] = rix[0][1] + rix[0][3];
	rix[0][2] = rix[0][1] + rix[0][5]; }
    // red/blue and green at BLUE/RED pixel locations
    for (rr=0; rr < rr1; rr++)
      for (cc=(FC(rr,0)&1), c=2-FC(rr,cc), d=c+3; cc < cc1; cc+=2) {
	rix = qix + rr*cc1 + cc;
	rix[0][c] = rix[0][1] + rix[0][d];
	rix[0][1] = 0.5*(rix[0][0] - rix[0][3] + rix[0][2] - rix[0][5]); } }
  // copy result back to image matrix (only the channels that were not sampled)
  for (row=0; row < height; row++)
    for (col=0, rr=row+ba; col < width; col++) {
      cc = col+ba;
      pix = image + row*width + col;
      rix = qix + rr*cc1 + cc;
      c = FC(row,col);
      if (gamma_apply) {
	for (ii=0; ii < 3; ii++)
	  if (ii != c) {
	    // undo the gamma curve applied on input
	    v0 = rix[0][ii];
	    if (v0 <= 0.04045)
	      v0 /= 12.92;
	    else
	      v0 = pow((v0 + 0.055)/1.055,2.4);
	    pix[0][ii] = CLIP((int)(65535.0*v0 + 0.5)); } }
      else
	for (ii=0; ii < 3; ii++)
	  if (ii != c)
	    pix[0][ii] = CLIP((int)(65535.0*rix[0][ii] + 0.5));
    }
  // Done
  free(buffer);
  t2 = clock();
  dt = ((double)(t2-t1)) / CLOCKS_PER_SEC;
#ifdef DCRAW_VERBOSE
  if (verbose) fprintf(stderr,_("\telapsed time     = %5.3fs\n"),dt);
#endif
}