mlib_status
mlib_m_sconv3x3_16nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, s1, s2, v0, v1, aa, bb, rr, rh, rl;
	__m64 *sp0, *sp1, *sp2, *dp;
	__m64 zero, _rnd;
	mlib_s32 shift, kerh_sum;
	mlib_s32 i, j;

	width -= 2;
	height -= 2;
	width *= NCHAN;
	dl += dll + NCHAN;

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		dp = (__m64 *) dl;

		PREP_V();

		for (i = 0; i < width / 4; i++) {
			CONV_3x3();

			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_3x3();

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	return (MLIB_SUCCESS);
}
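The tail handling above (the width & 3 branch) relies on a standard MMX idiom: blend the freshly computed result into the destination only where a byte mask is set, leaving the remaining destination bytes untouched. A minimal, self-contained sketch of that select, with a hard-coded mask standing in for the mlib_mask64_arr lookup table:

#include <mmintrin.h>
#include <stdio.h>
#include <string.h>

/* dst = (mask & src) | (~mask & dst): copy only the bytes where mask is 0xFF */
static __m64 masked_store(__m64 dst, __m64 src, __m64 mask)
{
	return _mm_or_si64(_mm_and_si64(mask, src),
	    _mm_andnot_si64(mask, dst));
}

int main(void)
{
	/* keep the low 3 bytes of src, the high 5 bytes of dst (illustrative mask) */
	__m64 mask = _mm_set_pi8(0, 0, 0, 0, 0,
	    (char)0xFF, (char)0xFF, (char)0xFF);
	__m64 dst = _mm_set1_pi8(0x11);
	__m64 src = _mm_set1_pi8(0x77);
	__m64 r = masked_store(dst, src, mask);
	unsigned long long bits;

	memcpy(&bits, &r, sizeof (bits));
	_mm_empty();
	printf("%016llx\n", bits);	/* expected: 1111111111777777 */
	return (0);
}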
Example #2
void pix_compare :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  long datasize =   image.xsize * image.ysize * image.csize;
  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);
  __m64*leftPix =  (__m64*)image.data;
  __m64*rightPix = (__m64*)right.data;

  __m64 l, r, b;
  __m64 mask = _mm_setr_pi8((unsigned char)0x00,
			    (unsigned char)0xFF,
			    (unsigned char)0x00,
			    (unsigned char)0xFF,
			    (unsigned char)0x00,
			    (unsigned char)0xFF,
			    (unsigned char)0x00,
			    (unsigned char)0xFF);
  __m64 zeros = _mm_set1_pi8((unsigned char)0x00);
  //format is U Y V Y
  if (m_direction) {
    while(datasize--){
      l=leftPix[datasize];
      r=rightPix[datasize];
      b=_mm_subs_pu8(l, r);
      b=_mm_and_si64(b, mask);
      b=_mm_cmpeq_pi32(b, zeros);
      r=_mm_and_si64(r, b);
      l=_mm_andnot_si64(b, l);

      leftPix[datasize]=_mm_or_si64(l, r);
    }
  } else {
    while(datasize--){
      l=leftPix[datasize];
      r=rightPix[datasize];
      b=_mm_subs_pu8(r, l);
      b=_mm_and_si64(b, mask);
      b=_mm_cmpeq_pi32(b, zeros);
      r=_mm_and_si64(r, b);
      l=_mm_andnot_si64(b, l);

      leftPix[datasize]=_mm_or_si64(l, r);
    }
  }
  _mm_empty();
}
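processYUV_MMX builds its selection mask with an unsigned saturating subtract: _mm_subs_pu8(l, r) is zero exactly in the bytes where l <= r, so comparing that result against zero gives a compare mask even though MMX has no unsigned byte compare (the code above does the comparison at 32-bit granularity with _mm_cmpeq_pi32). A minimal per-byte sketch of the same idea, plus the branchless minimum it enables; the names are illustrative:

#include <mmintrin.h>

/* 0xFF in every byte where a <= b (unsigned), 0x00 elsewhere:
 * subs_pu8(a, b) saturates to zero exactly when a <= b */
static __m64 le_u8_mask(__m64 a, __m64 b)
{
	return _mm_cmpeq_pi8(_mm_subs_pu8(a, b), _mm_setzero_si64());
}

/* branchless per-byte minimum: where a <= b take a, otherwise take b */
static __m64 min_u8(__m64 a, __m64 b)
{
	__m64 le = le_u8_mask(a, b);

	return _mm_or_si64(_mm_and_si64(le, a), _mm_andnot_si64(le, b));
}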
Example #3
void pix_background :: processRGBAMMX(imageStruct &image)
{
  long i,pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
  }
  m_reset=0;

  i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

  __m64*data =(__m64*)image.data;
  __m64*saved=(__m64*)m_savedImage.data;

  const __m64 thresh=_mm_set_pi8(m_Yrange, m_Urange, m_Vrange, m_Arange,
				m_Yrange, m_Urange, m_Vrange, m_Arange);
  const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  __m64 newpix, oldpix, m1;

  while(i--){
    /* 7 ops, 3 memory ops */
    /* I have the feeling that this is not faster at all:
     * even with the 3 memory ops + ONLY one _mm_subs_pu8()
     * it is just as slow as the generic code;
     * adding the other instructions does not change much
     */
    newpix=*data;
    oldpix=*saved++;
    m1    = newpix;
    m1    = _mm_subs_pu8     (m1, oldpix);
    oldpix= _mm_subs_pu8     (oldpix, newpix);
    m1    = _mm_or_si64      (m1, oldpix); // |oldpix-newpix|
    m1    = _mm_adds_pu8     (m1, offset);
    m1    = _mm_subs_pu8     (m1, thresh);
    m1    = _mm_cmpeq_pi32   (m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh
    m1    = _mm_andnot_si64(m1, newpix);

    *data++ = m1; 
  }
  _mm_empty();
}
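The first three vector instructions in the loop above compute a per-byte absolute difference: one of the two saturating subtractions is zero and the other is |newpix - oldpix|, so OR-ing them yields the absolute difference without widening. A minimal sketch of that idiom together with the threshold test that follows it (the +1 offset keeps a threshold of 0 meaning "always changed", as the YUV variant further down notes):

#include <mmintrin.h>

/* per-byte |a - b|: subs_pu8(a, b) is zero where a <= b and subs_pu8(b, a)
 * is zero where b <= a, so their OR is the absolute difference */
static __m64 absdiff_u8(__m64 a, __m64 b)
{
	return _mm_or_si64(_mm_subs_pu8(a, b), _mm_subs_pu8(b, a));
}

/* nonzero bytes mark channels whose difference reached the threshold
 * (because of the saturating +1, a threshold byte of 255 can never be reached) */
static __m64 diff_at_least_u8(__m64 a, __m64 b, __m64 thresh)
{
	__m64 d = _mm_adds_pu8(absdiff_u8(a, b), _mm_set1_pi8(1));

	return _mm_subs_pu8(d, thresh);
}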
mlib_status
mlib_ImageMinFilter7x7_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* MAX_FILTER */
{
	mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl;
	__m64 *dp0, *dp1;
	__m64 aa, bb, cc, dd, ee, ff, r0, r1;
	__m64 g0, g1, g2, g3, g4, g5, g6, gg;
	__m64 h0, h1, h2, h3, h4, h5, h6, hh;
	__m64 e_mask;
	mlib_s32 i, j, wid8, tail;

	wid = (wid - KSIZE1) * SSIZE;
	wid8 = (wid + 7) & ~7;
	pbuff = mlib_malloc(KSIZE1 * wid8);
	buff0 = pbuff;
	buff1 = buff0 + wid8;
	buff2 = buff1 + wid8;
	buff3 = buff2 + wid8;
	buff4 = buff3 + wid8;
	buff5 = buff4 + wid8;

	sl = (mlib_u8 *)src;
	dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE);

	tail = wid & 7;
	e_mask = ((__m64 *) mlib_mask64_arr)[tail];

	for (j = 0; j < 3; j++) {
		sp0 = buff4;
		sp1 = buff5;
		sp6 = sl;
		sp7 = sl + slb;
		sl += 2 * slb;

		for (i = 0; i < wid; i += 8) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);
			h0 = *(__m64 *) sp7;
			h1 = *(__m64 *) (sp7 + SSIZE);
			h2 = *(__m64 *) (sp7 + 2 * SSIZE);
			h3 = *(__m64 *) (sp7 + 3 * SSIZE);
			h4 = *(__m64 *) (sp7 + 4 * SSIZE);
			h5 = *(__m64 *) (sp7 + 5 * SSIZE);
			h6 = *(__m64 *) (sp7 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			*(__m64 *) sp0 = gg;
			*(__m64 *) sp1 = hh;

			sp0 += 8;
			sp1 += 8;
			sp6 += 8;
			sp7 += 8;
		}

		if (j < 2) {
			buffT = buff0;
			buff0 = buff2;
			buff2 = buff4;
			buff4 = buffT;
			buffT = buff1;
			buff1 = buff3;
			buff3 = buff5;
			buff5 = buffT;
		}
	}

	for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = buff4;
		sp5 = buff5;
		sp6 = sl;
		sp7 = sl + slb;

/*
 *    line0:           aa
 *    line1:           bb
 *    line2:           cc
 *    line3:           dd
 *    line4:           ee
 *    line5:           ff
 *    line4:  g0 g1 g2 g3 g4 g5 g6
 *    line5:  h0 h1 h2 h3 h4 h5 h6
 */

		for (i = 0; i <= wid - 8; i += 8) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);
			h0 = *(__m64 *) sp7;
			h1 = *(__m64 *) (sp7 + SSIZE);
			h2 = *(__m64 *) (sp7 + 2 * SSIZE);
			h3 = *(__m64 *) (sp7 + 3 * SSIZE);
			h4 = *(__m64 *) (sp7 + 4 * SSIZE);
			h5 = *(__m64 *) (sp7 + 5 * SSIZE);
			h6 = *(__m64 *) (sp7 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, hh);

			*(__m64 *) sp0 = gg;
			*(__m64 *) sp1 = hh;
			(*dp0++) = r0;
			(*dp1++) = r1;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
			sp5 += 8;
			sp6 += 8;
			sp7 += 8;
		}

		if (tail) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);
			h0 = *(__m64 *) sp7;
			h1 = *(__m64 *) (sp7 + SSIZE);
			h2 = *(__m64 *) (sp7 + 2 * SSIZE);
			h3 = *(__m64 *) (sp7 + 3 * SSIZE);
			h4 = *(__m64 *) (sp7 + 4 * SSIZE);
			h5 = *(__m64 *) (sp7 + 5 * SSIZE);
			h6 = *(__m64 *) (sp7 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, hh);

			*(__m64 *) sp0 = gg;
			*(__m64 *) sp1 = hh;

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(e_mask, r1),
			    _mm_andnot_si64(e_mask, *dp1));
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buff4;
		buff4 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buff5;
		buff5 = buffT;

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line */

	if (j == (hgt - KSIZE1 - 1)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = buff4;
		sp5 = buff5;
		sp6 = sl;

		for (i = 0; i <= wid - 8; i += 8) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			g2 = C_COMP(g2, g3);
			g4 = C_COMP(g4, g5);
			gg = C_COMP(gg, g2);
			gg = C_COMP(gg, g4);
			gg = C_COMP(gg, g6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			(*dp0++) = r0;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
			sp5 += 8;
			sp6 += 8;
		}

		if (tail) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			g2 = C_COMP(g2, g3);
			g4 = C_COMP(g4, g5);
			gg = C_COMP(gg, g2);
			gg = C_COMP(gg, g4);
			gg = C_COMP(gg, g6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
		}
	}

	_mm_empty();

	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
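C_COMP is defined outside this excerpt; for the MIN_FILTER build on mlib_s16 data it has to reduce two registers to their element-wise signed 16-bit minimum. A plausible, purely hypothetical definition using the same compare-and-select idiom as the masked stores above (the MAX_FILTER build would swap the two select operands; note that this form evaluates both arguments twice):

/* hypothetical C_COMP for the signed 16-bit minimum filter */
#define C_COMP(a, b)                                                    \
	_mm_or_si64(_mm_and_si64(_mm_cmpgt_pi16((a), (b)), (b)),       \
	    _mm_andnot_si64(_mm_cmpgt_pi16((a), (b)), (a)))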
mlib_status
mlib_m_conv5x5_u16nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker[5][5];
	__m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
	__m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h,
	    tmph;
	__m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l,
	    tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift, ind, ker_sum = 0;
	mlib_s32 row, wid4, i, j;

	width -= 4;
	height -= 4;
	width *= NCHAN;
	dl += 2 * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * 2 * wid4;
	}

	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (__m64 *) sl;
		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < wid4; i++) {
			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < width / 4; i++) {
			CONV_5x5(hi, i);

			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_5x5(hi, i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example #6
mlib_status
mlib_m_sconv5x5_8nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[5];
    __m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, d0, d1, d2, prev0;
    __m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift, ind;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = 2 * ((width + 7) / 8);
    pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4);

    GET_KERN();

    for (i = 0; i < 5; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    for (j = 0; j < 4; j++) {
        buff4 = buff_arr[j];

        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];

            PREP_5x5(lo, i);
        }

        sl += sll;
        ind++;
    }

    buff0 = buff_arr[0];
    buff1 = buff_arr[1];
    buff2 = buff_arr[2];
    buff3 = buff_arr[3];
    buff4 = buff_arr[4];

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);

            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        if (width & 7) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] =
                _mm_or_si64(_mm_and_si64(mask, res_hi),
                            _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff0;
        buff0 = buff1;
        buff1 = buff2;
        buff2 = buff3;
        buff3 = buff4;
        buff4 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
Example #7
__m64 test54(__m64 a, __m64 b) {
  // CHECK: pandn
  return _mm_andnot_si64(a, b);
}
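One detail this test pins down is the operand order: pandn complements its first operand, so _mm_andnot_si64(a, b) computes (~a) & b, not a & ~b. A small self-contained check:

#include <mmintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void)
{
	__m64 a = _mm_set_pi32(0x00FF00FF, 0x00FF00FF);
	__m64 b = _mm_set_pi32(0x01234567, 0x89ABCDEF);
	__m64 r = _mm_andnot_si64(a, b);	/* (~a) & b */
	uint64_t bits;

	memcpy(&bits, &r, sizeof (bits));
	_mm_empty();
	assert(bits == (~0x00FF00FF00FF00FFULL & 0x0123456789ABCDEFULL));
	return (0);
}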
Example #8
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
   const unsigned char
      *u = src + ((y-1) & 7)*sc2lines_width,
      *m = src + ((y+0) & 7)*sc2lines_width,
      *l = src + ((y+1) & 7)*sc2lines_width;

   for (unsigned i = 0; i < nPix; i += 4) {

      if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i)) {

         __m64 mm = *(__m64*)(m+i-2);
         __m64 uu = *(__m64*)(u+i-2);
         __m64 ll = *(__m64*)(l+i-2);
         __m64 md = _mm_slli_si64(mm,8);
         __m64 mf = _mm_srli_si64(mm,8);
         __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md,mf), _mm_cmpeq_pi8(uu,ll));

         __m64 e0, e1, v1, v2;

         e0 = _mm_cmpeq_pi8(md,uu);
         e0 = _mm_andnot_si64(maskall, e0);
         e0 = _mm_srli_si64(e0,16);
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

         e1 = _mm_cmpeq_pi8(mf,uu);
         e1 = _mm_andnot_si64(maskall, e1);
         e1 = _mm_srli_si64(e1,16);
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

         e0 = _mm_or_si64(e0, e1);

         v1 = _m_from_int(*(unsigned*)(m+i));
         v2 = _m_from_int(*(unsigned*)(u+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         v2 = _mm_unpacklo_pi8(v2,v2);

         *(__m64*)(dst1 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

         e0 = _mm_cmpeq_pi8(md,ll);
         e0 = _mm_andnot_si64(maskall, e0);
         e0 = _mm_srli_si64(e0,16);
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

         e1 = _mm_cmpeq_pi8(mf,ll);
         e1 = _mm_andnot_si64(maskall, e1);
         e1 = _mm_srli_si64(e1,16);
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

         e0 = _mm_or_si64(e0, e1);

         v1 = _m_from_int(*(unsigned*)(m+i));
         v2 = _m_from_int(*(unsigned*)(l+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         v2 = _mm_unpacklo_pi8(v2,v2);

         *(__m64*)(dst2 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

      } else {

         __m64 v1 = _m_from_int(*(unsigned*)(m+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         *(__m64*)(dst1 + 2*i) = v1;
         *(__m64*)(dst2 + 2*i) = v1;

      }

   }
}
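lines_scale2 widens each source pixel into two destination pixels by unpacking a register with itself: _mm_unpacklo_pi8(v, v) duplicates each of the four low bytes. A minimal illustration of that doubling step:

#include <mmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	__m64 v = _m_from_int(0x44332211);	/* four pixels: 11 22 33 44 */
	__m64 d = _mm_unpacklo_pi8(v, v);	/* doubled: 11 11 22 22 33 33 44 44 */
	uint64_t bits;

	memcpy(&bits, &d, sizeof (bits));
	_mm_empty();
	printf("%016llx\n", bits);	/* expected: 4444333322221111 */
	return (0);
}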
Example #9
/* *********************************************************** */
mlib_status
mlib_m_sconv3x3_8nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buffT;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	width -= 2;
	height -= 2;
	dl += dll + 1;

	wid4 = (width + 7) / 4;

	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + wid4;

	for (j = 0; j < 2; j++) {
		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_3x3_1ch(lo, i);
		}

		sl += sll;

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < width / 8; i++) {
			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_hi, res_lo);
		}

		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_hi, res_lo);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
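UNPACK_SRC is defined elsewhere in this file; given the mlib_u8 source data and the 16-bit convolution arithmetic it feeds, it presumably zero-extends four packed bytes of s0 into 16-bit lanes, selected by the lo/hi argument. A hypothetical definition (the real macro may also pre-shift or reorder the data):

/* hypothetical: widen the lo or hi four bytes of s0 to 16-bit lanes */
#define UNPACK_SRC(dd, pos)						\
	dd = _mm_unpack##pos##_pi8(s0, zero)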
Example #10
mlib_status
mlib_ImageErode4_U16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* DILATE_FILTER */
{
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
	__m64 *dp0, *dp1;
	__m64 a1, b0, b1, b2, c0, c1, c2, d1, vv, h0, h1, r0, r1;
	__m64 mask, mask80;
	mlib_s32 i, j, tail;

	sl = (mlib_u8 *)src;
/* dst ptrs skip top j and left col */
	dl = (mlib_u8 *)dst + dlb + SSIZE;

	wid = (wid - 2) * SSIZE;
	tail = wid & 7;
	mask = ((__m64 *) mlib_mask64_arr)[tail];
	mask80 = mmx_from_int_dup(0x80008000);

	for (j = 0; j <= (hgt - 2 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = sl;
		sp1 = sp0 + slb;
		sp2 = sp1 + slb;
		sp3 = sp2 + slb;

/*
 *    line0:     a1
 *    line1:  b0 b1 b2
 *    line2:  c0 c1 c2
 *    line3:     d1
 */

		for (i = 0; i <= wid - 8; i += 8) {
			a1 = *(__m64 *) (sp0 + SSIZE);
			b0 = *(__m64 *) (sp1);
			b1 = *(__m64 *) (sp1 + SSIZE);
			b2 = *(__m64 *) (sp1 + 2 * SSIZE);
			c0 = *(__m64 *) (sp2);
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);
			d1 = *(__m64 *) (sp3 + SSIZE);

			vv = C_COMP(b1, c1);
			h0 = C_COMP(b0, b2);
			h1 = C_COMP(c0, c2);

			r0 = C_COMP(vv, a1);
			r1 = C_COMP(vv, d1);
			r0 = C_COMP(r0, h0);
			r1 = C_COMP(r1, h1);

			(*dp0++) = r0;
			(*dp1++) = r1;
			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
		}

		if (tail) {
			a1 = *(__m64 *) (sp0 + SSIZE);
			b0 = *(__m64 *) (sp1);
			b1 = *(__m64 *) (sp1 + SSIZE);
			b2 = *(__m64 *) (sp1 + 2 * SSIZE);
			c0 = *(__m64 *) (sp2);
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);
			d1 = *(__m64 *) (sp3 + SSIZE);

			vv = C_COMP(b1, c1);
			h0 = C_COMP(b0, b2);
			h1 = C_COMP(c0, c2);

			r0 = C_COMP(vv, a1);
			r1 = C_COMP(vv, d1);
			r0 = C_COMP(r0, h0);
			r1 = C_COMP(r1, h1);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(mask, r0),
			    _mm_andnot_si64(mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(mask, r1),
			    _mm_andnot_si64(mask, *dp1));
		}

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line */

	if (j == (hgt - 3)) {
		dp0 = (void *)dl;
		sp0 = sl;
		sp1 = sp0 + slb;
		sp2 = sp1 + slb;

		for (i = 0; i <= wid - 8; i += 8) {
			a1 = *(__m64 *) (sp0 + SSIZE);
			b0 = *(__m64 *) (sp1);
			b1 = *(__m64 *) (sp1 + SSIZE);
			b2 = *(__m64 *) (sp1 + 2 * SSIZE);
			c1 = *(__m64 *) (sp2 + SSIZE);

			vv = C_COMP(b1, c1);
			h0 = C_COMP(b0, b2);
			r0 = C_COMP(vv, a1);
			r0 = C_COMP(r0, h0);

			(*dp0++) = r0;
			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
		}

		if (tail) {
			a1 = *(__m64 *) (sp0 + SSIZE);
			b0 = *(__m64 *) (sp1);
			b1 = *(__m64 *) (sp1 + SSIZE);
			b2 = *(__m64 *) (sp1 + 2 * SSIZE);
			c1 = *(__m64 *) (sp2 + SSIZE);

			vv = C_COMP(b1, c1);
			h0 = C_COMP(b0, b2);
			r0 = C_COMP(vv, a1);
			r0 = C_COMP(r0, h0);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(mask, r0),
			    _mm_andnot_si64(mask, *dp0));
		}
	}

	_mm_empty();

	return (MLIB_SUCCESS);
}
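mask80 (0x8000 replicated into every 16-bit lane) is only used inside C_COMP, which is not shown here; it points at the usual trick for comparing unsigned 16-bit data with MMX, which only has a signed 16-bit compare: XOR both operands with 0x8000 so that signed order matches unsigned order. A hypothetical erode-style (minimum) comparator built that way:

#include <mmintrin.h>

/* hypothetical unsigned 16-bit minimum using the sign-flip trick */
static __m64 min_u16(__m64 a, __m64 b, __m64 mask80)
{
	__m64 as = _mm_xor_si64(a, mask80);
	__m64 bs = _mm_xor_si64(b, mask80);
	__m64 gt = _mm_cmpgt_pi16(as, bs);	/* 0xFFFF where a > b (unsigned) */

	return _mm_or_si64(_mm_and_si64(gt, b),		/* take b where a > b */
	    _mm_andnot_si64(gt, a));			/* keep a elsewhere   */
}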
mlib_status
mlib_ImageMinFilter5x5_U8(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* MAX_FILTER */
{
	mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buffT;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *dl;
	__m64 *dp0, *dp1;
	__m64 aa, bb, cc, dd, e0, e1, e2, e3, e4, ee, f0, f1, f2, f3, f4, ff,
	    r0, r1;
	__m64 e_mask, mask8080;
	mlib_s32 i, j, wid8, tail;

	wid = (wid - KSIZE1) * SSIZE;
	wid8 = (wid + 7) & ~7;
	pbuff = mlib_malloc(4 * wid8);
	buff0 = pbuff;
	buff1 = buff0 + wid8;
	buff2 = buff1 + wid8;
	buff3 = buff2 + wid8;

	sl = (mlib_u8 *)src;
	dl = (mlib_u8 *)dst + 2 * (dlb + SSIZE);

	tail = wid & 7;
	e_mask = ((__m64 *) mlib_mask64_arr)[tail];
	mask8080 = mmx_from_int_dup(0x80808080);

	for (j = 0; j < 2; j++) {
		sp0 = buff0;
		sp1 = buff1;
		sp4 = sl;
		sp5 = sl + slb;
		sl += 2 * slb;

		for (i = 0; i < wid; i += 8) {
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);
			f0 = *(__m64 *) sp5;
			f1 = *(__m64 *) (sp5 + SSIZE);
			f2 = *(__m64 *) (sp5 + 2 * SSIZE);
			f3 = *(__m64 *) (sp5 + 3 * SSIZE);
			f4 = *(__m64 *) (sp5 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			*(__m64 *) sp0 = ee;
			*(__m64 *) sp1 = ff;

			sp0 += 8;
			sp1 += 8;
			sp4 += 8;
			sp5 += 8;
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buffT;
	}

	for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = sl;
		sp5 = sl + slb;

/*
 *    line0:        aa
 *    line1:        bb
 *    line2:        cc
 *    line3:        dd
 *    line4:  e0 e1 e2 e3 e4
 *    line5:  f0 f1 f2 f3 f4
 */

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);
			f0 = *(__m64 *) sp5;
			f1 = *(__m64 *) (sp5 + SSIZE);
			f2 = *(__m64 *) (sp5 + 2 * SSIZE);
			f3 = *(__m64 *) (sp5 + 3 * SSIZE);
			f4 = *(__m64 *) (sp5 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, ff);

			*(__m64 *) sp0 = ee;
			*(__m64 *) sp1 = ff;
			(*dp0++) = r0;
			(*dp1++) = r1;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
			sp5 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);
			f0 = *(__m64 *) sp5;
			f1 = *(__m64 *) (sp5 + SSIZE);
			f2 = *(__m64 *) (sp5 + 2 * SSIZE);
			f3 = *(__m64 *) (sp5 + 3 * SSIZE);
			f4 = *(__m64 *) (sp5 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, ff);

			*(__m64 *) sp0 = ee;
			*(__m64 *) sp1 = ff;

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(e_mask, r1),
			    _mm_andnot_si64(e_mask, *dp1));
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buffT;

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line */

	if (j == (hgt - KSIZE1 - 1)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = sl;

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			e2 = C_COMP(e2, e3);
			ee = C_COMP(ee, e4);
			ee = C_COMP(ee, e2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			(*dp0++) = r0;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			e2 = C_COMP(e2, e3);
			ee = C_COMP(ee, e4);
			ee = C_COMP(ee, e2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
		}
	}

	_mm_empty();

	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example #12
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) {
   __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
   __m128i emm0, emm2, emm4;
#else
   __m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
   sign_bit_sin = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);

   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm4 = emm2;

   /* get the swap sign flag for the sine */
   emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

   /* get the polynomial selection mask for the sine */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm3 = _mm_movehl_ps(xmm3, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm3);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);

   mm4 = mm2;
   mm5 = mm3;

   /* get the swap sign flag for the sine */
   mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   __m128 swap_sign_bit_sin;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

   /* get the polynomial selection mask for the sine */

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 poly_mask;
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

   /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
   emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
   emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
   emm4 = _mm_slli_epi32(emm4, 29);
   __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
   /* get the sign flag for the cosine */
   mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
   mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
   mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
   mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
   mm4 = _mm_slli_pi32(mm4, 29);
   mm5 = _mm_slli_pi32(mm5, 29);
   __m128 sign_bit_cos;
   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
   _mm_empty(); /* good-bye mmx */
#endif

   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


   /* Evaluate the first polynomial (0 <= x <= Pi/4) */
   __m128 z = _mm_mul_ps(x,x);
   y = *(__m128*)_ps_coscof_p0;

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynomial (Pi/4 <= x <= 0) */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynomials */
   xmm3 = poly_mask;
   __m128 ysin2 = _mm_and_ps(xmm3, y2);
   __m128 ysin1 = _mm_andnot_ps(xmm3, y);
   y2 = _mm_sub_ps(y2,ysin2);
   y = _mm_sub_ps(y, ysin1);

   xmm1 = _mm_add_ps(ysin1,ysin2);
   xmm2 = _mm_add_ps(y,y2);

   /* update the sign */
   *s = _mm_xor_ps(xmm1, sign_bit_sin);
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
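A minimal usage sketch for sincos_ps, assuming v4sfu is the 16-byte-aligned four-float union from sse_mathfun.h (the stand-in type definition and the GCC alignment attribute below are assumptions, not part of this file):

#include <stdio.h>

/* assumed stand-in for the v4sfu union declared in sse_mathfun.h */
typedef union { float f[4]; } __attribute__((aligned(16))) v4sfu;

void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr);

int main(void)
{
   v4sfu x = { { 0.0f, 0.5f, 1.0f, 2.0f } }, s, c;

   sincos_ps(&x, &s, &c);	/* one call yields both sine and cosine */
   for (int i = 0; i < 4; i++)
      printf("sin(%g) = %g, cos(%g) = %g\n", x.f[i], s.f[i], x.f[i], c.f[i]);
   return 0;
}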
Example #13
/* almost the same as sin_ps */
__m128 cos_ps(v4sfu *xPtr) { // any x
   __m128 x=*((__m128 *)xPtr);
   __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
   __m128i emm0, emm2;
#else
   __m64 mm0, mm1, mm2, mm3;
#endif
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in mm0 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);

   /* get the swap sign flag */
   emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynomial selection mask */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

   __m128 sign_bit = _mm_castsi128_ps(emm0);
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm0:mm1 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);


   mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);

   /* get the swap sign flag in mm0:mm1 and the
   polynomial selection mask in mm2:mm3 */

   mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);

   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

   __m128 sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   _mm_empty(); /* good-bye mmx */
#endif
   /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

   /* Evaluate the first polynomial (0 <= x <= Pi/4) */
   y = *(__m128*)_ps_coscof_p0;
   __m128 z = _mm_mul_ps(x,x);

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynomial (Pi/4 <= x <= 0) */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynomials */
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2); //, xmm3);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y,y2);
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);

   return y;
}
Example #14
mlib_status
mlib_m_conv3x3_16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[6 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buff2, *buffT;
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker1, ker2, ker3, ker4, ker5, ker6, ker7, ker8, ker9;
	__m64 d0, d1, d2, rr, tmpa, tmpb;
	__m64 prev0h, prev1h, sum0h, sum1h, sum2h, tmph;
	__m64 prev0l, prev1l, sum0l, sum1l, sum2l, tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift;
	mlib_s32 row, wid4, i, j;

	width -= 2;
	height -= 2;
	width *= NCHAN;
	dl += dll + NCHAN;

	wid4 = (width + 3) / 4;

	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 6 * wid4);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + 2 * wid4;
	buff2 = buff1 + 2 * wid4;

	for (j = 0; j < 2; j++) {
		sp = (__m64 *) sl;

		d1 = (*sp++);
		d2 = (*sp++);
		for (i = 0; i < wid4; i++) {
			PREP_3x3(i);
		}

		sl += sll;

		if (j == 0) {
			buffT = buff1;
			buff1 = buff0;
			buff0 = buffT;
		}
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		d1 = (*sp++);
		d2 = (*sp++);
		for (i = 0; i < width / 4; i++) {
			CONV_3x3(i);
			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_3x3(i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example #15
__m64 test_mm_andnot_si64(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_andnot_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.pandn
  return _mm_andnot_si64(a, b);
}
Example #16
void pix_background :: processYUVMMX(imageStruct &image)
{
  long pixsize;

  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  
  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    // return;
  }
  m_reset=0;

  int i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

  __m64*data =(__m64*)image.data;
  __m64*saved=(__m64*)m_savedImage.data;

  const __m64 thresh=_mm_set_pi8(m_Urange, m_Yrange, m_Vrange, m_Yrange,
			  m_Urange, m_Yrange, m_Vrange, m_Yrange);
  const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  const __m64 black =_mm_set_pi8((unsigned char)0x00,
				 (unsigned char)0x80,
				 (unsigned char)0x00,
				 (unsigned char)0x80,
				 (unsigned char)0x00,
				 (unsigned char)0x80,
				 (unsigned char)0x00,
				 (unsigned char)0x80);

  __m64 newpix, oldpix, m1;

  while(i--){
    newpix=*data;
    oldpix=*saved++;
    m1    = newpix;
    m1    = _mm_subs_pu8     (m1, oldpix);
    oldpix= _mm_subs_pu8     (oldpix, newpix);
    m1    = _mm_or_si64      (m1, oldpix); // |oldpix-newpix|
    m1    = _mm_adds_pu8     (m1, offset); // to make thresh=0 work correctly
    m1    = _mm_subs_pu8     (m1, thresh);  // m1>thresh -> saturation -> 0
    m1    = _mm_cmpeq_pi32   (m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh

    oldpix= black;
    oldpix= _mm_and_si64     (oldpix, m1);

    m1    = _mm_andnot_si64  (m1, newpix);
    m1    = _mm_or_si64      (m1, oldpix);

    *data++ = m1; 
  }
  _mm_empty();
}
mlib_status
mlib_m_sconv5x5_u16nw_3(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, hker3, hker4;
	__m64 vker0, vker1, vker2, vker3, vker4;
	__m64 s0, s1, s2, s3, s4, v0, v1, v2, v3, rr, rh, rl;
	__m64 aa, bb, cc, zero, ker_off, mask8000;
	__m64 *sp0, *sp1, *sp2, *sp3, *sp4, *dp;
	mlib_s32 shift, ker_sum, kerh_sum = 0, kerv_sum = 0;
	mlib_s32 i, j;

	width -= 4;
	height -= 4;
	width *= NCHAN;
	dl += 2 * (dll + NCHAN);

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		sp3 = (__m64 *) (sl + 3 * sll);
		sp4 = (__m64 *) (sl + 4 * sll);
		dp = (__m64 *) dl;

		PREP_V();

		for (i = 0; i < width / 4; i++) {
			CONV_5x5();

			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_5x5();

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	return (MLIB_SUCCESS);
}
Example #18
mlib_status
mlib_m_conv5x5_8nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 ker[5][5];
	__m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc;
	__m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift, ind;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	width -= (KSIZE - 1);
	height -= (KSIZE - 1);
	width *= NCHAN;
	dl += ((KSIZE - 1) / 2) * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * wid4;
	}

	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d2, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d4, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		UNPACK_SRC(d2, hi);
		s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		UNPACK_SRC(d4, hi);

		for (i = 0; i < width / 8; i++) {
			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_lo, res_hi);
		}

		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_lo, res_hi);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example #19
mlib_status
mlib_m_sconv7x7_16nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, hker3, hker4, hker5, hker6;
	__m64 vker0, vker1, vker2, vker3, vker4, vker5, vker6;
	__m64 s0, s1, s2, s3, s4, s5, s6, v0, v1, v2, v3, v4, v5, v6, rr, rh,
	    rl;
	__m64 zero, _rnd;
	__m64 *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *dp;
	mlib_s32 shift, kerh_sum;
	mlib_s32 i, j;

	width -= KSIZE1;
	height -= KSIZE1;
	width *= NCHAN;
	dl += (KSIZE / 2) * (dll + NCHAN);

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		sp3 = (__m64 *) (sl + 3 * sll);
		sp4 = (__m64 *) (sl + 4 * sll);
		sp5 = (__m64 *) (sl + 5 * sll);
		sp6 = (__m64 *) (sl + 6 * sll);
		dp = (__m64 *) dl;

		PREP_V(v1);
		PREP_V(v2);
		PREP_V(v3);
		PREP_V(v4);
		PREP_V(v5);
		PREP_V(v6);

		for (i = 0; i < width / 4; i++) {
			CONV_7x7();

			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_7x7();

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	return (MLIB_SUCCESS);
}
mlib_status
mlib_ImageMinFilter3x3_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* MAX_FILTER */
{
	mlib_u8 *buff, *buff1;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
	__m64 *dp0, *dp1;
	__m64 aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1;
	__m64 e_mask;
	mlib_s32 i, j, wid8, tail;

	wid = (wid - 2) * SSIZE;
	wid8 = (wid + 7) & ~7;
	buff = mlib_malloc(2 * wid8);
	buff1 = buff + wid8;

	sl = (mlib_u8 *)src;
/* dst ptrs skip top j and left col */
	dl = (mlib_u8 *)dst + dlb + SSIZE;

	tail = wid & 7;
	e_mask = ((__m64 *) mlib_mask64_arr)[tail];

	sp0 = buff;
	sp1 = buff1;
	sp2 = sl;
	sp3 = sp2 + slb;
	sl += 2 * slb;

	for (i = 0; i < wid; i += 8) {
		c0 = *(__m64 *) sp2;
		c1 = *(__m64 *) (sp2 + SSIZE);
		c2 = *(__m64 *) (sp2 + 2 * SSIZE);
		d0 = *(__m64 *) sp3;
		d1 = *(__m64 *) (sp3 + SSIZE);
		d2 = *(__m64 *) (sp3 + 2 * SSIZE);

		cc = C_COMP(c0, c1);
		dd = C_COMP(d0, d1);
		cc = C_COMP(cc, c2);
		dd = C_COMP(dd, d2);

		*(__m64 *) sp0 = cc;
		*(__m64 *) sp1 = dd;

		sp0 += 8;
		sp1 += 8;
		sp2 += 8;
		sp3 += 8;
	}

	for (j = 0; j <= (hgt - 2 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff;
		sp1 = buff1;
		sp2 = sl;
		sp3 = sp2 + slb;

/*
 *    line0:     aa
 *    line1:     bb
 *    line2:  c0 c1 c2
 *    line3:  d0 d1 d2
 */

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);
			d0 = *(__m64 *) sp3;
			d1 = *(__m64 *) (sp3 + SSIZE);
			d2 = *(__m64 *) (sp3 + 2 * SSIZE);

			cc = C_COMP(c0, c1);
			dd = C_COMP(d0, d1);
			cc = C_COMP(cc, c2);
			dd = C_COMP(dd, d2);

			bb = C_COMP(bb, cc);
			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, dd);

			*(__m64 *) sp0 = cc;
			*(__m64 *) sp1 = dd;
			(*dp0++) = r0;
			(*dp1++) = r1;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);
			d0 = *(__m64 *) sp3;
			d1 = *(__m64 *) (sp3 + SSIZE);
			d2 = *(__m64 *) (sp3 + 2 * SSIZE);

			cc = C_COMP(c0, c1);
			dd = C_COMP(d0, d1);
			cc = C_COMP(cc, c2);
			dd = C_COMP(dd, d2);

			bb = C_COMP(bb, cc);
			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, dd);

			*(__m64 *) sp0 = cc;
			*(__m64 *) sp1 = dd;

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(e_mask, r1),
			    _mm_andnot_si64(e_mask, *dp1));
		}

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line */

	if (j == (hgt - 3)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff;
		sp1 = buff1;
		sp2 = sl;

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);

			cc = C_COMP(c0, c1);
			cc = C_COMP(cc, c2);

			r0 = C_COMP(aa, bb);
			r0 = C_COMP(r0, cc);

			(*dp0++) = r0;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);

			c1 = C_COMP(c0, c1);
			cc = C_COMP(c1, c2);

			r0 = C_COMP(aa, bb);
			r0 = C_COMP(r0, cc);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
		}
	}

	_mm_empty();

	mlib_free(buff);

	return (MLIB_SUCCESS);
}