Exemplo n.º 1
0
/*
 * Build the row-pointer table of an image and cache it in img->state.
 *
 * Returns the cached table when img->state is already set.  Otherwise a
 * table of (3 + height) pointers is allocated and laid out as follows:
 *
 *   rtable[0]                    = 0
 *   rtable[1]                    = address of rtable[1]
 *   rtable[2 .. 2 + height - 1]  = start of each image row
 *                                  (data + row * stride)
 *   rtable[2 + height]           = address of rtable[1]
 *
 * The value stored in img->state, and returned, is the address of
 * rtable[2], i.e. the first real row entry.
 *
 * Returns NULL when img is NULL, when the image has no data pointer, or
 * when the allocation fails.
 */
void *mlib_ImageCreateRowTable(mlib_image *img)
{
  mlib_u8  **rtable, *tline;
  mlib_s32 i, im_height, im_stride;

  if (img == NULL) return NULL;
  if (img -> state)  return img -> state;

  im_height = mlib_ImageGetHeight(img);
  im_stride = mlib_ImageGetStride(img);
  tline     = mlib_ImageGetData(img);

  /*
   * Reject an image without data BEFORE allocating: checking both
   * conditions after the allocation leaked the table whenever the
   * allocation succeeded but the data pointer was NULL.
   */
  if (tline == NULL) return NULL;

  rtable    = mlib_malloc((3 + im_height)*sizeof(mlib_u8 *));

  if (rtable == NULL) return NULL;

  rtable[0] = 0;
  rtable[1] = (mlib_u8*)((void **)rtable + 1);
  rtable[2 + im_height] = (mlib_u8*)((void **)rtable + 1);
  for (i = 0; i < im_height; i++) {
    rtable[i+2] = tline;
    tline    += im_stride;
  }

  img -> state = ((void **)rtable + 2);
  return img -> state;
}
Exemplo n.º 2
0
/*
 * Allocate an mlib_image header and fill it in with mlib_ImageSet().
 *
 * Only the header structure is allocated here; `data` is handed to
 * mlib_ImageSet() as given.
 *
 * Returns the new header, or NULL when `stride` is not positive, when
 * the header cannot be allocated, or when mlib_ImageSet() returns NULL
 * (in which case the header is released again).
 */
mlib_image *mlib_ImageCreateStruct(mlib_type  type,
                                   mlib_s32   channels,
                                   mlib_s32   width,
                                   mlib_s32   height,
                                   mlib_s32   stride,
                                   const void *data)
{
  mlib_image *hdr;

  /* A non-positive stride is refused before anything is allocated. */
  if (stride <= 0) return NULL;

  hdr = (mlib_image *)mlib_malloc(sizeof(mlib_image));
  if (hdr == NULL) return NULL;

  if (mlib_ImageSet(hdr, type, channels, width, height, stride, data) != NULL) {
    return hdr;
  }

  /* Initialisation was rejected: do not leak the header. */
  mlib_free(hdr);
  return NULL;
}
/*
 * 7x7 rank filter over an image, two output rows per pass, using MMX
 * 64-bit loads/stores.  Every tap is combined with C_COMP.
 * NOTE(review): C_COMP, KSIZE1 and SSIZE are defined outside this view;
 * presumably C_COMP is a packed 16-bit min (or max when MAX_FILTER is
 * set), KSIZE1 == 6 and SSIZE == sizeof(mlib_s16) -- confirm.
 *
 *   dst, src  - destination / source pixel data
 *   dlb, slb  - destination / source line strides in bytes
 *   wid, hgt  - image width and height
 *
 * Six scratch rows (buff0..buff5) keep the horizontally reduced values
 * of the six most recent source rows; each pass reduces two further
 * source rows and combines them vertically with the buffered ones.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the scratch buffer cannot
 * be allocated.
 */
mlib_status
mlib_ImageMinFilter7x7_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* MAX_FILTER */
{
	mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl;
	__m64 *dp0, *dp1;
	__m64 aa, bb, cc, dd, ee, ff, r0, r1;
	__m64 g0, g1, g2, g3, g4, g5, g6, gg;
	__m64 h0, h1, h2, h3, h4, h5, h6, hh;
	__m64 e_mask;
	mlib_s32 i, j, wid8, tail;

	/* wid becomes the byte length of one output row; wid8 rounds it */
	/* up to a whole number of 8-byte MMX words */
	wid = (wid - KSIZE1) * SSIZE;
	wid8 = (wid + 7) & ~7;
	pbuff = mlib_malloc(KSIZE1 * wid8);

	/* the buffer was previously used unchecked; fail cleanly instead */
	if (pbuff == NULL)
		return (MLIB_FAILURE);

	buff0 = pbuff;
	buff1 = buff0 + wid8;
	buff2 = buff1 + wid8;
	buff3 = buff2 + wid8;
	buff4 = buff3 + wid8;
	buff5 = buff4 + wid8;

	sl = (mlib_u8 *)src;
	/* first output pixel sits KSIZE1 / 2 rows and samples inside dst */
	dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE);

	/* bytes of the final, partial 8-byte word of a row (0 if none) */
	tail = wid & 7;
	e_mask = ((__m64 *) mlib_mask64_arr)[tail];

	/* prime the scratch rows: reduce the first six source rows */
	/* horizontally, two rows per iteration */
	for (j = 0; j < 3; j++) {
		sp0 = buff4;
		sp1 = buff5;
		sp6 = sl;
		sp7 = sl + slb;
		sl += 2 * slb;

		for (i = 0; i < wid; i += 8) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);
			h0 = *(__m64 *) sp7;
			h1 = *(__m64 *) (sp7 + SSIZE);
			h2 = *(__m64 *) (sp7 + 2 * SSIZE);
			h3 = *(__m64 *) (sp7 + 3 * SSIZE);
			h4 = *(__m64 *) (sp7 + 4 * SSIZE);
			h5 = *(__m64 *) (sp7 + 5 * SSIZE);
			h6 = *(__m64 *) (sp7 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			*(__m64 *) sp0 = gg;
			*(__m64 *) sp1 = hh;

			sp0 += 8;
			sp1 += 8;
			sp6 += 8;
			sp7 += 8;
		}

		/* rotate the row pairs, except after the last priming step */
		if (j < 2) {
			buffT = buff0;
			buff0 = buff2;
			buff2 = buff4;
			buff4 = buffT;
			buffT = buff1;
			buff1 = buff3;
			buff3 = buff5;
			buff5 = buffT;
		}
	}

	/* main loop: two destination rows per iteration */
	for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = buff4;
		sp5 = buff5;
		sp6 = sl;
		sp7 = sl + slb;

/*
 *    line0:           aa
 *    line1:           bb
 *    line2:           cc
 *    line3:           dd
 *    line4:           ee
 *    line5:           ff
 *    line6:  g0 g1 g2 g3 g4 g5 g6
 *    line7:  h0 h1 h2 h3 h4 h5 h6
 *
 *    r0 combines lines 0..6, r1 combines lines 1..7.
 */

		for (i = 0; i <= wid - 8; i += 8) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);
			h0 = *(__m64 *) sp7;
			h1 = *(__m64 *) (sp7 + SSIZE);
			h2 = *(__m64 *) (sp7 + 2 * SSIZE);
			h3 = *(__m64 *) (sp7 + 3 * SSIZE);
			h4 = *(__m64 *) (sp7 + 4 * SSIZE);
			h5 = *(__m64 *) (sp7 + 5 * SSIZE);
			h6 = *(__m64 *) (sp7 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			/* bb ends up as the reduction of lines 1..6 */
			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, hh);

			/* the two oldest rows are replaced by the two newest */
			*(__m64 *) sp0 = gg;
			*(__m64 *) sp1 = hh;
			(*dp0++) = r0;
			(*dp1++) = r1;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
			sp5 += 8;
			sp6 += 8;
			sp7 += 8;
		}

		/* partial last word: same computation, masked store */
		if (tail) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);
			h0 = *(__m64 *) sp7;
			h1 = *(__m64 *) (sp7 + SSIZE);
			h2 = *(__m64 *) (sp7 + 2 * SSIZE);
			h3 = *(__m64 *) (sp7 + 3 * SSIZE);
			h4 = *(__m64 *) (sp7 + 4 * SSIZE);
			h5 = *(__m64 *) (sp7 + 5 * SSIZE);
			h6 = *(__m64 *) (sp7 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, hh);

			*(__m64 *) sp0 = gg;
			*(__m64 *) sp1 = hh;

			/* keep destination bytes outside e_mask untouched */
			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(e_mask, r1),
			    _mm_andnot_si64(e_mask, *dp1));
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buff4;
		buff4 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buff5;
		buff5 = buffT;

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line: a single output row remains when the row count is odd */

	if (j == (hgt - KSIZE1 - 1)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = buff4;
		sp5 = buff5;
		sp6 = sl;

		for (i = 0; i <= wid - 8; i += 8) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			g2 = C_COMP(g2, g3);
			g4 = C_COMP(g4, g5);
			gg = C_COMP(gg, g2);
			gg = C_COMP(gg, g4);
			gg = C_COMP(gg, g6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			(*dp0++) = r0;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
			sp5 += 8;
			sp6 += 8;
		}

		if (tail) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			g2 = C_COMP(g2, g3);
			g4 = C_COMP(g4, g5);
			gg = C_COMP(gg, g2);
			gg = C_COMP(gg, g4);
			gg = C_COMP(gg, g6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
		}
	}

	/* leave MMX state before returning to code that may use the FPU */
	_mm_empty();

	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
/*
 * 5x5 convolution kernel using MMX intrinsics; only the interior of the
 * image is written (width and height are both reduced by 4 and the
 * destination pointer is advanced by 2 rows and 2 pixels).
 * NOTE(review): the name suggests unsigned 16-bit, 2-channel, "no write
 * to edges"; NCHAN, GET_SRC_DST_PARAMETERS, GET_KERN, PREP_5x5 and
 * CONV_5x5 are macros defined outside this view -- confirm.
 *
 *   dst, src      - destination / source images
 *   kern          - kernel coefficients (consumed by GET_KERN)
 *   scalef_expon  - kernel scaling exponent (consumed by GET_KERN)
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the scratch buffer cannot
 * be allocated.
 */
mlib_status
mlib_m_conv5x5_u16nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker[5][5];
	__m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
	__m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h,
	    tmph;
	__m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l,
	    tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift, ind, ker_sum = 0;
	mlib_s32 row, wid4, i, j;

	width -= 4;
	height -= 4;
	width *= NCHAN;
	dl += 2 * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);

	/* the buffer was previously used unchecked; fail cleanly instead */
	if (pbuff == NULL)
		return (MLIB_FAILURE);

	GET_KERN();

	/* ten partial-sum rows of 2 * wid4 words each */
	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * 2 * wid4;
	}

	/* prime the partial sums from the first four source rows; */
	/* ind takes the values 0, 1, 3, 6 */
	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (__m64 *) sl;
		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < wid4; i++) {
			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < width / 4; i++) {
			CONV_5x5(hi, i);

			dp[i] = rr;
		}

		/* partial last word: masked store keeps the other bytes */
		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_5x5(hi, i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		/* write the rotated row order into the other half of */
		/* buff_arr and switch pbuff_arr to that half */
		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Exemplo n.º 5
0
/*
 * 5x5 convolution with separate horizontal and vertical kernels, using
 * MMX intrinsics; only the interior of the image is written (width and
 * height are both reduced by 4 and the destination pointer is advanced
 * by 2 rows and 2 pixels).
 * NOTE(review): the name suggests 8-bit, 2-channel, "no write to
 * edges"; NCHAN, GET_SRC_DST_PARAMETERS, GET_KERN, UNPACK_SRC, PREP_5x5
 * and CONV_5x5 are macros defined outside this view -- confirm.
 *
 *   dst, src          - destination / source images
 *   hkernel, vkernel  - horizontal / vertical kernel coefficients
 *                       (consumed by GET_KERN)
 *   scalef_expon      - kernel scaling exponent (consumed by GET_KERN)
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the scratch buffer cannot
 * be allocated.
 */
mlib_status
mlib_m_sconv5x5_8nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[5];
    __m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, d0, d1, d2, prev0;
    __m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    /* ind is incremented in the priming loop below; it used to be */
    /* read there without ever having been initialised */
    mlib_s32 shift, ind = 0;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = 2 * ((width + 7) / 8);
    pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4);

    /* the buffer was previously used unchecked; fail cleanly instead */
    if (pbuff == NULL)
        return (MLIB_FAILURE);

    GET_KERN();

    /* five row buffers of wid4 words each */
    for (i = 0; i < 5; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    /* prime the first four row buffers from the first four source rows, */
    /* reading the source 32 bits at a time */
    for (j = 0; j < 4; j++) {
        buff4 = buff_arr[j];

        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];

            PREP_5x5(lo, i);
        }

        sl += sll;
        ind++;
    }

    buff0 = buff_arr[0];
    buff1 = buff_arr[1];
    buff2 = buff_arr[2];
    buff3 = buff_arr[3];
    buff4 = buff_arr[4];

    for (row = 0; row < height; row++) {
        /* these 64-bit pointers shadow the outer 32-bit sp */
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);

            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        /* partial last word: masked store keeps the other bytes */
        if (width & 7) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] =
                _mm_or_si64(_mm_and_si64(mask, res_hi),
                            _mm_andnot_si64(mask, dp[i]));
        }

        /* rotate the row buffers: the oldest becomes the newest */
        buffT = buff0;
        buff0 = buff1;
        buff1 = buff2;
        buff2 = buff3;
        buff3 = buff4;
        buff4 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
mlib_status mlib_AffineEdges(mlib_affine_param *param,
                             const mlib_image  *dst,
                             const mlib_image  *src,
                             void              *buff_lcl,
                             mlib_s32          buff_size,
                             mlib_s32          kw,
                             mlib_s32          kh,
                             mlib_s32          kw1,
                             mlib_s32          kh1,
                             mlib_edge         edge,
                             const mlib_d64    *mtx,
                             mlib_s32          shiftx,
                             mlib_s32          shifty)
{
  mlib_u8 *buff = buff_lcl;
  mlib_u8 **lineAddr = param->lineAddr;
  mlib_s32 srcWidth, dstWidth, srcHeight, dstHeight, srcYStride, dstYStride;
  mlib_s32 *leftEdges, *rightEdges, *xStarts, *yStarts, bsize0, bsize1 = 0;
  mlib_u8 *srcData, *dstData;
  mlib_u8 *paddings;
  void *warp_tbl = NULL;
  mlib_s32 yStart = 0, yFinish = -1, dX, dY;

  mlib_d64 xClip, yClip, wClip, hClip;
  mlib_d64 delta = 0.;
  mlib_d64 minX, minY, maxX, maxY;

  mlib_d64 coords[4][2];
  mlib_d64 a = mtx[0], b = mtx[1], tx = mtx[2], c = mtx[3], d = mtx[4], ty = mtx[5];
  mlib_d64 a2, b2, tx2, c2, d2, ty2;
  mlib_d64 dx, dy, div;
  mlib_s32 sdx, sdy;
  mlib_d64 dTop;
  mlib_d64 val0;
  mlib_s32 top, bot;
  mlib_s32 topIdx, max_xsize = 0;
  mlib_s32 i, j, t;

  srcData = mlib_ImageGetData(src);
  dstData = mlib_ImageGetData(dst);
  srcWidth = mlib_ImageGetWidth(src);
  srcHeight = mlib_ImageGetHeight(src);
  dstWidth = mlib_ImageGetWidth(dst);
  dstHeight = mlib_ImageGetHeight(dst);
  srcYStride = mlib_ImageGetStride(src);
  dstYStride = mlib_ImageGetStride(dst);
  paddings = mlib_ImageGetPaddings(src);

  if (srcWidth >= (1 << 15) || srcHeight >= (1 << 15)) {
    return MLIB_FAILURE;
  }

  div = a * d - b * c;

  if (div == 0.0) {
    return MLIB_FAILURE;
  }

  bsize0 = (dstHeight * sizeof(mlib_s32) + 7) & ~7;

  if (lineAddr == NULL) {
    bsize1 = ((srcHeight + 4 * kh) * sizeof(mlib_u8 *) + 7) & ~7;
  }

  param->buff_malloc = NULL;

  if ((4 * bsize0 + bsize1) > buff_size) {
    buff = param->buff_malloc = mlib_malloc(4 * bsize0 + bsize1);

    if (buff == NULL)
      return MLIB_FAILURE;
  }

  leftEdges = (mlib_s32 *) (buff);
  rightEdges = (mlib_s32 *) (buff += bsize0);
  xStarts = (mlib_s32 *) (buff += bsize0);
  yStarts = (mlib_s32 *) (buff += bsize0);

  if (lineAddr == NULL) {
    mlib_u8 *srcLinePtr = srcData;
    lineAddr = (mlib_u8 **) (buff += bsize0);
    for (i = 0; i < 2 * kh; i++)
      lineAddr[i] = srcLinePtr;
    lineAddr += 2 * kh;
    for (i = 0; i < srcHeight - 1; i++) {
      lineAddr[i] = srcLinePtr;
      srcLinePtr += srcYStride;
    }

    for (i = srcHeight - 1; i < srcHeight + 2 * kh; i++)
      lineAddr[i] = srcLinePtr;
  }

  if ((mlib_s32) edge < 0) {                               /* process edges */
    minX = 0;
    minY = 0;
    maxX = srcWidth;
    maxY = srcHeight;
  }
  else {

    if (kw > 1)
      delta = -0.5;                                        /* for MLIB_NEAREST filter delta = 0. */

    minX = (kw1 - delta);
    minY = (kh1 - delta);
    maxX = srcWidth - ((kw - 1) - (kw1 - delta));
    maxY = srcHeight - ((kh - 1) - (kh1 - delta));

    if (edge == MLIB_EDGE_SRC_PADDED) {
      if (minX < paddings[0])
        minX = paddings[0];

      if (minY < paddings[1])
        minY = paddings[1];

      if (maxX > (srcWidth - paddings[2]))
        maxX = srcWidth - paddings[2];

      if (maxY > (srcHeight - paddings[3]))
        maxY = srcHeight - paddings[3];
    }
  }

  xClip = minX;
  yClip = minY;
  wClip = maxX;
  hClip = maxY;

/*
 *   STORE_PARAM(param, src);
 *   STORE_PARAM(param, dst);
 */
  param->src = (void *)src;
  param->dst = (void *)dst;
  STORE_PARAM(param, lineAddr);
  STORE_PARAM(param, dstData);
  STORE_PARAM(param, srcYStride);
  STORE_PARAM(param, dstYStride);
  STORE_PARAM(param, leftEdges);
  STORE_PARAM(param, rightEdges);
  STORE_PARAM(param, xStarts);
  STORE_PARAM(param, yStarts);
  STORE_PARAM(param, max_xsize);
  STORE_PARAM(param, yStart);
  STORE_PARAM(param, yFinish);
  STORE_PARAM(param, warp_tbl);

  if ((xClip >= wClip) || (yClip >= hClip)) {
    return MLIB_SUCCESS;
  }

  a2 = d;
  b2 = -b;
  tx2 = (-d * tx + b * ty);
  c2 = -c;
  d2 = a;
  ty2 = (c * tx - a * ty);

  dx = a2;
  dy = c2;

  tx -= 0.5;
  ty -= 0.5;

  coords[0][0] = xClip * a + yClip * b + tx;
  coords[0][1] = xClip * c + yClip * d + ty;

  coords[2][0] = wClip * a + hClip * b + tx;
  coords[2][1] = wClip * c + hClip * d + ty;

  if (div > 0) {
    coords[1][0] = wClip * a + yClip * b + tx;
    coords[1][1] = wClip * c + yClip * d + ty;

    coords[3][0] = xClip * a + hClip * b + tx;
    coords[3][1] = xClip * c + hClip * d + ty;
  }
  else {
    coords[3][0] = wClip * a + yClip * b + tx;
    coords[3][1] = wClip * c + yClip * d + ty;

    coords[1][0] = xClip * a + hClip * b + tx;
    coords[1][1] = xClip * c + hClip * d + ty;
  }

  topIdx = 0;
  for (i = 1; i < 4; i++) {

    if (coords[i][1] < coords[topIdx][1])
      topIdx = i;
  }

  dTop = coords[topIdx][1];
  val0 = dTop;
  SAT32(top);
  bot = -1;

  if (top >= dstHeight) {
    return MLIB_SUCCESS;
  }

  if (dTop >= 0.0) {
    mlib_d64 xLeft, xRight, x;
    mlib_s32 nextIdx;

    if (dTop == top) {
      xLeft = coords[topIdx][0];
      xRight = coords[topIdx][0];
      nextIdx = (topIdx + 1) & 0x3;

      if (dTop == coords[nextIdx][1]) {
        x = coords[nextIdx][0];
        xLeft = (xLeft <= x) ? xLeft : x;
        xRight = (xRight >= x) ? xRight : x;
      }

      nextIdx = (topIdx - 1) & 0x3;

      if (dTop == coords[nextIdx][1]) {
        x = coords[nextIdx][0];
        xLeft = (xLeft <= x) ? xLeft : x;
        xRight = (xRight >= x) ? xRight : x;
      }

      val0 = xLeft;
      SAT32(t);
      leftEdges[top] = (t >= xLeft) ? t : ++t;

      if (xLeft >= MLIB_S32_MAX)
        leftEdges[top] = MLIB_S32_MAX;

      val0 = xRight;
      SAT32(rightEdges[top]);
    }
    else
      top++;
  }
  else
    top = 0;

  for (i = 0; i < 2; i++) {
    mlib_d64 dY1 = coords[(topIdx - i) & 0x3][1];
    mlib_d64 dX1 = coords[(topIdx - i) & 0x3][0];
    mlib_d64 dY2 = coords[(topIdx - i - 1) & 0x3][1];
    mlib_d64 dX2 = coords[(topIdx - i - 1) & 0x3][0];
    mlib_d64 x = dX1, slope = (dX2 - dX1) / (dY2 - dY1);
    mlib_s32 y1;
    mlib_s32 y2;

    if (dY1 == dY2)
      continue;

    if (dY1 < 0.0)
      y1 = 0;
    else {
      val0 = dY1 + 1;
      SAT32(y1);
    }

    val0 = dY2;
    SAT32(y2);

    if (y2 >= dstHeight)
      y2 = (mlib_s32) (dstHeight - 1);

    x += slope * (y1 - dY1);
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (j = y1; j <= y2; j++) {
      val0 = x;
      SAT32(t);
      leftEdges[j] = (t >= x) ? t : ++t;

      if (x >= MLIB_S32_MAX)
        leftEdges[j] = MLIB_S32_MAX;
      x += slope;
    }
  }

  for (i = 0; i < 2; i++) {
    mlib_d64 dY1 = coords[(topIdx + i) & 0x3][1];
    mlib_d64 dX1 = coords[(topIdx + i) & 0x3][0];
    mlib_d64 dY2 = coords[(topIdx + i + 1) & 0x3][1];
    mlib_d64 dX2 = coords[(topIdx + i + 1) & 0x3][0];
    mlib_d64 x = dX1, slope = (dX2 - dX1) / (dY2 - dY1);
    mlib_s32 y1;
    mlib_s32 y2;

    if (dY1 == dY2)
      continue;

    if (dY1 < 0.0)
      y1 = 0;
    else {
      val0 = dY1 + 1;
      SAT32(y1);
    }

    val0 = dY2;
    SAT32(y2);

    if (y2 >= dstHeight)
      y2 = (mlib_s32) (dstHeight - 1);

    x += slope * (y1 - dY1);
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (j = y1; j <= y2; j++) {
      val0 = x;
      SAT32(rightEdges[j]);
      x += slope;
    }

    bot = y2;
  }

  {
    mlib_d64 dxCl = xClip * div;
    mlib_d64 dyCl = yClip * div;
    mlib_d64 dwCl = wClip * div;
    mlib_d64 dhCl = hClip * div;

    mlib_s32 xCl = (mlib_s32) (xClip + delta);
    mlib_s32 yCl = (mlib_s32) (yClip + delta);
    mlib_s32 wCl = (mlib_s32) (wClip + delta);
    mlib_s32 hCl = (mlib_s32) (hClip + delta);

    /*
     * mlib_s32 xCl = (mlib_s32)(xClip + delta);
     * mlib_s32 yCl = (mlib_s32)(yClip + delta);
     * mlib_s32 wCl = (mlib_s32)(wClip);
     * mlib_s32 hCl = (mlib_s32)(hClip);
     */

    if (edge == MLIB_EDGE_SRC_PADDED) {
      xCl = kw1;
      yCl = kh1;
      wCl = (mlib_s32) (srcWidth - ((kw - 1) - kw1));
      hCl = (mlib_s32) (srcHeight - ((kh - 1) - kh1));
    }

    div = 1.0 / div;

    sdx = (mlib_s32) (a2 * div * (1 << shiftx));
    sdy = (mlib_s32) (c2 * div * (1 << shifty));

    if (div > 0) {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = top; i <= bot; i++) {
        mlib_s32 xLeft = leftEdges[i];
        mlib_s32 xRight = rightEdges[i];
        mlib_s32 xs, ys, x_e, y_e, x_s, y_s;
        mlib_d64 dxs, dys, dxe, dye;
        mlib_d64 xl, ii, xr;

        xLeft = (xLeft < 0) ? 0 : xLeft;
        xRight = (xRight >= dstWidth) ? (mlib_s32) (dstWidth - 1) : xRight;

        xl = xLeft + 0.5;
        ii = i + 0.5;
        xr = xRight + 0.5;
        dxs = xl * a2 + ii * b2 + tx2;
        dys = xl * c2 + ii * d2 + ty2;

        if ((dxs < dxCl) || (dxs >= dwCl) || (dys < dyCl) || (dys >= dhCl)) {
          dxs += dx;
          dys += dy;
          xLeft++;

          if ((dxs < dxCl) || (dxs >= dwCl) || (dys < dyCl) || (dys >= dhCl))
            xRight = -1;
        }

        dxe = xr * a2 + ii * b2 + tx2;
        dye = xr * c2 + ii * d2 + ty2;

        if ((dxe < dxCl) || (dxe >= dwCl) || (dye < dyCl) || (dye >= dhCl)) {
          dxe -= dx;
          dye -= dy;
          xRight--;

          if ((dxe < dxCl) || (dxe >= dwCl) || (dye < dyCl) || (dye >= dhCl))
            xRight = -1;
        }

        xs = (mlib_s32) ((dxs * div + delta) * (1 << shiftx));
        x_s = xs >> shiftx;

        ys = (mlib_s32) ((dys * div + delta) * (1 << shifty));
        y_s = ys >> shifty;

        if (x_s < xCl)
          xs = (xCl << shiftx);
        else if (x_s >= wCl)
          xs = ((wCl << shiftx) - 1);

        if (y_s < yCl)
          ys = (yCl << shifty);
        else if (y_s >= hCl)
          ys = ((hCl << shifty) - 1);

        if (xRight >= xLeft) {
          x_e = ((xRight - xLeft) * sdx + xs) >> shiftx;
          y_e = ((xRight - xLeft) * sdy + ys) >> shifty;

          if ((x_e < xCl) || (x_e >= wCl)) {
            if (sdx > 0)
              sdx -= 1;
            else
              sdx += 1;
          }

          if ((y_e < yCl) || (y_e >= hCl)) {
            if (sdy > 0)
              sdy -= 1;
            else
              sdy += 1;
          }
        }

        leftEdges[i] = xLeft;
        rightEdges[i] = xRight;
        xStarts[i] = xs;
        yStarts[i] = ys;

        if ((xRight - xLeft + 1) > max_xsize)
          max_xsize = (xRight - xLeft + 1);
      }
    }
    else {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = top; i <= bot; i++) {
Exemplo n.º 7
0
/*
 * 5x5 convolution kernel using MMX intrinsics; only the interior of the
 * image is written (width and height are both reduced by KSIZE - 1 and
 * the destination pointer is advanced by (KSIZE - 1) / 2 rows and
 * pixels).
 * NOTE(review): the name suggests 8-bit, 4-channel, "no write to
 * edges"; KSIZE, NCHAN, GET_SRC_DST_PARAMETERS, GET_KERN, UNPACK_SRC,
 * PREP_5x5 and CONV_5x5 are macros defined outside this view -- confirm.
 *
 *   dst, src      - destination / source images
 *   kern          - kernel coefficients (consumed by GET_KERN)
 *   scalef_expon  - kernel scaling exponent (consumed by GET_KERN)
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the scratch buffer cannot
 * be allocated.
 */
mlib_status
mlib_m_conv5x5_8nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 ker[5][5];
	__m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc;
	__m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift, ind;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	width -= (KSIZE - 1);
	height -= (KSIZE - 1);
	width *= NCHAN;
	dl += ((KSIZE - 1) / 2) * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4);

	/* the buffer was previously used unchecked; fail cleanly instead */
	if (pbuff == NULL)
		return (MLIB_FAILURE);

	GET_KERN();

	/* ten partial-sum rows of wid4 words each */
	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * wid4;
	}

	/* prime the partial sums from the first four source rows, */
	/* reading the source 32 bits at a time; ind takes 0, 1, 3, 6 */
	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d2, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d4, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		/* these 64-bit pointers shadow the outer 32-bit sp */
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		UNPACK_SRC(d2, hi);
		s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		UNPACK_SRC(d4, hi);

		for (i = 0; i < width / 8; i++) {
			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_lo, res_hi);
		}

		/* partial last word: masked store keeps the other bytes */
		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_lo, res_hi);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		/* write the rotated row order into the other half of */
		/* buff_arr and switch pbuff_arr to that half */
		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Exemplo n.º 8
0
mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
                           const mlib_image *src,
                           const mlib_d64   *ker,
                           mlib_s32         m,
                           mlib_s32         n,
                           mlib_s32         dm,
                           mlib_s32         dn,
                           mlib_s32         cmask)
{
  DTYPE k0, k1, k2, k3, k4, k5, k6, *sp;
  DTYPE p0, p1, p2, p3, p4, p5, p6, p7;
  mlib_s32 l, off, kw;
  DEF_VARS(DTYPE);
  mlib_s32 chan2 = chan1 + chan1;
  mlib_s32 chan3 = chan1 + chan2;

#ifdef TYPE_DOUBLE
  const mlib_d64 *k = ker;
#else
  mlib_f32 k_arr[MAX_NM], *k = k_arr;

  if (n*m > MAX_NM) {
    k = mlib_malloc(n*m*sizeof(mlib_f32));

    if (k == NULL) return MLIB_FAILURE;
  }

  for (i = 0; i < n*m; i++) k[i] = (mlib_f32)ker[i];
#endif /* TYPE_DOUBLE */

  if (m == 1) return mlib_ImageConv1xN(dst, src, k, n, dn, cmask);

  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*chan1;

  for (c = 0; c < chan1; c++) {
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    for (j = 0; j < hgt; j++) {
      const DTYPE *pk = k;

      for (l = 0; l < n; l++) {
        DTYPE *sp0 = sl + l*sll;

        for (off = 0; off < m; off += kw, pk += kw, sp0 += chan1) {
          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER; else
            if (kw > MAX_KER) kw = kw/2;

          p2 = sp0[0]; p3 = sp0[chan1]; p4 = sp0[chan2];
          sp0 += chan3;
          p5 = sp0[0]; p6 = sp0[chan1]; p7 = sp0[chan2];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];

          dp = dl;

          if (kw == 7) {
            sp = sp0 += chan3;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {
            sp = sp0 += chan2;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = sp[0]; p6 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = sp[0]; p6 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {
            sp = sp0 += chan1;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = sp[0]; p5 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = sp[0]; p5 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            sp = sp0;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = sp[0]; p4 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2 + p3*k3;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = sp[0]; p4 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {
            sp = sp0 -= chan1;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = sp[0]; p3 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = sp[0]; p3 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2;

                sp += chan2;
                dp += chan2;
              }
            }

          } else { /* kw == 2 */
            sp = sp0 -= chan2;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = sp[0]; p2 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1;
                dp[chan1] = p1*k0 + p2*k1;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = sp[0]; p2 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1;
                dp[chan1] += p1*k0 + p2*k1;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /* last pixels */

      if (wid & 1) {
        DTYPE *sp0 = sl + i*chan1, s = 0;
        const DTYPE *pk = k;
        mlib_s32 x;

        for (l = 0; l < n; l++) {
          DTYPE *sp = sp0 + l*sll;

          for (x = 0; x < m; x++) s += sp[x*chan1] * (*pk++);
        }

        dp[0] = s;
      }

      /* next line */
      sl += sll;
      dl += dll;
    }
  }

#ifndef TYPE_DOUBLE

  if (k != k_arr) mlib_free(k);
#endif /* TYPE_DOUBLE */

  return MLIB_SUCCESS;
}
Exemplo n.º 9
0
/*
 * Integer MxN convolution of `src` into `dst` for one pixel type (DTYPE).
 *
 * Parameters:
 *   dst, src  destination / source images
 *   kernel    m*n integer coefficients, row by row; each one is shifted
 *             right by shift1 (16, or 8 when IMG_TYPE == 1) before use
 *   m, n      kernel width and height
 *   dm, dn    column / row offset of the first written pixel in dst
 *   scale     scaling exponent; shift2 = scale - shift1 is computed here
 *             (presumably consumed by the STORE_RES macro - confirm)
 *   cmask     channel mask; channel c is processed only when bit
 *             (nchannel - 1 - c) is set
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when a scratch buffer cannot be
 * allocated.
 *
 * Method: every kernel row is cut into chunks of at most 7 taps (kw).
 * Each chunk is applied to a whole output row, two pixels per iteration,
 * accumulating into buffd[].  The very last chunk of the last kernel row
 * adds the accumulated sums, stores the result through STORE_RES and
 * clears buffd[] for the next output row.  An odd trailing pixel is
 * computed directly in the "last pixels" loop.
 *
 * NOTE(review): wid, hgt, sll, dll, adr_src, adr_dst and nchannel are not
 * assigned in the visible code; they are presumably set up by
 * GET_SRC_DST_PARAMETERS - confirm against the macro definition.
 */
mlib_status CONV_FUNC_I(MxN)(mlib_image       *dst,
                             const mlib_image *src,
                             const mlib_s32   *kernel,
                             mlib_s32         m,
                             mlib_s32         n,
                             mlib_s32         dm,
                             mlib_s32         dn,
                             mlib_s32         scale,
                             mlib_s32         cmask)
{
  /*
   * Every chunk loads pk[0]..pk[6] regardless of kw, so the last chunk of
   * the kernel reads up to 6 coefficients past element m*n - 1.  The
   * coefficient storage is padded by that amount (and zero filled) so the
   * loads stay inside the array.
   */
  enum { KER_PAD = 6 };
  mlib_s32 buff[BUFF_SIZE], *buffd = buff;
  mlib_s32 l, off, kw;
  mlib_s32 d0, d1, shift1, shift2;
  mlib_s32 k0, k1, k2, k3, k4, k5, k6;
  mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
  DTYPE    *adr_src, *sl, *sp = NULL;
  DTYPE    *adr_dst, *dl, *dp = NULL;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel, chan1;
  mlib_s32 i, j, c;
  mlib_s32 chan2;
  mlib_s32 k_locl[MAX_N*MAX_N + KER_PAD], *k = k_locl;
  GET_SRC_DST_PARAMETERS(DTYPE);

#if IMG_TYPE != 1
  shift1 = 16;
#else
  shift1 = 8;
#endif /* IMG_TYPE != 1 */
  shift2 = scale - shift1;

  /* element distance between horizontally adjacent pixels of one channel */
  chan1 = nchannel;
  chan2 = chan1 + chan1;

  /* size of the region that is actually written */
  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*nchannel;

  /* row accumulator: on the stack when it fits, otherwise on the heap */
  if (wid > BUFF_SIZE) {
    buffd = mlib_malloc(sizeof(mlib_s32)*wid);

    if (buffd == NULL) return MLIB_FAILURE;
  }

  /* pre-shifted kernel copy: same stack/heap policy, plus the padding */
  if (m*n > MAX_N*MAX_N) {
    k = mlib_malloc(sizeof(mlib_s32)*(m*n + KER_PAD));

    if (k == NULL) {
      if (buffd != buff) mlib_free(buffd);
      return MLIB_FAILURE;
    }
  }

  for (i = 0; i < m*n; i++) {
    k[i] = kernel[i] >> shift1;
  }

  /* padding coefficients are loaded but never take part in a product */
  for (i = m*n; i < m*n + KER_PAD; i++) {
    k[i] = 0;
  }

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0;

    for (j = 0; j < hgt; j++) {
      mlib_s32 *pk = k;

      for (l = 0; l < n; l++) {
        DTYPE *sp0 = sl + l*sll;

        for (off = 0; off < m;) {
          sp = sp0 + off*chan1;
          dp = dl;

          /* chunk width: at most MAX_KER taps, remainder split in two */
          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER; else
            if (kw > MAX_KER) kw = kw/2;
          off += kw;

          /*
           * Prime the sliding window.
           * NOTE(review): all six pixels are loaded even when kw < 7; the
           * surplus values are unused, but the loads reach up to 5 pixels
           * past the chunk - confirm the source row is readable there.
           */
          p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
          p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
          pk += kw;

          sp += (kw - 1)*chan1;

          /*
           * In every branch below: the first loop only accumulates into
           * buffd[] (more chunks follow); the second loop is used for the
           * final chunk and also stores the result and clears buffd[].
           */
          if (kw == 7) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = sp[0];
                p7 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = sp[0];
                p7 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = sp[0];
                p6 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = sp[0];
                p6 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = sp[0];
                p5 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = sp[0];
                p5 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = sp[0];
                p4 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = sp[0];
                p4 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = sp[0];
                p3 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = sp[0];
                p3 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 2) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = sp[0];
                p2 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1;
                buffd[i + 1] += p1*k0 + p2*k1;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = sp[0];
                p2 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else /*if (kw == 1)*/ {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = sp[0];
                p1 = sp[chan1];

                buffd[i    ] += p0*k0;
                buffd[i + 1] += p1*k0;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = sp[0];
                p1 = sp[chan1];

                d0 = (p0*k0 + buffd[i    ]);
                d1 = (p1*k0 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /*
       * last pixels: i and dp carry over from the paired loops above, so
       * this runs only for the odd trailing pixel and computes its full
       * m x n sum directly from the source.
       */
      for (; i < wid; i++) {
        mlib_s32 *pk = k, s = 0;
        mlib_s32 x;

        for (l = 0; l < n; l++) {
          sp = sl + l*sll + i*chan1;

          for (x = 0; x < m; x++) {
            s += sp[0] * pk[0];
            sp += chan1;
            pk ++;
          }
        }

        STORE_RES(dp[0], s);

        sp += chan1;
        dp += chan1;
      }

      /* next line */
      sl += sll;
      dl += dll;
    }
  }

  if (buffd != buff) mlib_free(buffd);
  if (k != k_locl) mlib_free(k);

  return MLIB_SUCCESS;
}
/*
 * Edge processing for an affine transform; only the color-indexed
 * (colormap != NULL) path is visible in this excerpt.
 *
 * Parameters:
 *   param     affine parameters (not referenced directly in the visible
 *             code; presumably consumed by GET_EDGE_PARAMS - confirm)
 *   param_e   edge parameters; max_xsize is read from it here
 *   colormap  lookup table handle, or NULL for non-indexed images
 *
 * Returns (visible path): MLIB_SUCCESS, or MLIB_FAILURE when the scratch
 * buffer cannot be allocated.
 *
 * NOTE(review): `channels`, `type` and `srcStride` are used below without a
 * visible declaration; they are presumably introduced by GET_EDGE_PARAMS().
 * NOTE(review): this excerpt stops right after the colormap branch - the
 * rest of the function body and its closing brace are not present here.
 */
mlib_status mlib_ImageAffineEdgeExtend_BL(mlib_affine_param *param,
                                          mlib_affine_param *param_e,
                                          const void        *colormap)
{
  GET_EDGE_PARAMS();
  mlib_d64 scale = 1.0 / (mlib_d64) MLIB_PREC;
  mlib_s32 xDelta, yDelta, xFlag, yFlag;
  mlib_d64 t, u, pix0;
  mlib_d64 a00, a01, a10, a11;

  /* color-indexed image: work through the lookup table */
  if (colormap != NULL) {
    mlib_s32 max_xsize = param_e->max_xsize;
    mlib_type ltype = mlib_ImageGetLutType(colormap);
    mlib_d64 *plut = (mlib_d64 *) mlib_ImageGetLutDoubleData(colormap);
    void *buff;

    channels = mlib_ImageGetLutChannels(colormap);
    /* move the table base back by the LUT offset (in entries of `channels` doubles) */
    plut -= channels * mlib_ImageGetLutOffset(colormap);

    /* nothing to process; checked before allocating so nothing has to be freed */
    if (max_xsize == 0) {
      return MLIB_SUCCESS;
    }

    /* scratch buffer: one byte per value for a MLIB_BYTE table, one mlib_s16 otherwise */
    if (ltype == MLIB_BYTE) {
      buff = mlib_malloc(channels * max_xsize);
    }
    else {
      buff = mlib_malloc(channels * max_xsize * sizeof(mlib_s16));
    }

    if (buff == NULL)
      return MLIB_FAILURE;

    /*
     * Dispatch on table type (outer) and image type (inner).  Combinations
     * not listed fall through without processing.
     */
    switch (ltype) {
      case MLIB_BYTE:
        switch (type) {
          case MLIB_BYTE:
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_u8i, mlib_u8);
            break;

          case MLIB_SHORT:
            /* halve the stride - presumably bytes -> 16-bit elements; confirm */
            srcStride >>= 1;
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_u8i, mlib_s16);
            break;
        }

        break;

      case MLIB_SHORT:
        switch (type) {
          case MLIB_BYTE:
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_s16i, mlib_u8);
            break;

          case MLIB_SHORT:
            /* halve the stride - presumably bytes -> 16-bit elements; confirm */
            srcStride >>= 1;
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_s16i, mlib_s16);
            break;
        }

        break;
    }

    mlib_free(buff);

    return MLIB_SUCCESS;
  }
Exemplo n.º 11
0
/*
 * 5x5 rank filter on 8-bit data using MMX (__m64) operations.  The
 * per-element operation is whatever C_COMP expands to (presumably a
 * minimum for this name, and a maximum for the MAX_FILTER variant whose
 * prototype is not visible here - confirm).
 *
 * Parameters:
 *   dst, src  destination / source pixel data
 *   dlb, slb  destination / source line stride in bytes
 *   wid, hgt  source width and height
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the row buffers cannot be
 * allocated.
 *
 * Method: each source row is first reduced horizontally over 5 neighbours
 * (offsets 0..4*SSIZE).  The reduced rows of the four previous lines are
 * kept in buff0..buff3; every main-loop iteration reduces two new rows
 * (ee, ff) and produces two output rows:
 *     r0 = C(aa, bb, cc, dd, ee)      r1 = C(bb, cc, dd, ee, ff)
 * then overwrites the two oldest buffers with ee/ff and rotates the
 * buffer pointers.
 */
mlib_status
mlib_ImageMinFilter5x5_U8(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* MAX_FILTER */
{
	mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buffT;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *dl;
	__m64 *dp0, *dp1;
	__m64 aa, bb, cc, dd, e0, e1, e2, e3, e4, ee, f0, f1, f2, f3, f4, ff,
	    r0, r1;
	__m64 e_mask, mask8080;
	mlib_s32 i, j, wid8, tail;

/* output row length in bytes, and the same rounded up to whole 8-byte words */
	wid = (wid - KSIZE1) * SSIZE;
	wid8 = (wid + 7) & ~7;
	pbuff = mlib_malloc(4 * wid8);

/* bail out before any buffer or MMX register is touched */
	if (pbuff == NULL)
		return (MLIB_FAILURE);

	buff0 = pbuff;
	buff1 = buff0 + wid8;
	buff2 = buff1 + wid8;
	buff3 = buff2 + wid8;

	sl = (mlib_u8 *)src;
/* first output pixel sits 2 rows down and 2 pixels in (kernel centre) */
	dl = (mlib_u8 *)dst + 2 * (dlb + SSIZE);

/* bytes in the final partial word, and the matching store mask */
	tail = wid & 7;
	e_mask = ((__m64 *) mlib_mask64_arr)[tail];
/* not referenced directly below; presumably consumed by C_COMP - confirm */
	mask8080 = mmx_from_int_dup(0x80808080);

/* prime the buffers with the horizontal reductions of source rows 0..3 */
	for (j = 0; j < 2; j++) {
		sp0 = buff0;
		sp1 = buff1;
		sp4 = sl;
		sp5 = sl + slb;
		sl += 2 * slb;

		for (i = 0; i < wid; i += 8) {
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);
			f0 = *(__m64 *) sp5;
			f1 = *(__m64 *) (sp5 + SSIZE);
			f2 = *(__m64 *) (sp5 + 2 * SSIZE);
			f3 = *(__m64 *) (sp5 + 3 * SSIZE);
			f4 = *(__m64 *) (sp5 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			*(__m64 *) sp0 = ee;
			*(__m64 *) sp1 = ff;

			sp0 += 8;
			sp1 += 8;
			sp4 += 8;
			sp5 += 8;
		}

/* rotate: (buff0, buff1) <-> (buff2, buff3) */
		buffT = buff0;
		buff0 = buff2;
		buff2 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buffT;
	}

/* main loop: two output rows per iteration */
	for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = sl;
		sp5 = sl + slb;

/*
 *    line0:        aa
 *    line1:        bb
 *    line2:        cc
 *    line3:        dd
 *    line4:  e0 e1 e2 e3 e4
 *    line5:  f0 f1 f2 f3 f4
 */

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);
			f0 = *(__m64 *) sp5;
			f1 = *(__m64 *) (sp5 + SSIZE);
			f2 = *(__m64 *) (sp5 + 2 * SSIZE);
			f3 = *(__m64 *) (sp5 + 3 * SSIZE);
			f4 = *(__m64 *) (sp5 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

/* bb becomes C(bb, cc, dd, ee), the part shared by both output rows */
			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, ff);

/* the two oldest rows are no longer needed: replace them with ee / ff */
			*(__m64 *) sp0 = ee;
			*(__m64 *) sp1 = ff;
			(*dp0++) = r0;
			(*dp1++) = r1;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
			sp5 += 8;
		}

/* partial last word: same computation, masked store into dst */
		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);
			f0 = *(__m64 *) sp5;
			f1 = *(__m64 *) (sp5 + SSIZE);
			f2 = *(__m64 *) (sp5 + 2 * SSIZE);
			f3 = *(__m64 *) (sp5 + 3 * SSIZE);
			f4 = *(__m64 *) (sp5 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, ff);

			*(__m64 *) sp0 = ee;
			*(__m64 *) sp1 = ff;

/* keep the destination bytes outside e_mask unchanged */
			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(e_mask, r1),
			    _mm_andnot_si64(e_mask, *dp1));
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buffT;

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line: one output row is left when the row count is odd */

	if (j == (hgt - KSIZE1 - 1)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = sl;

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			e2 = C_COMP(e2, e3);
			ee = C_COMP(ee, e4);
			ee = C_COMP(ee, e2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			(*dp0++) = r0;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			e2 = C_COMP(e2, e3);
			ee = C_COMP(ee, e4);
			ee = C_COMP(ee, e2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
		}
	}

/* leave MMX state before returning to code that may use the x87 FPU */
	_mm_empty();

	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Exemplo n.º 12
0
mlib_status mlib_convMxN_8nw_mask(mlib_image       *dst,
                                  const mlib_image *src,
                                  mlib_s32         m,
                                  mlib_s32         n,
                                  mlib_s32         dm,
                                  mlib_s32         dn,
                                  const mlib_s32   *kern,
                                  mlib_s32         scale,
                                  mlib_s32         cmask)
{
  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
  mlib_d64 dd, d0, d1;
  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
  mlib_u8 *sl, *sp, *dl;
  mlib_s32 hgt = mlib_ImageGetHeight(src);
  mlib_s32 wid = mlib_ImageGetWidth(src);
  mlib_s32 sll = mlib_ImageGetStride(src);
  mlib_s32 dll = mlib_ImageGetStride(dst);
  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
  mlib_s32 ssize, xsize, dsize, esize, buff_ind;
  mlib_d64 *pbuff, *dp;
  mlib_f32 *karr = (mlib_f32 *) kern;
  mlib_s32 gsr_scale = (31 - scale) << 3;
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
  mlib_s32 i, j, l, chan, testchan;
  mlib_s32 nchan = mlib_ImageGetChannels(dst);
  void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
  void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);

  if (n > MAX_N) {
    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));

    if (buffs == NULL)
      return MLIB_FAILURE;
  }

  buff = buffs + 2 * (n + 1);

  adr_dst += dn * dll + dm * nchan;

  ssize = wid;
  dsize = (ssize + 7) / 8;
  esize = dsize + 4;
  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));

  if (pbuff == NULL) {
    if (buffs != buffs_local)
      mlib_free(buffs);
    return MLIB_FAILURE;
  }

  for (i = 0; i < (n + 1); i++)
    buffs[i] = pbuff + i * esize;
  for (i = 0; i < (n + 1); i++)
    buffs[(n + 1) + i] = buffs[i];
  buffd = buffs[n] + esize;
  buffe = buffd + 2 * esize;

  hgt -= (n - 1);
  xsize = ssize - (m - 1);

  vis_write_gsr(gsr_scale + 7);

  if (nchan == 2) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1;
  }
  else if (nchan == 3) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1;
  }
  else {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1;
  }

  testchan = 1;
  for (chan = 0; chan < nchan; chan++) {
    buff_ind = 0;
    sl = adr_src;
    dl = adr_dst;

    if ((cmask & testchan) == 0) {
      testchan <<= 1;
      continue;
    }

    for (l = 0; l < n; l++) {
      mlib_d64 *buffn = buffs[l];
      sp = sl + l * sll;

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);
    }

    /* init buffer */
#pragma pipeloop(0)
    for (i = 0; i < (xsize + 7) / 8; i++) {
      buffd[2 * i] = drnd;
      buffd[2 * i + 1] = drnd;
    }

    for (j = 0; j < hgt; j++) {
      mlib_d64 **buffc = buffs + buff_ind;
      mlib_f32 *pk = karr, k0, k1, k2, k3;
      sp = sl + n * sll;

      for (l = 0; l < n; l++) {
        buff[l] = buffc[l];
      }

      buffn = buffc[n];

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);

      ik_last = (m - 1);

      for (jk = 0; jk < n; jk += jk_size) {
        jk_size = n - jk;

        if (jk_size >= 6)
          jk_size = 4;

        if (jk_size == 5)
          jk_size = 3;

        coff = 0;

        if (jk_size == 1) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];

            doff = coff / 8;
            buff0 = buff[jk] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s01 = buff0[i + 1];
              s0 = vis_faligndata(s00, s01);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d1 = vis_fpadd16(d01, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += m;
        }
        else if (jk_size == 2) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
            s11 = buff1[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s10 = s11;
              s01 = buff0[i + 1];
              s11 = buff1[i + 1];
              s0 = vis_faligndata(s00, s01);
              s1 = vis_faligndata(s10, s11);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d0 = vis_fpadd16(d10, d0);
              d1 = vis_fpadd16(d01, d1);
              d1 = vis_fpadd16(d11, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += 2 * m;
        }
        else if (jk_size == 3) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 3 * m;
        }
        else {                              /* jk_size == 4 */

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];
            k3 = pk[ik + 3 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;
            buff3 = buff[jk + 3] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {

#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];
                s3 = buff3[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
                d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
                d31 = vis_fmul8x16au(vis_read_hi(s31), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);
                s3 = vis_faligndata(s30, s31);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 4 * m;
        }
      }

      /*****************************************
       *****************************************
       **          Final iteration            **
       *****************************************
       *****************************************/

      jk_size = n;

      if (jk_size >= 6)
        jk_size = 4;

      if (jk_size == 5)
        jk_size = 3;

      k0 = karr[ik_last];
      k1 = karr[ik_last + m];
      k2 = karr[ik_last + 2 * m];
      k3 = karr[ik_last + 3 * m];

      off = ik_last;
      doff = off / 8;
      off &= 7;
      buff0 = buff[0] + doff;
      buff1 = buff[1] + doff;
      buff2 = buff[2] + doff;
      buff3 = buff[3] + doff;
      vis_write_gsr(gsr_scale + off);

      if (jk_size == 1) {
        dp = buffe;

        s01 = buff0[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s01 = buff0[i + 1];
          s0 = vis_faligndata(s00, s01);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d1 = vis_fpadd16(d1, d01);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 2) {
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 3) {

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else {                                /* if (jk_size == 4) */

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
        s31 = buff3[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s30 = s31;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s31 = buff3[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);
          s3 = vis_faligndata(s30, s31);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
          d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
          d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d0 = vis_fpadd16(d0, d30);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);
          d1 = vis_fpadd16(d1, d31);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }

      (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan);

      sl += sll;
      dl += dll;

      buff_ind++;

      if (buff_ind >= (n + 1))
        buff_ind = 0;
    }

    testchan <<= 1;
  }

  mlib_free(pbuff);

  if (buffs != buffs_local)
    mlib_free(buffs);

  return MLIB_SUCCESS;
}
Exemplo n.º 13
0
/*
 * Build the 16-bit coefficient tables stored in `table` (the dataH_s16_*
 * slot chosen by `nchan`, plus dataV_s16_1) from the double-precision
 * interpolation kernels of the table.
 *
 *   table  interpolation table to initialise; NULL yields MLIB_FAILURE.
 *   nchan  channel count, 1..4; any other value yields MLIB_FAILURE.
 *
 * Returns MLIB_SUCCESS when the requested tables already exist or were
 * built, MLIB_FAILURE on bad arguments, allocation failure, or when the
 * coefficients cannot be scaled into range (shift_vis_affine is then set
 * to -1 so later calls fail immediately).
 *
 * NOTE(review): this listing is truncated; the function body continues
 * past the last line shown here.
 */
mlib_status
mlib_m_ImageInitInterpTableAffine_S16(
    mlib_interp_table * table,
    mlib_s32 nchan)
{
	mlib_s32 width, height, width_bits, height_bits, vis_width_bits,
	    vis_height_bits;
	mlib_s32 subsampleBitsH, subsampleBitsV;
	mlib_s32 i, j, c, scale, num_copy, num_copy_old;
	mlib_s32 isum;
	mlib_s32 max_scale, min_scale, scaleh, scalev;
	mlib_s32 norm_scale_v, norm_scale_h;
	mlib_d64 dscale, *dataH, *dataV;
	mlib_d64 **ptr_tablex, *tablex, *tablex_old, *tabley;
	mlib_d64 max, d;
	mlib_d64 sumh, sumv, normh, normv;

	if (!table)
		return (MLIB_FAILURE);
/* a negative shift marks a table that an earlier call failed to scale */
	if (table->shift_vis_affine < 0)
		return (MLIB_FAILURE);

/*
 * Pick the horizontal-table slot and the number of times each coefficient
 * is replicated in it.
 * NOTE(review): nchan == 2 replicates 2 times but is stored in dataH_s16_3,
 * while the rebuild path below assumes that slot holds 3 copies - confirm.
 */
	if (nchan == 1) {
		num_copy = 1;
		ptr_tablex = &(table->dataH_s16_1);
	} else if (nchan == 2) {
		num_copy = 2;
		ptr_tablex = &(table->dataH_s16_3);
	} else if (nchan == 3 || nchan == 4) {
		num_copy = 4;
		ptr_tablex = &(table->dataH_s16_4);
	} else
		return (MLIB_FAILURE);

/* both the requested X table and the Y table already exist */
	if (*ptr_tablex != NULL && table->dataV_s16_1 != NULL)
		return (MLIB_SUCCESS);

	dataH = mlib_ImageGetInterpDoubleDataH(table);
	dataV = mlib_ImageGetInterpDoubleDataV(table);
	if (!dataH || !dataV)
		return (MLIB_FAILURE);

	width = mlib_ImageGetInterpWidth(table);
	height = mlib_ImageGetInterpHeight(table);
	width_bits = mlib_ImageGetInterpWidthBits(table);
	height_bits = mlib_ImageGetInterpHeightBits(table);
	vis_width_bits = table->vis_width_bits;
	vis_height_bits = table->vis_height_bits;
	subsampleBitsH = mlib_ImageGetInterpSubsampleBitsH(table);
	subsampleBitsV = mlib_ImageGetInterpSubsampleBitsV(table);

/*
 * The Y table exists, so some X table was built earlier for another
 * channel count: derive the requested X table from it by re-replicating
 * the first copy of every coefficient instead of rescaling from doubles.
 */
	if (table->dataV_s16_1 != NULL) {
		if (table->dataH_s16_1 != NULL) {
			tablex_old = table->dataH_s16_1;
			num_copy_old = 1;
		} else if (table->dataH_s16_3 != NULL) {
			tablex_old = table->dataH_s16_3;
			num_copy_old = 3;
		} else {
			tablex_old = table->dataH_s16_4;
			num_copy_old = 4;
		}

		tablex =
		    mlib_malloc(num_copy * (1 << subsampleBitsH) *
		    (1 << vis_width_bits) * sizeof (mlib_s16));
		if (tablex == NULL)
			return (MLIB_FAILURE);

/* j walks the kernel taps (width rounded up to even), i the subsample phases */
		for (j = 0; j < ((width + 1) & ~1); j++) {
			mlib_s16 *tbl = (mlib_s16 *)tablex + j * num_copy;
			mlib_s16 *tbl_old =
			    (mlib_s16 *)tablex_old + j * num_copy_old;
			for (i = 0; i < (1 << subsampleBitsH); i++) {
				mlib_s16 v =
				    tbl_old[num_copy_old *
				    (i << vis_width_bits)];
				for (c = 0; c < num_copy; c++) {
					tbl[num_copy * (i << vis_width_bits) +
					    c] = v;
				}
			}
		}
		*ptr_tablex = tablex;
		return (MLIB_SUCCESS);
	}

/*
 * sumv: largest sum of absolute vertical coefficients over all subsample
 * phases; max: largest absolute vertical coefficient.
 */
	sumv = 0;
	max = 0;

	for (i = 0; i < (1 << subsampleBitsV); i++) {
		mlib_d64 s = 0;
		mlib_s32 ind = (i << height_bits);

		for (j = 0; j < height; j++) {
			d = mlib_fabs(dataV[j + ind]);
			s += d;
			max = (max > d) ? max : d;
		}
		sumv = (sumv > s) ? sumv : s;
	}

/* all vertical kernels are zero */
	if (sumv == 0) {
		dscale = 0;

/* X table */

		tablex =
		    mlib_malloc(num_copy * (1 << subsampleBitsH) *
		    (1 << vis_width_bits) * sizeof (mlib_s16));
		if (tablex == NULL)
			return (MLIB_FAILURE);

		INIT_TABLE_16(tablex, (1 << subsampleBitsH), width, width_bits,
		    vis_width_bits, dataH);

/*
 * NOTE(review): when dataH == dataV and num_copy == 4 this branch neither
 * stores tablex/tabley nor returns, so control reaches the division by
 * sumv (== 0) below and tablex is never published - looks unintended,
 * confirm against the upstream source.
 */
		if ((dataH == dataV) && num_copy == 4)
			tabley = tablex;
		else {

/* the Y table is always built with 4 copies per coefficient */
			num_copy = 4;

			tabley =
			    mlib_malloc(num_copy * (1 << subsampleBitsV) *
			    (1 << vis_height_bits) * sizeof (mlib_s16));
			if (tabley == NULL) {
				mlib_free(tablex);
				return (MLIB_FAILURE);
			}

			INIT_TABLE_16(tabley, (1 << subsampleBitsV), height,
			    height_bits, vis_height_bits, dataV);

			*ptr_tablex = tablex;
			table->dataV_s16_1 = tabley;

/* Store shift */
			table->shift_vis_affine = 43;

			return (MLIB_SUCCESS);
		}
	}

/* binary exponents of the normalised vertical sum and largest coefficient */
	normv = 32767.0 / (32768.0 * sumv);
	scalev = mlib_ilogb(sumv * normv);
	isum = mlib_ilogb(max * normv);

/* all elements must be in the range -32768, 32767 */
	if (scalev == isum)
		norm_scale_v = 14;
/* but sumv may be in the range -65576, 65575 */
	else
		norm_scale_v = 15;

/* accepted range for the combined shift computed below */
	min_scale = 25;
	max_scale = 40;

	normh = 32768.0 * sumv / 32767;

/* same statistics for the horizontal kernels, reused when H and V share data */
	if (dataH != dataV) {
		sumh = 0;
		max = 0;

		for (i = 0; i < (1 << subsampleBitsH); i++) {
			mlib_d64 s = 0;
			mlib_s32 ind = (i << width_bits);

			for (j = 0; j < width; j++) {
				d = mlib_fabs(dataH[j + ind]);
				s += d;
				max = (max > d) ? max : d;
			}
			sumh = (sumh > s) ? sumh : s;
		}
	} else
		sumh = sumv;

	isum = mlib_ilogb(max * normh);
	scaleh = mlib_ilogb(sumh * normh);

/* all elements must be in the range -32768, 32767 */
	if (scaleh == isum)
		norm_scale_h = 14;
/* but sumh may be in the range -65576, 65575 */
	else
		norm_scale_h = 15;

	scale = norm_scale_v + norm_scale_h - (scaleh + scalev);

	if (scale < min_scale) {
		table->shift_vis_affine = -1;
/* coefficients are too large */
		return (MLIB_FAILURE);
	}

/* too much headroom: split the excess between the H and V exponents */
	if (scale > max_scale) {
		scaleh += (scale - max_scale + 1) >> 1;
		scalev += (scale - max_scale) >> 1;
		scale = max_scale;
	}
Exemplo n.º 14
0
/*
 * 3x3 convolution of a 16-bit image using MMX (__m64) arithmetic.
 *
 *   dst           destination image
 *   src           source image
 *   kern          3x3 integer kernel (consumed by GET_KERN)
 *   scalef_expon  kernel scaling exponent (consumed by GET_KERN)
 *
 * Only the interior (width - 2) x (height - 2) region of dst is written;
 * writing starts one row down and one pixel in from the dst origin.
 * NCHAN, BUFF_LINE and the GET_SRC_DST_PARAMETERS / GET_KERN / PREP_3x3 /
 * CONV_3x3 macros are defined elsewhere in the file; they are presumed to
 * use the otherwise unreferenced locals declared below.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the line buffers cannot be
 * allocated.
 */
mlib_status
mlib_m_conv3x3_16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[6 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buff2, *buffT;
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker1, ker2, ker3, ker4, ker5, ker6, ker7, ker8, ker9;
	__m64 d0, d1, d2, rr, tmpa, tmpb;
	__m64 prev0h, prev1h, sum0h, sum1h, sum2h, tmph;
	__m64 prev0l, prev1l, sum0l, sum1l, sum2l, tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift;
	mlib_s32 row, wid4, i, j;

/* size of the interior region, expressed in 16-bit samples per row */
	width -= 2;
	height -= 2;
	width *= NCHAN;
/* first written sample: one row down, one pixel to the right */
	dl += dll + NCHAN;

/* number of 4-sample (64-bit) groups per row, rounded up */
	wid4 = (width + 3) / 4;

/* three line buffers of 2 * wid4 entries each; use the heap if too wide */
	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 6 * wid4);

/* no MMX state has been touched yet, so a plain return is safe here */
		if (pbuff == NULL)
			return (MLIB_FAILURE);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + 2 * wid4;
	buff2 = buff1 + 2 * wid4;

/* prime the line buffers with the first two source rows */
	for (j = 0; j < 2; j++) {
		sp = (__m64 *) sl;

		d1 = (*sp++);
		d2 = (*sp++);
		for (i = 0; i < wid4; i++) {
			PREP_3x3(i);
		}

		sl += sll;

/* after the first row, swap so the second row lands in the other buffer */
		if (j == 0) {
			buffT = buff1;
			buff1 = buff0;
			buff0 = buffT;
		}
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		d1 = (*sp++);
		d2 = (*sp++);
/* full 4-sample groups: CONV_3x3 leaves the result in rr */
		for (i = 0; i < width / 4; i++) {
			CONV_3x3(i);
			dp[i] = rr;
		}

/* partial last group: merge only the valid samples into dst */
		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_3x3(i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

/* rotate the two previous-row buffers for the next output row */
		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

/* leave MMX state so that x87 floating point can be used again */
	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Exemplo n.º 15
0
/*
 * Vertical (1 x n) convolution of src into dst.
 *
 *   dst, src  destination / source images
 *   k         n kernel coefficients, already scaled by the caller
 *   n         kernel height
 *   dn        row offset applied to the destination start address
 *   cmask     channel mask; bit (nchannel - 1 - c) selects channel c
 *
 * The image is processed in stripes of at most max_hsize rows.  For every
 * selected channel and every column, groups of four taps are accumulated
 * into pbuff; the last group (1..4 taps) adds its contribution, converts
 * with D2I / FROM_S32 and stores to dst, resetting the accumulator.
 *
 * FTYPE, DTYPE, BUFF_SIZE, CACHE_SIZE, DEF_VARS, GET_SRC_DST_PARAMETERS,
 * D2I and FROM_S32 are defined elsewhere in the file; pbuff is presumed to
 * be declared by DEF_VARS and to start out pointing at buff.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the accumulator buffer cannot
 * be allocated.
 *
 * NOTE(review): the tap/row prefetches (pk[1..3], sp[sll], sp[2*sll]) are
 * unconditional, so callers must provide readable storage after the last
 * coefficient; confirm the same holds for very short images.
 */
static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
                                     const mlib_image *src,
                                     const mlib_d64   *k,
                                     mlib_s32         n,
                                     mlib_s32         dn,
                                     mlib_s32         cmask)
{
  FTYPE    buff[BUFF_SIZE];
  mlib_s32 off, kh;
  mlib_s32 d0, d1;
  const FTYPE    *pk;
  FTYPE    k0, k1, k2, k3;
  FTYPE    p0, p1, p2, p3, p4;
  DEF_VARS(DTYPE);
  DTYPE    *sl_c, *dl_c, *sl0;
  mlib_s32 l, hsize, max_hsize;
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* number of output rows, and first destination row */
  hgt -= (n - 1);
  adr_dst += dn*dll;

  /* stripe height chosen from CACHE_SIZE and the source line stride */
  max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;

  if (!max_hsize) max_hsize = 1;

  /*
   * The accumulation loop below handles two rows per step and stores
   * pbuff[j + 1] even when j is the last row of an odd-height stripe, so
   * the accumulator needs hsize + 1 slots.  Reserve the extra slot, and
   * do not continue with a NULL buffer if the allocation fails.
   */
  if (max_hsize + 1 > BUFF_SIZE) {
    pbuff = mlib_malloc(sizeof(FTYPE)*(max_hsize + 1));

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  chan1 = nchannel;

  sl_c = adr_src;
  dl_c = adr_dst;

  for (l = 0; l < hgt; l += hsize) {
    hsize = hgt - l;

    if (hsize > max_hsize) hsize = max_hsize;

    for (c = 0; c < nchannel; c++) {
      if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

      sl = sl_c + c;
      dl = dl_c + c;

      /* clear the accumulators, including the spill slot pbuff[hsize] */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (j = 0; j <= hsize; j++) pbuff[j] = 0.0;

      for (i = 0; i < wid; i++) {
        sl0 = sl;

        /* all complete groups of four taps except the final group */
        for (off = 0; off < (n - 4); off += 4) {
          pk = k + off;
          sp = sl0;

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
          sp += 3*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j < hsize; j += 2) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = sp[0];
            p4 = sp[sll];

            pbuff[j    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
            pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;

            sp += 2*sll;
          }

          sl0 += 4*sll;
        }

        /* final group of kh = 1..4 taps: finish the sums and store */
        pk = k + off;
        sp = sl0;

        k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
        p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];

        dp = dl;
        kh = n - off;

        if (kh == 4) {
          sp += 3*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j <= (hsize - 2); j += 2) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = sp[0];
            p4 = sp[sll];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            pbuff[j] = 0;
            pbuff[j + 1] = 0;

            sp += 2*sll;
            dp += 2*dll;
          }

          /* odd stripe height: one remaining row */
          if (j < hsize) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = sp[0];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);

            pbuff[j] = 0;

            dp[0] = FROM_S32(d0);
          }

        } else if (kh == 3) {
          sp += 2*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j <= (hsize - 2); j += 2) {
            p0 = p2; p1 = p3;
            p2 = sp[0];
            p3 = sp[sll];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            pbuff[j] = 0;
            pbuff[j + 1] = 0;

            sp += 2*sll;
            dp += 2*dll;
          }

          if (j < hsize) {
            p0 = p2; p1 = p3;
            p2 = sp[0];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);

            pbuff[j] = 0;

            dp[0] = FROM_S32(d0);
          }

        } else if (kh == 2) {
          sp += sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j <= (hsize - 2); j += 2) {
            p0 = p2;
            p1 = sp[0];
            p2 = sp[sll];

            d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
            d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            pbuff[j] = 0;
            pbuff[j + 1] = 0;

            sp += 2*sll;
            dp += 2*dll;
          }

          if (j < hsize) {
            p0 = p2;
            p1 = sp[0];

            d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);

            pbuff[j] = 0;

            dp[0] = FROM_S32(d0);
          }

        } else /* if (kh == 1) */ {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j < hsize; j++) {
            p0 = sp[0];

            d0 = D2I(p0*k0 + pbuff[j]);

            dp[0] = FROM_S32(d0);

            pbuff[j] = 0;

            sp += sll;
            dp += dll;
          }
        }

        /* next column of the same channel */
        sl += chan1;
        dl += chan1;
      }
    }

    /* next stripe */
    sl_c += max_hsize*sll;
    dl_c += max_hsize*dll;
  }

  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
Exemplo n.º 16
0
/*
 * General m x n convolution of src into dst.
 *
 *   dst, src  destination / source images
 *   kernel    m*n integer coefficients, row by row
 *   m, n      kernel width and height
 *   dm, dn    column / row offset applied to the destination start address
 *   scale     the coefficients are multiplied by DSCALE / 2^scale
 *   cmask     channel mask; bit (nchannel - 1 - c) selects channel c
 *
 * The kernel is converted to FTYPE once.  m == 1 is delegated to
 * mlib_ImageConv1xN.  Otherwise n + 1 source rows are kept converted to
 * FTYPE in a ring of line buffers; each kernel row is applied in chunks of
 * kw taps that accumulate into buffd, and the last chunk of the last
 * kernel row converts, stores the output row and at the same time loads
 * the next source row into the spare line buffer.
 *
 * FTYPE, DTYPE, DSCALE, BUFF_SIZE, MAX_N, MAX_KER, DEF_VARS,
 * GET_SRC_DST_PARAMETERS, LOAD_BUFF, D2I, FROM_S32 and
 * FREE_AND_RETURN_STATUS are defined elsewhere in the file; pbuff is
 * presumed to be declared by DEF_VARS and to start out pointing at buff.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE on allocation failure.
 */
mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
                           const mlib_image *src,
                           const mlib_s32   *kernel,
                           mlib_s32         m,
                           mlib_s32         n,
                           mlib_s32         dm,
                           mlib_s32         dn,
                           mlib_s32         scale,
                           mlib_s32         cmask)
{
  FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
  FTYPE    **buffs = buffs_arr, *buffd;
  FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
  mlib_s32 mn, l, off, kw, bsize, buff_ind;
  mlib_s32 d0, d1;
  FTYPE    k0, k1, k2, k3, k4, k5, k6;
  FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
  d64_2x32 dd;
  DEF_VARS(DTYPE);
  mlib_s32 chan2;
  mlib_s32 *buffo, *buffi;
  mlib_status status = MLIB_SUCCESS;

  GET_SRC_DST_PARAMETERS(DTYPE);

  /* apply 2^-scale in two steps so that the shift count stays below 31 */
  if (scale > 30) {
    fscale *= 1.0/(1 << 30);
    scale -= 30;
  }

  fscale /= (1 << scale);

  mn = m*n;

  /*
   * Every chunk below loads pk[0..6] (and mlib_ImageConv1xN loads
   * pk[0..3]) regardless of how many taps it really uses, so the last
   * chunk reads up to 6 elements past the mn coefficients.  Reserve that
   * padding and zero it; the padded values never enter a result.
   */
  if (mn + 6 > 256) {
    k = mlib_malloc((mn + 6)*sizeof(mlib_d64));

    if (k == NULL) return MLIB_FAILURE;
  }

  for (i = 0; i < mn; i++) {
    k[i] = kernel[i]*fscale;
  }

  for (i = mn; i < mn + 6; i++) {
    k[i] = 0;
  }

  if (m == 1) {
    status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
    FREE_AND_RETURN_STATUS;
  }

  /* n + 1 line buffers, the accumulator buffd and the integer scratch */
  bsize = (n + 3)*wid;

  if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
    pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));

    if (pbuff == NULL) {
      status = MLIB_FAILURE;
      FREE_AND_RETURN_STATUS;
    }
    /* the pointer table lives right after the data area */
    buffs = (FTYPE   **)(pbuff + bsize);
  }

  /*
   * The line pointers are stored twice so that buffs + buff_ind always
   * exposes n + 1 consecutive rows without wrapping.
   */
  for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
  for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
  buffd = buffs[n] + wid;
  buffo = (mlib_s32*)(buffd + wid);
  buffi = buffo + (wid &~ 1);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  /* output size, and first destination pixel */
  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*nchannel;

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* preload the first n source rows of this channel as FTYPE */
    for (l = 0; l < n; l++) {
      FTYPE    *buff = buffs[l];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i < wid + (m - 1); i++) {
        buff[i] = (FTYPE)sl[i*chan1];
      }

      sl += sll;
    }

    buff_ind = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0.0;

    for (j = 0; j < hgt; j++) {
      FTYPE    **buffc = buffs + buff_ind;
      FTYPE    *buffn = buffc[n];
      FTYPE    *pk = k;

      for (l = 0; l < n; l++) {
        FTYPE    *buff_l = buffc[l];

        for (off = 0; off < m;) {
          FTYPE    *buff = buff_l + off;

          /* width of this chunk of the kernel row */
          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER; else
            if (kw > MAX_KER) kw = kw/2;
          off += kw;

          sp = sl;
          dp = dl;

          p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
          p5 = buff[3]; p6 = buff[4]; p7 = buff[5];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
          pk += kw;

          /*
           * In every kw case: the first branch only accumulates; the
           * second one (last chunk of the last kernel row) also stores the
           * output and loads the next source row into buffn.
           */
          if (kw == 7) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

                p6 = buff[i + 6]; p7 = buff[i + 7];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

                p6 = buff[i + 6]; p7 = buff[i + 7];

                /* this variant fetches the next row through the integer scratch */
                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE   *)(buffi + i);
                buffn[i    ] = (FTYPE)dd.i32s.i0;
                buffn[i + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = buff[i + 5]; p6 = buff[i + 6];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = buff[i + 5]; p6 = buff[i + 6];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = buff[i + 4]; p5 = buff[i + 5];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = buff[i + 4]; p5 = buff[i + 5];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = buff[i + 3]; p4 = buff[i + 4];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = buff[i + 3]; p4 = buff[i + 4];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = buff[i + 2]; p3 = buff[i + 3];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = buff[i + 2]; p3 = buff[i + 3];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else /*if (kw == 2)*/ {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = buff[i + 1]; p2 = buff[i + 2];

                buffd[i    ] += p0*k0 + p1*k1;
                buffd[i + 1] += p1*k0 + p2*k1;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = buff[i + 1]; p2 = buff[i + 2];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /*
       * last pixels: i, sp and dp continue from the final store loop above;
       * an odd trailing pixel is computed directly from the line buffers
       */
      for (; i < wid; i++) {
        FTYPE    *pk = k, s = 0;
        mlib_s32 x, d0;

        for (l = 0; l < n; l++) {
          FTYPE    *buff = buffc[l] + i;

          for (x = 0; x < m; x++) s += buff[x] * (*pk++);
        }

        d0 = D2I(s);
        dp[0] = FROM_S32(d0);

        buffn[i] = (FTYPE)sp[0];

        sp += chan1;
        dp += chan1;
      }

      /* finish loading the next source row: the m - 1 rightmost pixels */
      for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];

      /* next line */
      sl += sll;
      dl += dll;

      /* advance the ring of line buffers */
      buff_ind++;

      if (buff_ind >= n + 1) buff_ind = 0;
    }
  }

  FREE_AND_RETURN_STATUS;
}
Exemplo n.º 17
0
/*
 * Allocate a new image together with a tightly packed pixel buffer.
 *
 *   type      sample type: MLIB_BIT, MLIB_BYTE, MLIB_SHORT, MLIB_USHORT,
 *             MLIB_INT, MLIB_FLOAT or MLIB_DOUBLE
 *   channels  number of channels, 1..4
 *   width     image width in pixels, must be > 0
 *   height    image height in rows, must be > 0
 *
 * Returns the new image, or NULL when an argument is out of range, when
 * the row size or total buffer size does not fit in mlib_s32, or when an
 * allocation fails.  The stride is set to the row size in bytes, i.e. rows
 * are stored without padding.  Both the image structure and its data
 * buffer are obtained from mlib_malloc().
 */
mlib_image *mlib_ImageCreate(mlib_type type,
                             mlib_s32  channels,
                             mlib_s32  width,
                             mlib_s32  height)
{
  /* largest mlib_s32 value; mlib_s32 is taken to be a 32-bit signed int */
  const mlib_s32 s32_max = 0x7fffffff;
  mlib_image *image;
  mlib_s32   wb;                     /* width in bytes */
  mlib_s32   bps;                    /* bytes per sample; 0 means bit-packed */
  void       *data;

/* sanity check */
  if (width <= 0 || height <= 0 || channels < 1 || channels > 4) {
    return NULL;
  }

  switch (type) {
    case MLIB_DOUBLE:
      bps = 8;
      break;
    case MLIB_FLOAT:
    case MLIB_INT:
      bps = 4;
      break;
    case MLIB_USHORT:
    case MLIB_SHORT:
      bps = 2;
      break;
    case MLIB_BYTE:
      bps = 1;
      break;
    case MLIB_BIT:
      bps = 0;
      break;
    default:
      return NULL;
  }

/*
 * Every product below is range-checked before it is computed: signed
 * overflow is undefined behavior, and a wrapped size would produce a
 * buffer smaller than the width/height/stride recorded in the image.
 */
  if (width > s32_max / channels) {
    return NULL;
  }
  wb = width * channels;             /* samples per row */

  if (bps == 0) {
/* MLIB_BIT: round the bit count up to whole bytes */
    if (wb > s32_max - 7) {
      return NULL;
    }
    wb = (wb + 7) / 8;
  } else {
    if (wb > s32_max / bps) {
      return NULL;
    }
    wb *= bps;
  }

  if (wb > s32_max / height) {
    return NULL;
  }

  data = mlib_malloc(wb * height);
  if (data == NULL) {
    return NULL;
  }

  image = (mlib_image *)mlib_malloc(sizeof(mlib_image));
  if (image == NULL) {
    mlib_free(data);
    return NULL;
  }

  image -> type     = type;
  image -> channels = channels;
  image -> width    = width;
  image -> height   = height;
  image -> stride   = wb;
  image -> data     = data;
  image -> flags    = ((width & 0xf) << 8);        /* set width field */
  image -> flags   |= ((height & 0xf) << 12);      /* set height field */
  image -> flags   |= ((wb & 0xf) << 16);          /* set stride field */
  image -> flags   |= (mlib_addr)data & 0xff;
  image -> format   = MLIB_FORMAT_UNKNOWN;

  image -> paddings[0] = 0;
  image -> paddings[1] = 0;
  image -> paddings[2] = 0;
  image -> paddings[3] = 0;

  image -> bitoffset = 0;

/* a bit image whose rows do not end on a byte boundary has gaps between rows */
  if ((type == MLIB_BIT) && (wb * 8 != width * channels)) {
    image -> flags |= MLIB_IMAGE_ONEDVECTOR;       /* not 1-d vector */
  }

  image -> flags &= MLIB_IMAGE_ATTRIBUTESET;
  image -> state  = NULL;

  return image;
}
Exemplo n.º 18
0
/* *********************************************************** */
/*
 * 3x3 convolution pass over an image using MMX intrinsics, driven by a
 * horizontal and a vertical kernel (presumably a separable convolution on
 * single-channel 8-bit data, going by the function name -- confirm against
 * the macro definitions).
 *
 *   dst, src      destination and source images; presumably read by
 *                 GET_SRC_DST_PARAMETERS(), which is expected to provide
 *                 width, height, sl, dl, sll and dll
 *   hkernel       horizontal kernel; not referenced directly in this body,
 *   vkernel       vertical kernel;   presumably consumed by GET_KERN()
 *   scalef_expon  scaling exponent;  presumably consumed by GET_KERN()
 *
 * The output region is two pixels narrower and two rows shorter than the
 * source, and starts one row down and one byte in from the destination
 * origin.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the line buffer cannot be
 * allocated.
 */
mlib_status
mlib_m_sconv3x3_8nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buffT;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

/* a 3x3 kernel produces no output for the border rows and columns */
	width -= 2;
	height -= 2;
/* skip the first destination row and the first destination byte */
	dl += dll + 1;

	wid4 = (width + 7) / 4;

/* use the on-stack buffer when it is large enough, the heap otherwise */
	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4);

		if (pbuff == NULL) {
/* leave the MMX state clean, as the normal exit path does */
			_mm_empty();
			return (MLIB_FAILURE);
		}
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + wid4;

/* prime the line buffers with the first two source rows */
	for (j = 0; j < 2; j++) {
		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_3x3_1ch(lo, i);
		}

		sl += sll;

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;
	}

/* one output row per iteration; sl points at the newest source row */
	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);

/* full groups of 8 output bytes */
		for (i = 0; i < width / 8; i++) {
			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_hi, res_lo);
		}

/* remaining 1..7 bytes: merge under a mask so the other bytes keep dst */
		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_hi, res_lo);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

/* swap the two line buffers for the next row */
		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
/*
 * 3x3 neighbourhood filter over 16-bit samples using MMX intrinsics.
 * Every output sample is the C_COMP combination of the nine source samples
 * around it; C_COMP is defined outside this function (presumably minimum
 * here and maximum when MAX_FILTER is set -- confirm at its definition).
 *
 *   dst  destination buffer; output starts one row down and one sample in
 *   src  source buffer
 *   dlb  destination line stride in bytes
 *   slb  source line stride in bytes
 *   wid  source width in samples
 *   hgt  source height in rows
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the scratch buffer cannot be
 * allocated.
 */
mlib_status
mlib_ImageMinFilter3x3_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* MAX_FILTER */
{
	mlib_u8 *buff, *buff1;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
	__m64 *dp0, *dp1;
	__m64 aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1;
	__m64 e_mask;
	mlib_s32 i, j, wid8, tail;

/* output row length in bytes, and the same rounded up to a multiple of 8 */
	wid = (wid - 2) * SSIZE;
	wid8 = (wid + 7) & ~7;

/* two scratch rows holding the horizontally combined previous two rows */
	buff = mlib_malloc(2 * wid8);

	if (buff == NULL)
		return (MLIB_FAILURE);

	buff1 = buff + wid8;

	sl = (mlib_u8 *)src;
/* dst ptrs skip top j and left col */
	dl = (mlib_u8 *)dst + dlb + SSIZE;

/* bytes left over after the last full 8-byte group, and their store mask */
	tail = wid & 7;
	e_mask = ((__m64 *) mlib_mask64_arr)[tail];

	sp0 = buff;
	sp1 = buff1;
	sp2 = sl;
	sp3 = sp2 + slb;
	sl += 2 * slb;

/* prime the scratch rows from source rows 0 and 1 */
	for (i = 0; i < wid; i += 8) {
		c0 = *(__m64 *) sp2;
		c1 = *(__m64 *) (sp2 + SSIZE);
		c2 = *(__m64 *) (sp2 + 2 * SSIZE);
		d0 = *(__m64 *) sp3;
		d1 = *(__m64 *) (sp3 + SSIZE);
		d2 = *(__m64 *) (sp3 + 2 * SSIZE);

		cc = C_COMP(c0, c1);
		dd = C_COMP(d0, d1);
		cc = C_COMP(cc, c2);
		dd = C_COMP(dd, d2);

		*(__m64 *) sp0 = cc;
		*(__m64 *) sp1 = dd;

		sp0 += 8;
		sp1 += 8;
		sp2 += 8;
		sp3 += 8;
	}

/* main loop: two output rows per iteration */
	for (j = 0; j <= (hgt - 2 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff;
		sp1 = buff1;
		sp2 = sl;
		sp3 = sp2 + slb;

/*
 *    line0:     aa
 *    line1:     bb
 *    line2:  c0 c1 c2
 *    line3:  d0 d1 d2
 */

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);
			d0 = *(__m64 *) sp3;
			d1 = *(__m64 *) (sp3 + SSIZE);
			d2 = *(__m64 *) (sp3 + 2 * SSIZE);

			cc = C_COMP(c0, c1);
			dd = C_COMP(d0, d1);
			cc = C_COMP(cc, c2);
			dd = C_COMP(dd, d2);

/* bb now covers line1 and line2, which both output rows share */
			bb = C_COMP(bb, cc);
			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, dd);

/* line2 and line3 become line0 and line1 of the next iteration */
			*(__m64 *) sp0 = cc;
			*(__m64 *) sp1 = dd;
			(*dp0++) = r0;
			(*dp1++) = r1;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
		}

/* partial group: only the bytes selected by e_mask are replaced in dst */
		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);
			d0 = *(__m64 *) sp3;
			d1 = *(__m64 *) (sp3 + SSIZE);
			d2 = *(__m64 *) (sp3 + 2 * SSIZE);

			cc = C_COMP(c0, c1);
			dd = C_COMP(d0, d1);
			cc = C_COMP(cc, c2);
			dd = C_COMP(dd, d2);

			bb = C_COMP(bb, cc);
			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, dd);

			*(__m64 *) sp0 = cc;
			*(__m64 *) sp1 = dd;

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(e_mask, r1),
			    _mm_andnot_si64(e_mask, *dp1));
		}

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line */

/* one output row remains when the number of output rows (hgt - 2) is odd */
	if (j == (hgt - 3)) {
		dp0 = (void *)dl;
		sp0 = buff;
		sp1 = buff1;
		sp2 = sl;

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);

			cc = C_COMP(c0, c1);
			cc = C_COMP(cc, c2);

			r0 = C_COMP(aa, bb);
			r0 = C_COMP(r0, cc);

			(*dp0++) = r0;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);

			c1 = C_COMP(c0, c1);
			cc = C_COMP(c1, c2);

			r0 = C_COMP(aa, bb);
			r0 = C_COMP(r0, cc);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
		}
	}

	_mm_empty();

	mlib_free(buff);

	return (MLIB_SUCCESS);
}