/*
 * __mlib_VideoInterpAveX_U8_U8_8x16 - horizontal (X) half-pel
 * interpolating average for an 8x16 pixel block.
 *
 * curr_block is first filled with a copy of ref_block
 * (MLIB_V_VIDEOCOPY8), then averaged with the block starting at
 * ref_block + 1 (MLIB_V_VIDEOINTERPAVG8).
 *
 * frame_stride / field_stride are consumed inside the two macros;
 * their exact use is defined by the macro bodies (not visible here).
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoInterpAveX_U8_U8_8x16(
    mlib_u8 *curr_block,
    const mlib_u8 *ref_block,
    mlib_s32 frame_stride,
    mlib_s32 field_stride)
{
    mlib_s32 y;
    /* working registers shared with the copy/average macros */
    mlib_d64 *dd, ss0[16], *sp1, *sp2, s1hi, s1lo, s2hi, s2lo, s2;
    mlib_d64 mthree = vis_fone();
    mlib_f32 fzero = vis_fzeros();
    mlib_f32 fexpd2 = vis_to_float(0x1000200);

    /* fone + (fone + fone): presumably 3 in each 16-bit lane, used as
     * rounding/weight constant by the averaging macro - confirm. */
    mthree = vis_fpadd16(mthree, vis_fpadd16(mthree, mthree));

    dd = (mlib_d64 *)curr_block;

    sp1 = (mlib_d64 *)vis_alignaddr((void *)ref_block, 0);

#pragma pipeloop(0)
    MLIB_V_VIDEOCOPY8(16);

    /*
     * GSR: fpack16 scale factor 5, low 3 bits = byte misalignment of
     * (ref_block + 1) for vis_faligndata inside the macro.
     * NOTE(review): the pointer is truncated through mlib_s32 here;
     * only the low 3 bits are used so this is harmless, but mlib_addr
     * (as used on the next line) would be the cleaner cast.
     */
    vis_write_gsr((5 << 3) + ((mlib_s32)(ref_block + 1) & 7));
    sp2 = (mlib_d64 *)((mlib_addr)(ref_block + 1) & ~7);

#pragma pipeloop(0)
    MLIB_V_VIDEOINTERPAVG8(16);

    return (MLIB_SUCCESS);
}
Exemplo n.º 2
0
/*
 * vis_init_consts - preload the GSR scale factor and the colour
 * conversion coefficient constants used by the YUV->RGB kernels.
 * The vis_ld64/vis_ld32 macros load each constant table entry into the
 * named register/global (macro bodies not visible here).
 */
static inline void vis_init_consts(void)
{
	/* fpack16 scale factor = 7 */
	vis_set_gsr(7 << VIS_GSR_SCALEFACT_SHIFT);

	vis_ld64(const_2048[0], CONST_2048);
	vis_ld64(const_1024[0], CONST_1024);
	vis_ld64(const_Ugreen[0], CONST_UGREEN);
	vis_ld64(const_Vgreen[0], CONST_VGREEN);
	/* NOTE(review): vis_fzeros() normally takes no argument; this
	 * form is presumably a macro variant that stores zero into
	 * ZEROS - confirm against the local vis macro definitions. */
	vis_fzeros(ZEROS);
	vis_ld64(const_Ublue_Vred[0], CONST_UBLUE);
	vis_ld32(const_Ycoeff[0], CONST_YCOEFF);
	vis_ld64(const_128[0],  CONST_128);
}
Exemplo n.º 3
0
/*
 * __mlib_VideoAddBlock_U8_S16 - add an 8x8 block of 16-bit motion
 * compensation values (mc_block) to an 8x8 block of 8-bit pixels
 * (curr_block, row pitch = stride bytes), storing the clamped 8-bit
 * result back into curr_block. Always returns MLIB_SUCCESS.
 *
 * Each destination row is one 64-bit word (8 pixels); the pixels are
 * widened to 16 bits, added to two 64-bit words of mc data, and packed
 * back with vis_fpack16_pair (GSR scale 7 => clamp to [0,255]).
 */
mlib_status
__mlib_VideoAddBlock_U8_S16(
	mlib_u8 *curr_block,
	const mlib_s16 *mc_block,
	mlib_s32 stride)
{
	mlib_d64 *dst64 = (mlib_d64 *)curr_block;
	mlib_d64 *src64 = (mlib_d64 *)mc_block;
	mlib_f32 fzero = vis_fzeros();
	/* 0x100 multiplier: fmul8x16al(b, 0x100) zero-extends bytes,
	 * same result as fpmerge with zeros */
	mlib_f32 scale = vis_to_float(0x100);
	mlib_s32 row;

	vis_write_gsr(7 << 3);

#pragma pipeloop(0)
	for (row = 0; row < 8; row++) {
		mlib_d64 cur = *dst64;
		mlib_d64 mc_hi = src64[0];
		mlib_d64 mc_lo = src64[1];
		/* widen the 8 pixels of this row to 16-bit lanes */
		mlib_d64 cur_hi = vis_fpmerge(fzero, vis_read_hi(cur));
		mlib_d64 cur_lo = vis_fmul8x16al(vis_read_lo(cur), scale);

		*dst64 = vis_fpack16_pair(vis_fpadd16(mc_hi, cur_hi),
			vis_fpadd16(mc_lo, cur_lo));

		src64 += 2;
		dst64 = (mlib_d64 *)((mlib_u8 *)dst64 + stride);
	}

	return (MLIB_SUCCESS);
}
Exemplo n.º 4
0
DEF_FUNC(mlib_ImageBlendColor_U8, mlib_u8,
    mlib_s32)
{
	mlib_f32 fzeros = vis_fzeros();
	mlib_f32 fmax = vis_to_float(0xFFFFFFFF);
	mlib_d64 dmask = vis_to_double_dup(0x00FF00FF);
	mlib_d64 done = vis_to_double_dup(0x01000100);
	mlib_d64 *buffs, *buffd;
	mlib_d64 *sp, *dp;
	mlib_f32 *alp_tbl;
	mlib_d64 ss, s1, rr, tt, d0, d1;
	mlib_d64 cc, c0, c1, c2;
	mlib_d64 amask0, amask1, amask2;
	mlib_s32 ww, dflag, i, j;

	vis_write_gsr(7 << 3);

	width *= channel;
	ww = (width + 7) / 8;

	if (channel == 3) {
		ww = 3 * ((ww + 2) / 3);
	}

	buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww);

	if (buffs == NULL) {
		return (MLIB_FAILURE);
	}

	buffd = buffs + ww;

	if (channel == 4) {
		cc = DOUBLE_4U16(color[0], color[1], color[2], color[3]);
		cc = vis_fand(vis_for(cc,
		    ((mlib_d64 *)mlib_dmask_arr)[8 >> alpha]), dmask);
		alp_tbl = (mlib_f32 *)mlib_alp_tbl + alpha * 256;
	} else if (channel == 3) {
Exemplo n.º 5
0
/*
 * __mlib_VectorConvert_S16_S8_Mod - widen a signed 8-bit vector x[n]
 * into a signed 16-bit vector z[n] (Mod semantics: plain sign
 * extension; every s8 value fits in s16, so no saturation is needed).
 *
 * Strategy:
 *   - short vectors (n < 16): scalar EXPAND macro (returns from here);
 *   - scalar head loop until dst is 8-byte aligned;
 *   - 64-bit VIS main loop, with aligned / unaligned source variants;
 *   - scalar tail loop for the remaining 0..7 elements.
 *
 * Sign-extension trick: vis_fpmerge(bytes, 0) places each source byte
 * in the high half of a 16-bit lane; vis_fmul8sux16(..., 0x0100) then
 * arithmetically shifts each lane back down by 8 bits.
 */
mlib_status
__mlib_VectorConvert_S16_S8_Mod(
	mlib_s16 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_s8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	/* 0x0100 in every 16-bit lane (see trick above) */
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 fzero = vis_fzeros();
	mlib_s32 len_64, even_length, rest_64, length = n, off;
	mlib_d64 dd0, dd1, dd2, dd4, dd5, dd6, dd7;

	if (length < 16) {
		EXPAND(mlib_s8, mlib_s16);
	}

	/* scalar head: align destination to 8 bytes */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
	rest_64 = length & 7;	/* scalar tail elements */
	len_64 = length >> 3;	/* full 8-byte source words */
	even_length = len_64 << 3;
	dd2 = ddsrc[0];		/* first (possibly partial) word */
	off = (mlib_addr)src & 7;

	if (!off) {

/*
 * Both vectors are 64-bit aligned.
 */

/*
 * Peeling of 1 iteration.
 */

		if (i = (len_64 & 1)) {
			dd1 = (*ddsrc++);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
				fzero), four_16_ones);
		}
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			dd1 = (*ddsrc++);
			dd2 = (*ddsrc++);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd2),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd2),
				fzero), four_16_ones);
		}
	} else {

/*
 * Source vector is not 64-bit aligned.
 * Peeling of 1 iteration. Then loop with step==2.
 */

		/*
		 * bshuffle (bmask 0x11111111*off + 0x04152637) merges the
		 * off-shifted source bytes straight into the high halves of
		 * 16-bit lanes; faligndata at offset 1 produces the second
		 * half-word. Same pattern as __mlib_VectorConvert_U8_S8_Sat.
		 */
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1); i++;
			dd4 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
		}
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i <= len_64; i += 2) {
			dd0 = dd2;
			dd1 = vis_ld_d64_nf(ddsrc + i);
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_bshuffle(dd0, dd1);
			dd6 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			dd7 = vis_faligndata(dd6, dd6);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd6, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd7, four_16_ones);
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = src[even_length + i];

	return (MLIB_SUCCESS);
}
Exemplo n.º 6
0
/*
 * __mlib_VectorConvert_S16_U8_Mod - widen an unsigned 8-bit vector
 * x[n] into a signed 16-bit vector z[n] (Mod semantics: plain zero
 * extension).
 *
 * Zero-extension tricks (equivalent results):
 *   vis_fpmerge(0, bytes)        - zero into the high byte of each lane;
 *   vis_fmul8x16al(bytes, 0x100) - (byte * 0x100) >> 8 per lane.
 *
 * Structure mirrors __mlib_VectorConvert_S16_S8_Mod: scalar EXPAND for
 * n < 16, scalar head to align dst, VIS main loop (aligned/unaligned
 * source), scalar tail.
 */
mlib_status
__mlib_VectorConvert_S16_U8_Mod(
	mlib_s16 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_u8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	mlib_s32 len_64, even_length, rest_64, length = n;
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 dd1, dd2, dd3, dd4;
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		EXPAND(mlib_u8, mlib_s16);
	}

	/* scalar head: align destination to 8 bytes */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
	rest_64 = length & 7;	/* scalar tail elements */
	len_64 = length >> 3;	/* full 8-byte source words */
	even_length = len_64 << 3;
	dd2 = ddsrc[0];		/* first (possibly partial) word */

	if (!((mlib_addr)src & 7)) {

/*
 * Both vectors are 64-bit aligned. We can process without
 * vis_faligndata
 * Peeling the 1 iteration. Then loop with step==2.
 */

		if (i = (len_64 & 1)) {
			dd1 = (*ddsrc++);
			(*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd1));
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
		}
/*
 * NOTE(review): the sibling converters use pipeloop(0)/unroll(4) for
 * this loop; pipeloop(1)/unroll(1) here may be deliberate scheduling
 * tuning - confirm before changing.
 */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; i < len_64; i += 2) {
			dd1 = (*ddsrc++);
			dd2 = (*ddsrc++);
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd1), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd2), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd2));
		}
	} else {

/*
 * Source vector is not 64-bit aligned. Use vis_faligndata.
 * Peeling the 1 iteration. Then loop with step==2.
 */

		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1); i++;
			dd3 = vis_faligndata(dd1, dd2);
			(*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd3));
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
		}
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i <= len_64; i += 2) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i);
			dd3 = vis_faligndata(dd1, dd2);
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_faligndata(dd1, dd2);
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd3), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd4), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd4));
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = src[even_length + i];

	return (MLIB_SUCCESS);
}
/*
 * mlib_v_VideoColorYUV2RGB444_all_align - convert size pixels of
 * planar Y/U/V (4:4:4, all pointers 4-byte readable) to interleaved
 * RGB, 4 pixels per iteration using fixed-point VIS arithmetic.
 *
 * Per 4-pixel group: R/G/B are computed as k_0*Y + k*U + k*V + c
 * (coefficients pre-packed in 16-bit lanes), packed to bytes, then
 * merged from channel-planar into 3-channel interleaved order via
 * faligndata/bshuffle. The tail group is written through edge masks
 * so no bytes past the destination are touched.
 *
 * NOTE(review): this routine performs no vis_write_gsr/vis_write_bmask
 * itself, yet uses vis_fpack16, vis_faligndata and vis_bshuffle -
 * it presumably relies on the caller to set GSR and the bmask
 * (cf. __mlib_VideoColorJFIFYCC2RGB444) - confirm.
 *
 * Fix vs original: removed `buff2`/`pbuff_arr2`, which were assigned
 * and advanced (`buff2 += 2`) but never read or written through -
 * dead state left over from an earlier buffered variant.
 */
void
mlib_v_VideoColorYUV2RGB444_all_align(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;
	mlib_d64 tmp_arr64[2];
	mlib_d64 k01 = vis_to_double_dup(0x0000f375);
	mlib_d64 k02 = vis_to_double_dup(0x3317e5fa);
	mlib_d64 k11 = vis_to_double_dup(0xf3754097);
	mlib_d64 k12 = vis_to_double_dup(0xe5fa0000);
	mlib_d64 k21 = vis_to_double_dup(0x40970000);
	mlib_d64 k22 = vis_to_double_dup(0x00003317);
	mlib_d64 c_0 = vis_to_double_dup(0xe42010f4);
	mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60);
	mlib_d64 c_2 = vis_to_double_dup(0xdd60e420);
	mlib_d64 k_0 = vis_to_double_dup(0x25432543);

	do {
/* loop on buffer size */

		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

		m = n >> 2;	/* full 4-pixel groups */
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)u;
		sf2 = (mlib_f32 *)v;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			/* fixed-point matrix: channel = k_0*Y + k*U + k*V + c */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

/*
 * merge buff values to 3-channel array
 */

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_23xx);
			pfd[2] = vis_read_lo(d_0145);

			pfd += 3;
		}

		/* tail: last partial group, stored through edge masks */
		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			emask = vis_edge8(pfd, dend);

			/* back up to an 8-byte boundary for vis_pst_8 */
			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_23xx);
			tmp_arr32[2] = vis_read_lo(d_0145);

			vis_pst_8(tmp_arr64[0], pfd, emask);

			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

		y += n;
		u += n;
		v += n;
		rgb += 3 * n;
		size -= n;
	} while (size);
}
/*
 * __mlib_VideoColorJFIFYCC2RGB444 - convert size pixels of planar
 * JFIF YCbCr (4:4:4) to interleaved RGB.
 *
 * Fixed-point VIS implementation: each channel is computed per 4-pixel
 * group as k_0*Y + k*Cb + k*Cr + c with coefficients pre-packed in
 * 16-bit lanes, packed to bytes (GSR scale 2), then rearranged from
 * channel-planar into interleaved 3-channel order via bshuffle (bmask
 * 0x0489AB37) and fpack32/fpmerge.
 *
 * m = (n - 1) >> 2 deliberately leaves at least one group of up to 4
 * pixels for the tail block, which stores through vis_edge8 masks so
 * no bytes past the destination are written.
 *
 * Returns MLIB_FAILURE for size <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
    mlib_u8 *dend;
    mlib_f32 *sf0, *sf1, *sf2, *pfd;
    mlib_f32 fzero = vis_fzeros();
    mlib_s32 i, n, m, emask;
    mlib_d64 tmp_arr64[2];
    /* fixed-point conversion coefficients, one per 16-bit lane */
    mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
    mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
    mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
    mlib_d64 k12 = vis_to_double_dup(0xe9260000);
    mlib_d64 k21 = vis_to_double_dup(0x38b40000);
    mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
    mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
    mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
    mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
    mlib_d64 k_0 = vis_to_double_dup(0x20002000);

    if (size <= 0)
        return (MLIB_FAILURE);

    /* GSR: fpack16 scale 2, alignment 2; bmask for 3-channel merge */
    vis_write_gsr((2 << 3) + 2);
    vis_write_bmask(0x0489AB37, 0);

    do {
        /* loop on buffer size */

        if (size > 2 * BUFF_SIZE) {
            n = 2 * BUFF_SIZE;
        } else {
            n = size;
        }

        m = (n - 1) >> 2;       /* full groups; >=1 group kept for tail */
        sf0 = (mlib_f32 *)y;
        sf1 = (mlib_f32 *)cb;
        sf2 = (mlib_f32 *)cr;
        dend = rgb + 3 * n - 1;
        pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < m; i++) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_0145;
            mlib_f32 x0, x1, x2;

            x0 = (*sf0++);
            x1 = (*sf1++);
            x2 = (*sf2++);

            /* channel = k_0*Y + k*Cb + k*Cr + c, per 16-bit lane */
            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            /* pack to bytes and shuffle into interleaved RGB order */
            d_0235 = vis_fpack16_pair(s00, s10);
            s20 = vis_freg_pair(vis_fpack16(s20), fzero);

            d_0145 = vis_bshuffle(d_0235, s20);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            pfd[0] = vis_read_hi(d_0145);
            pfd[1] = vis_read_hi(d_0235);
            pfd[2] = vis_read_lo(d_0145);

            pfd += 3;
        }

        /*
         * last pixels
         */

        if ((mlib_u8 *)pfd <= dend) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_xx14, d_0145;
            mlib_f32 x0, x1, x2;
            mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

            x0 = *sf0;
            x1 = *sf1;
            x2 = *sf2;

            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            d_0235 = vis_fpack16_pair(s00, s10);
            d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero);

            d_0145 = vis_bshuffle(d_0235, d_xx14);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            emask = vis_edge8(pfd, dend);

            /* back up to an 8-byte boundary for vis_pst_8 */
            if ((mlib_addr)pfd & 7) {
                pfd--;
                tmp_arr32++;
            }

            tmp_arr32[0] = vis_read_hi(d_0145);
            tmp_arr32[1] = vis_read_hi(d_0235);
            tmp_arr32[2] = vis_read_lo(d_0145);

            vis_pst_8(tmp_arr64[0], pfd, emask);

            pfd += 2;
            emask = vis_edge8(pfd, dend);

            if ((mlib_u8 *)pfd <= dend)
                vis_pst_8(tmp_arr64[1], pfd, emask);
        }

        y += n;
        cb += n;
        cr += n;
        rgb += 3 * n;
        size -= n;

    } while (size);

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoColorARGB2JFIFYCC422 - convert n pixels of interleaved
 * ARGB into planar JFIF YCbCr 4:2:2: one Y per pixel, one Cb/Cr per
 * horizontal pixel pair (n/2 chroma samples).
 *
 * Works on groups of 8 pixels (4 x 64-bit ARGB words):
 *   - CHANNELSEPARATE_U8_422 splits the interleaved data into R/G/B
 *     channel halves (macro body not visible here);
 *   - CHANNELWEIGHT_U8_2p applies the K* coefficient set (scaled by
 *     8192 for Y, 4096 for chroma) with the given offset;
 *   - results are packed to bytes with vis_fpack16 (GSR scale 2).
 * The final partial group is stored through vis_edge8 masks; chroma
 * stores are shifted when the mask falls in the low half so that no
 * bytes beyond cb/cr ranges are written.
 *
 * Returns MLIB_FAILURE for n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;

	/* fixed-point conversion coefficients */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
	/* +128 chroma bias / rounding offsets */
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

	/* fpack16 scale factor 2 */
	vis_write_gsr(2 << 3);

	n = n >> 3;	/* number of full 8-pixel groups */

#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		/* Y channel: 8 samples */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		/* Cb channel: 4 samples (pairs averaged via z0+z1) */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

		/* Cr channel: 4 samples */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

		py++;
		pcb++;
		pcr++;
	}

	/* tail: up to 7 remaining pixels, masked stores only */
	if ((mlib_u8 *)pcb < cbend) {
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

		sd01 = (*sp++);
		sd23 = vis_ld_d64_nf(sp); sp++;
		sd45 = vis_ld_d64_nf(sp); sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);
		cmask = vis_edge8(pcb, cbend - 1);

		/* chroma mask in high half: store in place; otherwise the
		 * 4 samples belong to the low half of the previous word */
		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1,
				cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1,
				cmask);
		}
	}
	return (MLIB_SUCCESS);
}
Exemplo n.º 10
0
mlib_status
mlib_ImageMulAlpha_U8(
    mlib_u8 *sl,
    mlib_u8 *dl,
    mlib_s32 sstride,
    mlib_s32 dstride,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 channel,
    mlib_s32 alpha)
{
	mlib_f32 fzeros = vis_fzeros();
	mlib_d64 dmask = vis_to_double_dup(0x00FF00FF);
	mlib_d64 done = vis_to_double_dup(0x01000100);
	mlib_d64 *buffs, *buffd;
	mlib_d64 *sp, *dp;
	mlib_d64 ss, s1, rr, d0, d1;
	mlib_d64 amask0, amask1, amask2;
	mlib_s32 ww, dflag, cmask, i, j;

	vis_write_gsr(7 << 3);

	width *= channel;
	ww = (width + 7) / 8;

	if (channel == 3) {
		ww = 3 * ((ww + 2) / 3);
	}

	buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww);

	if (buffs == NULL) {
		return (MLIB_FAILURE);
	}

	buffd = buffs + ww;

	if (channel == 4) {
		cmask = 1 << (3 - alpha);
		cmask |= (cmask << 4);
	} else if (channel == 3) {
		amask0 = ((mlib_d64 *)mlib_amask3_arr)[alpha];
		amask1 = ((mlib_d64 *)mlib_amask3_arr)[alpha + 1];
		amask2 = ((mlib_d64 *)mlib_amask3_arr)[alpha + 2];
	}

	for (j = 0; j < height; j++) {
		if (((int)sl & 7)) {
			MEM_COPY(sl, buffs, width);
			sp = buffs;
		} else {
			sp = (mlib_d64 *)sl;
		}

		dflag = 0;

		if (((int)dl | width) & 7) {
			dp = buffd;
			dflag = 1;
		} else {
			dp = (mlib_d64 *)dl;
		}

		if (channel == 4) {
			mlib_d64 a0, a1;

			if (alpha == 0) {
#pragma pipeloop(0)
				for (i = 0; i < ww; i++) {
					MUL_ALPHA_4CH(hi, au);
				}

			} else if (alpha == 1) {
#pragma pipeloop(0)
				for (i = 0; i < ww; i++) {
					MUL_ALPHA_4CH(hi, al);
				}

			} else if (alpha == 2) {
#pragma pipeloop(0)
				for (i = 0; i < ww; i++) {
					MUL_ALPHA_4CH(lo, au);
				}

			} else {	/* if (alpha == 3) */

#pragma pipeloop(0)
				for (i = 0; i < ww; i++) {
					MUL_ALPHA_4CH(lo, al);
				}
			}

		} else if (channel == 3) {
			mlib_d64 s0, s1, s2;
			mlib_d64 a0, a1, a2;
			mlib_s32 cmask0, cmask1, cmask2;

			cmask0 = 0x492 >> alpha;
			cmask1 = 0x492 >> (alpha + 1);
			cmask2 = 0x492 >> (alpha + 2);

			if (alpha == 0) {
				vis_alignaddr((void *)0, 7);
#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					LOAD_3CH_0();
					MUL_ALPHA_3CH();
				}

				if (i < ww) {
					LOAD_3CH_0_NF();
					MUL_ALPHA_3CH();
				}

			} else if (alpha == 1) {
				mlib_d64 b0, b1, b2;

#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					LOAD_3CH_1();
					MUL_ALPHA_3CH();
				}

				if (i < ww) {
					LOAD_3CH_1_NF();
					MUL_ALPHA_3CH();
				}

			} else {	/* if (alpha == 2) */

				vis_alignaddr((void *)0, 1);
#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					LOAD_3CH_2();
					MUL_ALPHA_3CH();
				}

				if (i < ww) {
					LOAD_3CH_2_NF();
					MUL_ALPHA_3CH();
				}

			}

		} else {	/* if (channel == 2) */

			if (alpha == 0) {
Exemplo n.º 11
0
/*
 * __mlib_MatrixMul_S16_S8_Mod - z = x * y for an (m x l) by (l x n)
 * matrix product over signed 8-bit inputs, 16-bit Mod result.
 *
 * Implementation notes:
 *  - y is transposed into a byte buffer with each element XORed with
 *    0x80 (i.e. stored as value + 128, biased unsigned), rows padded
 *    with zeros to a multiple of 8 bytes;
 *  - each x row is expanded to 16-bit lanes (value << 8 via fpmerge);
 *  - the per-row dot products therefore include an extra
 *    128 * sum(x_row) term, which is removed by subtracting
 *    x_sum (= sum of the row, shifted left by 7; the remaining factor
 *    of 2 comes from the fixed-point position of the products);
 *  - LOAD/MUL/SUM macros (not visible here) form a software-pipelined
 *    multiply-accumulate over l8 64-bit words.
 */
mlib_status
__mlib_MatrixMul_S16_S8_Mod(
	mlib_s16 *z,
	const STYPE * x,
	const STYPE * y,
	mlib_s32 m,
	mlib_s32 l,
	mlib_s32 n)
{
	mlib_d64 *px, *buff_x, *buff_y, *pbuff_x, *pbuff_y;
	mlib_d64 array[MAX_SIZE];
	mlib_d64 xx, x0, x1, y0, y1, ds0, ds1, dr0, dr1, dr2, dr3;
	mlib_s32 size, i, j, k, l8;

	if (!((m > 0) && (l > 0) && (n > 0))) {
		return (MLIB_FAILURE);
	}

	l8 = (l + 7) / 8;	/* 64-bit words per (padded) row */
	size = l8 * n + 2 * l8 + 4;

	if (size <= MAX_SIZE) {
		buff_y = array;		/* small case: use stack buffer */
	} else {
		buff_y = (mlib_d64 *)__mlib_malloc(size * sizeof (mlib_d64));

		if (buff_y == NULL) {
			/*
			 * NOTE(review): fallback passes type_U8/mode_Sat for
			 * an S8 Mod routine - looks like a copy/paste slip;
			 * confirm against mlib_MatrixMul_type's contract.
			 */
			return mlib_MatrixMul_type(type_U8, type_U8, mode_Sat,
				x, y, m, l, n, n, z);
		}
	}

	buff_x = buff_y + l8 * n;
	pbuff_y = buff_y;

/* transpose y matrix */
	for (i = 0; i < n; i++) {
		mlib_u8 *py = (mlib_u8 *)y + i;
		mlib_u8 *pp = (mlib_u8 *)pbuff_y;

		/* ^0x8080 stores two elements as biased (value + 128) */
		for (j = 0; j <= (l - 4); j += 4) {
			((mlib_s16 *)pp)[0] = ((py[0] << 8) | py[n]) ^ 0x8080;
			((mlib_s16 *)pp)[1] =
				((py[2 * n] << 8) | py[3 * n]) ^ 0x8080;
			py += 4 * n;
			pp += 4;
		}

		for (; j < l; j++) {
			(*pp++) = *py ^ 0x80;
			py += n;
		}

		/* zero-pad the row to a whole number of 64-bit words */
		for (; j < 8 * l8; j++) {
			(*pp++) = 0;
		}

		pbuff_y += l8;
	}

	for (j = 0; j < m; j++) {
		mlib_s32 x_sum = 0;

		/* bias-compensation term: 128 * sum(x row) (see header) */
		for (i = 0; i < l; i++) {
			x_sum += x[i];
		}

		x_sum <<= 7;

		pbuff_x = buff_x;
		pbuff_y = buff_y;

/* copy x line */
		px = vis_alignaddr((void *)x, 0);
		x1 = vis_ld_d64_nf(px);
		px++;
		xx = 0;
		for (i = 0; i < l8; i++) {
			x0 = x1;
			x1 = vis_ld_d64_nf(px);
			px++;
			xx = vis_faligndata(x0, x1);
			/* expand each byte into the high half of a lane */
			pbuff_x[2 * i] =
				vis_fpmerge(vis_read_hi(xx), vis_fzeros());
			pbuff_x[2 * i + 1] =
				vis_fpmerge(vis_read_lo(xx), vis_fzeros());
		}

/* loop on y lines */
		for (i = 0; i < n; i += 2) {
			mlib_d64 *px = pbuff_x;
			mlib_d64 *py0 = pbuff_y;
			/* odd n: process the last row against itself */
			mlib_d64 *py1 = (i + 1 < n) ? (py0 + l8) : py0;

			ds0 = ds1 = vis_fzero();

			/* software pipeline prologue */
			LOAD;
			MUL;
			LOAD;

#pragma pipeloop(0)
			for (k = 0; k < l8; k++) {
				SUM;
				MUL;
				LOAD;
			}

			/* horizontal reduction of the lane accumulators */
			ds0 = vis_freg_pair(vis_fpadd16s(vis_read_hi(ds0),
				vis_read_lo(ds0)),
				vis_fpadd16s(vis_read_hi(ds1),
				vis_read_lo(ds1)));

			z[i] = ((mlib_s16 *)&ds0)[0] + ((mlib_s16 *)&ds0)[1] -
				x_sum;

			if (i + 1 < n) {
				z[i + 1] =
					((mlib_s16 *)&ds0)[2] +
					((mlib_s16 *)&ds0)[3] - x_sum;
			}

			pbuff_y += 2 * l8;
		}

		z += n;
		x += l;
	}

	if (size > MAX_SIZE) {
		__mlib_free(buff_y);
	}

	return (MLIB_SUCCESS);
}
Exemplo n.º 12
0
/*
 * __mlib_VectorNorm_U8_Sat - Euclidean norm of an unsigned 8-bit
 * vector: z[0] = sqrt(sum(x[i]^2)).
 *
 * Processes the vector as 64-bit words; partial edge words are masked
 * into a zeroed `edge` buffer via vis_edge8/vis_pst_8 so out-of-range
 * bytes contribute nothing. The NORM_U8/SUM_U8 macros (not visible
 * here) square and accumulate into the 32-bit lane pairs ds/ds1.
 * The inner loop is chunked at MAX_LOOP words, presumably to keep the
 * 32-bit lane accumulators from overflowing before they are flushed
 * into the double `sum` - confirm against the macro definitions.
 *
 * Returns MLIB_FAILURE for n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VectorNorm_U8_Sat(
	mlib_d64 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *pxend, *px = (mlib_u8 *)x;
	mlib_d64 *dpx, *dpxend;
	mlib_d64 sum = 0.0;
	mlib_d64 dx, dr1, dr2, dr3, dr4, dr5, dr6;
	mlib_d64 ds, ds1;
	mlib_d64 edge[2];
	mlib_f32 fone = vis_to_float(0x100);
	mlib_f32 fzero = vis_fzeros();
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask;

	if (n <= 0)
		return (MLIB_FAILURE);

	edge[0] = edge[1] = 0;

	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	pxend = px + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
	/* mask off bytes before px in the first word */
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];
	while ((mlib_addr)dpx < (mlib_addr)dpxend) {
		d_left = dpxend - dpx;

		/* chunk the accumulation (see header note) */
		if (d_left > MAX_LOOP)
			d_left = MAX_LOOP;
		ds = ds1 = 0.0;
		for (; d_left > 0; d_left--) {
			NORM_U8;
			SUM_U8;
			dpx++;
			dx = dpx[0];
		}

		/* flush lane accumulators into the scalar double sum */
		ds = vis_fpadd32(ds, ds1);
		fsum = vis_fpadd32s(vis_read_hi(ds), vis_read_lo(ds));
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	/* final (possibly partial) word, masked at the tail end */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		NORM_U8;
		ds = vis_fpadd32(dr3, dr4);
		ds1 = vis_fpadd32(dr5, dr6);
		ds = vis_fpadd32(ds, ds1);
		fsum = vis_fpadd32s(vis_read_hi(ds), vis_read_lo(ds));
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = mlib_sqrt(sum);
	return (MLIB_SUCCESS);
/* scoped here: MAX_LOOP is only meaningful for this routine */
#undef MAX_LOOP
}
Exemplo n.º 13
0
/*
 * __mlib_VectorConvert_U8_S8_Sat - convert a signed 8-bit vector x[n]
 * to unsigned 8-bit z[n], saturating: negative values become 0,
 * non-negative values are copied.
 *
 * VIS path: bytes are sign-extended to 16-bit lanes (fpmerge with zero
 * then fmul8sux16 by 0x0100 = arithmetic >>8), and vis_fpack16_pair
 * (GSR scale 7) packs them back with unsigned clamping, which maps
 * negatives to 0.
 *
 * Structure: scalar PACK_S_U for n < 16 (returns from the macro),
 * scalar head to 8-byte-align dst, aligned/unaligned main loops,
 * scalar tail.
 */
mlib_status
__mlib_VectorConvert_U8_S8_Sat(
	mlib_u8 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i, off;
	mlib_s8 c;
	/* 0x0100 per lane: arithmetic >>8 after fpmerge (sign extend) */
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 zero = vis_fzeros();

	if (length < 16) {
		PACK_S_U(mlib_s8, mlib_u8);
	}

/*
 * First, try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) < 0 ? 0 : c;
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */

		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}

/*
 * Then loop with step==2. Unroll for 2 iterations.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		/* bmask merges off-shifted bytes into lane high halves;
		 * faligndata at offset 1 supplies the second half-word */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		off = (mlib_addr)src & 7;
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}

/*
 * Then loop with step==2.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		/* the leading `i;` is a no-op expression: i already holds
		 * the peel count from above */
		for (i; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d6 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8sux16(d6, four_16_ones);
			d6 = vis_faligndata(d6, d6);
			d5 = vis_fmul8sux16(d6, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c = src[even_length + i]) < 0 ? 0 : c;

	return (MLIB_SUCCESS);
}
Exemplo n.º 14
0
/*
 * __mlib_VectorConvert_S8_U8_Sat - convert an unsigned 8-bit vector
 * x[n] to signed 8-bit z[n], saturating: values > 127 become 127,
 * values <= 127 are copied.
 *
 * VIS path per byte v: widen to 16 bits, add 128 (dsp = 0x0080 per
 * lane), pack with unsigned clamp to [0,255] (GSR scale 7), then XOR
 * 0x80 to undo the bias - i.e. min(v + 128, 255) ^ 0x80 ==
 * min(v, 127).
 *
 * Structure: scalar PACK_U_S for n < 16 (returns from the macro),
 * scalar head to 8-byte-align dst, aligned/unaligned main loops,
 * scalar tail.
 *
 * Fix vs original: `fzero` was declared mlib_d64 although
 * vis_fzeros() returns mlib_f32 and vis_fpmerge() consumes mlib_f32
 * (each use forced an implicit double->float conversion of zero -
 * harmless but wrong-typed). Declared mlib_f32, matching the sibling
 * __mlib_VectorConvert_U8_S8_Sat.
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *src = (void *)x;
	mlib_s8 *dst = z;
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_u8 c;
	mlib_d64 dsp = vis_to_double_dup(0x800080);	/* +128 per lane */
	mlib_d64 rst = vis_to_double_dup(0x80808080);	/* bias undo */
	mlib_f32 fm = vis_to_float(0x100);	/* zero-extend multiplier */

	if (length < 16) {
		PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
	}

/*
 * First, try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */

		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2. Unroll for 2 iterations.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d4 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			d2 = vis_fpmerge(fzero, vis_read_hi(d4));
			d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
			(*ddst++) = vis_fxor(d4, rst);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d1 = vis_faligndata(d1, d2);
			d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
			d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpadd16(dsp, d4);
			d1 = vis_fpack16_pair(d3, d4);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d6 = vis_faligndata(d1, d2);
			d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d3 = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d6 = vis_fpack16_pair(d4, d5);
			(*ddst++) = vis_fxor(d3, rst);
			(*ddst++) = vis_fxor(d6, rst);
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c =
			src[even_length + i]) > MLIB_S8_MAX ? MLIB_S8_MAX : c;

	return (MLIB_SUCCESS);
}
/*
 * FUNC(MxN) - ordered (matrix) dithering of a true-color U8 image to
 * an indexed bitonal output using an m x n dither mask.
 *
 * For each source row, a row of the dither kernel (pre-scaled to
 * 16-bit offsets, replicated across num_blk blocks) is added to the
 * zero-extended pixels; the clamped result is then mapped through
 * mlib_ImageColorTrue2IndexLine_U8_BIT_1 into the destination bit
 * line. Kernel rows cycle with period n down the image.
 *
 * Returns MLIB_FAILURE on allocation failure, MLIB_SUCCESS otherwise.
 *
 * Fix vs original: removed dead locals that were declared (some
 * initialized) but never read: dmask1, dmask2, off, off1, kw, p_dim,
 * dscale1, dscale2, fzeros.
 */
mlib_status FUNC(
    MxN) (
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 **dmask,
    mlib_s32 m,
    mlib_s32 n,
    mlib_s32 scale,
    const void *colormap)
{
	mlib_type stype, dtype;
	const mlib_s32 *dmask0 = dmask[0];
	/*
	 * NOTE(review): the method value is never used in this variant;
	 * the call is retained in case the getter has side effects -
	 * confirm it is a pure accessor and drop if so.
	 */
	mlib_s32 method = mlib_ImageGetMethod(colormap);
	mlib_u8 *sl, *dl;
	mlib_s32 schan, dchan, sll, dll, sw, sh, dw, dh, num_blk;
	mlib_s32 mstep, line_size, kern_size, xsize8, i, j, k;
	mlib_d64 *pbuff;
	mlib_s16 *kern, *pkern;
	mlib_d64 *dkern;
	mlib_d64 dscale, dscale0;
	mlib_d64 ss, d0, d1;
	mlib_s32 step0, half_step0, v0;
	mlib_s32 bit_offset = mlib_ImageGetBitOffset(dst);
	mlib_u8 *p_lut;

	MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl);
	MLIB_IMAGE_GET_ALL_PARAMS(src, stype, schan, sw, sh, sll, sl);

	p_lut = (mlib_u8 *)mlib_ImageGetLutInversTable(colormap);
	/* distance between the two LUT levels */
	step0 = abs(p_lut[1] - p_lut[0]);

	num_blk = (sw + (m - 1)) / m;
	mstep = m * NCHAN;
	line_size = (mstep * num_blk + 7) & ~7;	/* 8-byte aligned */
	xsize8 = (NCHAN * sw + 7) / 8;		/* 64-bit words per row */

	/* dscale = 2^-scale, computed in pieces to avoid overflow */
	dscale = 1.0;
	while (scale > 30) {
		dscale *= 1.0 / (1 << 30);
		scale -= 30;
	}

	dscale /= (1 << scale);

	dscale0 = dscale * step0;
	half_step0 = (step0 - 1) >> 1;

	kern_size = n * line_size;
	kern = __mlib_malloc(kern_size * sizeof (mlib_s16));

	if (kern == NULL)
		return (MLIB_FAILURE);

	/* expand the dither mask into per-row, block-replicated offsets */
	for (j = 0; j < n; j++) {
		for (i = 0; i < m; i++) {
			pkern = kern + j * line_size + i;
			v0 = half_step0 - (mlib_s32)(dmask0[j * m +
			    i] * dscale0);
			for (k = 0; k < num_blk; k++) {
				pkern[k * mstep] = v0;
			}
		}
	}

	pbuff = __mlib_malloc(xsize8 * sizeof (mlib_d64) + 16);

	if (pbuff == NULL) {
		__mlib_free(kern);
		return (MLIB_FAILURE);
	}

	pkern = kern;

	/* fpack16 scale 7: identity pack with clamp to [0,255] */
	vis_write_gsr(7 << 3);

	for (j = 0; j < sh; j++) {
		dkern = (mlib_d64 *)pkern;

		if ((mlib_s32)sl & 7) {
			/* unaligned source row: masked non-faulting loads */
			mlib_u8 *sp = sl;

#pragma pipeloop(0)
			for (i = 0; i < xsize8; i++) {
				LOAD_NA_NF(ss, sp);
				d0 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_hi(ss)), dkern[2 * i]);
				d1 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_lo(ss)), dkern[2 * i + 1]);
				pbuff[i] = vis_fpack16_pair(d0, d1);
				sp += 8;
			}

		} else {
			mlib_d64 *sp = (mlib_d64 *)sl;

#pragma pipeloop(0)
			for (i = 0; i < xsize8; i++) {
				ss = sp[i];
				d0 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_hi(ss)), dkern[2 * i]);
				d1 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_lo(ss)), dkern[2 * i + 1]);
				pbuff[i] = vis_fpack16_pair(d0, d1);
			}
		}

		/* cycle kernel rows with period n */
		pkern += line_size;

		if (pkern >= kern + kern_size)
			pkern = kern;

		mlib_ImageColorTrue2IndexLine_U8_BIT_1((mlib_u8 *)pbuff, dl,
		    bit_offset, sw, colormap);

		sl += sll;
		dl += dll;
	}

	__mlib_free(pbuff);
	__mlib_free(kern);

	return (MLIB_SUCCESS);
}