Example #1
0
void ADD_SUFF(IntArgbBmToIntArgbConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dd, dmask, dFF;
    mlib_s32 i, i0, j, x, mask;

    if (dstScan == 4*width && srcScan == 4*width) {
	width *= height;
	height = 1;
    }

    dmask = vis_to_double_dup(0xFFFFFF);
    dFF = vis_to_double_dup(0xFFFFFFFF);

    for (j = 0; j < height; j++) {
	mlib_s32 *src = srcBase;
	mlib_s32 *dst = dstBase;

	i = i0 = 0;

	if ((mlib_s32)dst & 7) {
	    x = src[i];
	    dst[i] = (x << 7) >> 7;
	    i0 = 1;
	}

#pragma pipeloop(0)
	for (i = i0; i <= (mlib_s32)width - 2; i += 2) {
	    mlib_u8 *pp0 = (mlib_u8*)(src + i);
	    mlib_u8 *pp1 = (mlib_u8*)(src + i + 1);
	    dd = vis_freg_pair(*(mlib_f32*)pp0, *(mlib_f32*)pp1);
	    dd = vis_fand(dd, dmask);
#if 1
	    mask = ((*pp0 & 1) << 7) | ((*pp1 & 1) << 3);
	    *(mlib_d64*)(dst + i) = dd;
	    vis_pst_8(dFF, dst + i, mask);
#else
	    mask = ((*pp0 & 1) << 1) | (*pp1 & 1);
	    dd = vis_for(dd, ((mlib_d64*)vis_amask_arr)[mask]);
	    *(mlib_d64*)(dst + i) = dd;
#endif
	}

	if (i < width) {
	    x = src[i];
	    dst[i] = (x << 7) >> 7;
	}
Example #2
0
void ADD_SUFF(IntRgbxToIntArgbConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dd, mask;
    mlib_s32 i, i0, j;

    if (dstScan == 4*width && srcScan == 4*width) {
	width *= height;
	height = 1;
    }

    mask = vis_to_double_dup(0xFF000000);
    vis_alignaddr(NULL, 7);

    for (j = 0; j < height; j++) {
	mlib_u32 *src = srcBase;
	mlib_u32 *dst = dstBase;

	i = i0 = 0;

	if ((mlib_s32)dst & 7) {
	    dst[i] = 0xff000000 | (src[i] >> 8);
	    i0 = 1;
	}

#pragma pipeloop(0)
	for (i = i0; i <= (mlib_s32)width - 2; i += 2) {
	    dd = vis_freg_pair(((mlib_f32*)src)[i], ((mlib_f32*)src)[i + 1]);
	    dd = vis_faligndata(dd, dd);
	    *(mlib_d64*)(dst + i) = vis_for(dd, mask);
	}

	if (i < width) {
	    dst[i] = 0xff000000 | (src[i] >> 8);
	}
/*
 * Interleaves three planar arrays of 16-bit samples into one packed array:
 *
 *	colors[3 * i]     = color1[i]
 *	colors[3 * i + 1] = color2[i]
 *	colors[3 * i + 2] = color3[i]		for 0 <= i < n
 *
 * colors	destination, 3 * n samples
 * color1..3	source planes, n samples each
 * n		number of samples per plane
 *
 * The vector paths access all four arrays through mlib_d64 pointers, so
 * the buffers are presumably expected to be 8-byte aligned -- confirm
 * against the public contract of this entry point.
 *
 * Always returns MLIB_SUCCESS; n is not validated.
 */
mlib_status
__mlib_VideoColorMerge3_S16(
	mlib_s16 *colors,
	const mlib_s16 *color1,
	const mlib_s16 *color2,
	const mlib_s16 *color3,
	mlib_s32 n)
{
	mlib_d64 *dp = (mlib_d64 *)colors;
	mlib_d64 *sp0 = (mlib_d64 *)color1;
	mlib_d64 *sp1 = (mlib_d64 *)color2;
	mlib_d64 *sp2 = (mlib_d64 *)color3;
	mlib_d64 sd0, sd1, sd2, sd3, sd4, sd5;
	mlib_d64 dd0, dd1, dd2, dd3, dd4, dd5;
	mlib_s32 i;

/* 8 samples per plane (2 doubles each) -> 24 samples out (6 doubles) */
#pragma pipeloop(1)
	for (i = 0; i <= (n - 8); i += 8) {
		sd0 = sp0[0];
		sd1 = sp1[0];
		sd2 = sp2[0];
		sd3 = sp0[1];
		sd4 = sp1[1];
		sd5 = sp2[1];
/* halfwords 0 and 1 of both operands, interleaved */
		vis_write_bmask(0x018923ab, 0);
		dd0 = vis_bshuffle(sd0, sd1);
		dd3 = vis_bshuffle(sd1, sd2);
		dd2 = vis_bshuffle(sd3, sd4);
		dd5 = vis_bshuffle(sd4, sd5);
/* halfwords 2 and 3 of both operands, interleaved */
		vis_write_bmask(0x45cd67ef, 0);
		dd1 = vis_bshuffle(sd0, sd1);
		dd4 = vis_bshuffle(sd3, sd4);
/* first two pairs of operand 1, then halfword 0 of operand 2 inserted */
		vis_write_bmask(0x01238945, 0);
		dp[0] = vis_bshuffle(dd0, sd2);
		dp[3] = vis_bshuffle(dd2, sd5);
		dp[1] = vis_freg_pair(vis_read_lo(dd3), vis_read_hi(dd1));
		dp[4] = vis_freg_pair(vis_read_lo(dd5), vis_read_hi(dd4));
/* halfword 2 of operand 2 first, then the tail of both operands */
		vis_write_bmask(0xcd4567ef, 0);
		dp[2] = vis_bshuffle(dd1, sd2);
		dp[5] = vis_bshuffle(dd4, sd5);
		sp0 += 2;
		sp1 += 2;
		sp2 += 2;
		dp += 6;
	}

/* one more group of 4 samples per plane, same shuffles as above */
	if (i <= (n - 4)) {
		sd0 = sp0[0];
		sd1 = sp1[0];
		sd2 = sp2[0];
		vis_write_bmask(0x018923ab, 0);
		dd0 = vis_bshuffle(sd0, sd1);
		dd3 = vis_bshuffle(sd1, sd2);
		vis_write_bmask(0x45cd67ef, 0);
		dd1 = vis_bshuffle(sd0, sd1);
		vis_write_bmask(0x01238945, 0);
		dp[0] = vis_bshuffle(dd0, sd2);
		dp[1] = vis_freg_pair(vis_read_lo(dd3), vis_read_hi(dd1));
		vis_write_bmask(0xcd4567ef, 0);
		dp[2] = vis_bshuffle(dd1, sd2);
		sp0++;
		sp1++;
		sp2++;
		dp += 3;
/*
 * These four samples are done; without this the scalar loop below would
 * recompute and rewrite exactly the same values.
 */
		i += 4;
	}

/* scalar tail: at most 3 samples remain */
	for (; i < n; i++) {
		colors[3 * i] = ((mlib_u16 *)color1)[i];
		colors[3 * i + 1] = ((mlib_u16 *)color2)[i];
		colors[3 * i + 2] = ((mlib_u16 *)color3)[i];
	}

	return (MLIB_SUCCESS);
}
/*
 * Converts `size` samples from three planar 8-bit inputs (y, u, v) into
 * packed 3-bytes-per-pixel output at rgb.
 *
 * Work is done in chunks of at most 2 * BUFF_SIZE pixels. Inside a chunk,
 * four pixels (one mlib_f32 from each plane) are converted per iteration
 * and written as three mlib_f32 words (12 bytes); a trailing partial
 * group is written with edge-masked partial stores so that nothing past
 * rgb + 3 * n - 1 is modified.
 *
 * All pointers are dereferenced as mlib_f32, so they are presumably at
 * least 4-byte aligned -- confirm with the caller.
 *
 * This function never writes the GSR or the bmask although it uses
 * vis_fpack16, vis_faligndata and vis_bshuffle; the caller presumably
 * configures them beforehand -- confirm.
 */
void
mlib_v_VideoColorYUV2RGB444_all_align(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 size)
{
/* address of the last output byte of the current chunk */
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;
/* staging area for the partial stores of the last group */
	mlib_d64 tmp_arr64[2];
/* packed 16-bit multipliers for the u (k?1) and v (k?2) planes */
	mlib_d64 k01 = vis_to_double_dup(0x0000f375);
	mlib_d64 k02 = vis_to_double_dup(0x3317e5fa);
	mlib_d64 k11 = vis_to_double_dup(0xf3754097);
	mlib_d64 k12 = vis_to_double_dup(0xe5fa0000);
	mlib_d64 k21 = vis_to_double_dup(0x40970000);
	mlib_d64 k22 = vis_to_double_dup(0x00003317);
/* packed 16-bit additive offsets */
	mlib_d64 c_0 = vis_to_double_dup(0xe42010f4);
	mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60);
	mlib_d64 c_2 = vis_to_double_dup(0xdd60e420);
/* multiplier for the y plane */
	mlib_d64 k_0 = vis_to_double_dup(0x25432543);

	do {
/* loop on buffer size */

		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

/* number of complete 4-pixel groups in this chunk */
		m = n >> 2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)u;
		sf2 = (mlib_f32 *)v;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

/*
 * merge buff values to 3-channel array
 */

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_23xx);
			pfd[2] = vis_read_lo(d_0145);

			pfd += 3;
		}

/* remaining 1..3 pixels: same arithmetic, edge-masked stores */
		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			emask = vis_edge8(pfd, dend);

/*
 * The partial store needs an 8-byte aligned address: when pfd sits in the
 * second half of a double, step back one word and shift the staged data
 * by one word to match.
 */
			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_23xx);
			tmp_arr32[2] = vis_read_lo(d_0145);

			vis_pst_8(tmp_arr64[0], pfd, emask);

			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

		y += n;
		u += n;
		v += n;
		rgb += 3 * n;
		size -= n;
	} while (size);
}
/*
 * Converts `size` pixels from three planar 8-bit inputs (y, cb, cr) into
 * packed 3-bytes-per-pixel output at rgb.
 *
 * Returns MLIB_FAILURE when size <= 0, MLIB_SUCCESS otherwise.
 *
 * The input is consumed in chunks of at most 2 * BUFF_SIZE pixels. Within
 * a chunk all but the last group of (up to) four pixels are converted and
 * stored as three whole mlib_f32 words; the last group goes through a
 * small staging buffer and edge-masked partial stores so that no byte
 * beyond rgb + 3 * chunk - 1 is written.
 *
 * Pointers are dereferenced as mlib_f32 and are therefore presumably
 * expected to be 4-byte aligned -- confirm against the public contract.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
    mlib_u8 *last;                      /* last output byte of the chunk */
    mlib_f32 *src_y, *src_cb, *src_cr, *out;
    mlib_f32 fzero = vis_fzeros();
    mlib_s32 q, chunk, quads, edge;
    mlib_d64 stage64[2];                /* staging area for the last group */
    /* packed 16-bit multipliers for cb (k?1) and cr (k?2) */
    mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
    mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
    mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
    mlib_d64 k12 = vis_to_double_dup(0xe9260000);
    mlib_d64 k21 = vis_to_double_dup(0x38b40000);
    mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
    /* packed 16-bit additive offsets */
    mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
    mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
    mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
    /* multiplier for the y plane */
    mlib_d64 k_0 = vis_to_double_dup(0x20002000);

    if (size <= 0)
        return (MLIB_FAILURE);

    vis_write_gsr((2 << 3) + 2);
    vis_write_bmask(0x0489AB37, 0);

    /* size is known to be positive here, so at least one pass is made */
    while (size != 0) {
        chunk = (size > 2 * BUFF_SIZE) ? 2 * BUFF_SIZE : size;

        /* whole-word groups; the final group is always left for the tail */
        quads = (chunk - 1) >> 2;

        src_y = (mlib_f32 *)y;
        src_cb = (mlib_f32 *)cb;
        src_cr = (mlib_f32 *)cr;
        out = (mlib_f32 *)rgb;
        last = rgb + 3 * chunk - 1;

#pragma pipeloop(0)
#pragma unroll(4)
        for (q = 0; q < quads; q++) {
            mlib_f32 vy = (*src_y++);
            mlib_f32 vcb = (*src_cb++);
            mlib_f32 vcr = (*src_cr++);
            mlib_d64 lum = vis_fmul8x16(vy, k_0);
            mlib_d64 acc0, acc1, acc2, pair, outer, inner;

            acc0 = vis_fpadd16(
                vis_fpadd16(lum, vis_fmul8x16(vcb, k01)),
                vis_fpadd16(vis_fmul8x16(vcr, k02), c_0));
            acc1 = vis_fpadd16(
                vis_fpadd16(lum, vis_fmul8x16(vcb, k11)),
                vis_fpadd16(vis_fmul8x16(vcr, k12), c_1));
            acc2 = vis_fpadd16(
                vis_fpadd16(lum, vis_fmul8x16(vcb, k21)),
                vis_fpadd16(vis_fmul8x16(vcr, k22), c_2));

            pair = vis_fpack16_pair(acc0, acc1);

            /* first and third output words come from the byte shuffle */
            outer = vis_bshuffle(pair,
                                 vis_freg_pair(vis_fpack16(acc2), fzero));

            /* middle output word */
            inner = vis_fpack32(pair, pair);
            inner = vis_fpmerge(vis_read_hi(inner), vis_read_lo(inner));

            out[0] = vis_read_hi(outer);
            out[1] = vis_read_hi(inner);
            out[2] = vis_read_lo(outer);
            out += 3;
        }

        /* final group of the chunk: identical arithmetic, masked stores */
        if ((mlib_u8 *)out <= last) {
            mlib_f32 *stage32 = (mlib_f32 *)stage64;
            mlib_f32 vy = *src_y;
            mlib_f32 vcb = *src_cb;
            mlib_f32 vcr = *src_cr;
            mlib_d64 lum = vis_fmul8x16(vy, k_0);
            mlib_d64 acc0, acc1, acc2, pair, outer, inner;

            acc0 = vis_fpadd16(
                vis_fpadd16(lum, vis_fmul8x16(vcb, k01)),
                vis_fpadd16(vis_fmul8x16(vcr, k02), c_0));
            acc1 = vis_fpadd16(
                vis_fpadd16(lum, vis_fmul8x16(vcb, k11)),
                vis_fpadd16(vis_fmul8x16(vcr, k12), c_1));
            acc2 = vis_fpadd16(
                vis_fpadd16(lum, vis_fmul8x16(vcb, k21)),
                vis_fpadd16(vis_fmul8x16(vcr, k22), c_2));

            pair = vis_fpack16_pair(acc0, acc1);
            outer = vis_bshuffle(pair,
                                 vis_freg_pair(vis_fpack16(acc2), fzero));
            inner = vis_fpack32(pair, pair);
            inner = vis_fpmerge(vis_read_hi(inner), vis_read_lo(inner));

            edge = vis_edge8(out, last);

            /*
             * Partial stores want an 8-byte aligned address: if `out` is
             * in the second word of a double, back up one word and stage
             * the data one word later to compensate.
             */
            if ((mlib_addr)out & 7) {
                out--;
                stage32++;
            }

            stage32[0] = vis_read_hi(outer);
            stage32[1] = vis_read_hi(inner);
            stage32[2] = vis_read_lo(outer);

            vis_pst_8(stage64[0], out, edge);

            out += 2;
            edge = vis_edge8(out, last);

            if ((mlib_u8 *)out <= last)
                vis_pst_8(stage64[1], out, edge);
        }

        y += chunk;
        cb += chunk;
        cr += chunk;
        rgb += 3 * chunk;
        size -= chunk;
    }

    return (MLIB_SUCCESS);
}
/*
 * Converts packed 4-bytes-per-pixel input (argb) into three planar
 * outputs: y receives n bytes, cb and cr receive n >> 1 bytes each.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * Each main-loop iteration consumes 8 pixels (four mlib_d64 loads) and
 * stores one mlib_d64 of y and one mlib_f32 each of cb and cr. The
 * remainder is handled once more with edge-masked partial stores.
 *
 * argb and y are accessed as mlib_d64, cb and cr as mlib_f32, so matching
 * alignment is presumably required -- confirm against the public contract.
 * The per-channel arithmetic lives in CHANNELSEPARATE_U8_422 and
 * CHANNELWEIGHT_U8_2p, which are defined outside this block.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
/* one-past-the-end pointers, computed before n is rescaled below */
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;

/* first-row weights are scaled by 8192, the other two rows by 4096 */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(2 << 3);

/* from here on n counts complete 8-pixel groups */
	n = n >> 3;

#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
/* first output plane: two results (z0, z1) are kept and merged */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

/* second output plane: z0 and z1 are summed into a single result */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

/* third output plane, same scheme as the second */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

		py++;
		pcb++;
		pcr++;
	}

/* leftover pixels: compute one more group, store only the valid bytes */
	if ((mlib_u8 *)pcb < cbend) {
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

/*
 * Only the first double is read with a plain load; the rest use
 * vis_ld_d64_nf (presumably a non-faulting load, since they may lie past
 * the end of the input -- confirm).
 */
		sd01 = (*sp++);
		sd23 = vis_ld_d64_nf(sp); sp++;
		sd45 = vis_ld_d64_nf(sp); sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);
		cmask = vis_edge8(pcb, cbend - 1);

/*
 * The 4 result bytes must land in whichever half of the 8-byte store
 * window pcb points into: mask bits in the upper nibble mean pcb starts
 * the window, otherwise the window starts one word earlier and the data
 * goes into the low half.
 */
		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1,
				cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1,
				cmask);
		}
	}
	return (MLIB_SUCCESS);
}
Example #7
0
/*
 * Builds an interleaved vector z from two 32-bit vectors: the vector
 * paths below store rr[0], ii[0], rr[1], ii[1], ... (2 * n words).
 *
 * z	destination, 2 * n words
 * rr	first ("real") source vector, n words
 * ii	second ("imaginary") source vector, n words
 * n	number of elements in each source vector
 *
 * Returns MLIB_SUCCESS when n > 0 and MLIB_FAILURE otherwise (at the end
 * of the vector path).
 *
 * NOTE(review): for n < 4 the MERGE macro (defined elsewhere) is invoked;
 * it presumably performs the scalar merge and returns from the function.
 * If it does not return, execution falls through into the vector path --
 * confirm.
 */
mlib_status
__mlib_VectorMerge_S32C_S32(
	mlib_s32 *z,
	const mlib_s32 *rr,
	const mlib_s32 *ii,
	mlib_s32 n)
{
/* pointer to real source vector */
	mlib_u32 *src_r = (mlib_u32 *)rr;

/* pointer to imaginary source vector */
	mlib_u32 *src_i = (mlib_u32 *)ii;

/* pointer to resultant complex vector */
	mlib_u32 *dst = (mlib_u32 *)z;
/* len: output words still to be written; odd: dst needed a leading word */
	mlib_s32 len = n + n, odd = 0, i;
	mlib_d64 d1, d2, d3, d4;

	vis_write_bmask(0x012389ab, 0);

	if (n < 4) {
		MERGE(mlib_s32,
			rr,
			ii,
			n,
			z);
	}

/* make dst 8-byte aligned by emitting the first real word on its own */
	if (((mlib_addr)dst) & 7) {
		(*dst++) = (*src_r++);
		len--;
		odd = 1;
	}

/* both sources share the same alignment modulo 8: 64-bit loads usable */
	if (!(((mlib_addr)src_r ^ (mlib_addr)src_i) & 7)) {

		if (odd) {

/*
 * Output phase is shifted by one word, so pairs are written as
 * (imaginary, real). First bring the sources to 8-byte alignment.
 */
			if (((mlib_addr)src_i & 7) && (len >= 2)) {
				(*dst++) = (*src_i++);
				(*dst++) = (*src_r++);
				len -= 2;
			}
#pragma pipeloop(0)
			for (i = 0; i <= (len - 4); i += 4) {
				d1 = *((mlib_d64 *)src_i);
				src_i += 2;
				d2 = *((mlib_d64 *)src_r);
				src_r += 2;
/* MERGE32 presumably derives d3, d4 from d1, d2 -- defined elsewhere */
				MERGE32;
				((mlib_d64 *)dst)[0] = d3;
				((mlib_d64 *)dst)[1] = d4;
				dst += 4;
			}

			if (i <= len - 2) {
				(*dst++) = (*src_i++);
				(*dst++) = (*src_r++);
			}

/* the imaginary word that closes the shifted sequence */
			(*dst++) = (*src_i++);
		} else {
			if ((mlib_addr)src_i & 7) {
				(*dst++) = (*src_r++);
				(*dst++) = (*src_i++);
				len -= 2;
			}
#pragma pipeloop(0)
			for (i = 0; i <= (len - 4); i += 4) {
				d1 = *((mlib_d64 *)src_r);
				src_r += 2;
				d2 = *((mlib_d64 *)src_i);
				src_i += 2;
				MERGE32;
				((mlib_d64 *)dst)[0] = d3;
				((mlib_d64 *)dst)[1] = d4;
				dst += 4;
			}

			if (i <= len - 2) {
				(*dst++) = (*src_r++);
				(*dst++) = (*src_i++);
			}
		}
	} else {
/* sources differ in alignment: 32-bit loads, one 64-bit store per pair */
		mlib_f32 fsrc_r, fsrc_i;

		if (odd) {
#pragma pipeloop(0)
			for (i = 0; i <= (len - 2); i += 2) {
				fsrc_r = *((mlib_f32 *)src_r);
				src_r++;
				fsrc_i = *((mlib_f32 *)src_i);
				src_i++;
				d1 = vis_freg_pair(fsrc_i, fsrc_r);
				((mlib_d64 *)dst)[0] = d1;
				dst += 2;
			}

/* the imaginary word that closes the shifted sequence */
			(*dst++) = (*src_i++);
		} else {
/* these declarations shadow the identically named pair above */
			mlib_f32 fsrc_r, fsrc_i;

#pragma pipeloop(0)
			for (i = 0; i < len; i += 2) {
				fsrc_r = *((mlib_f32 *)src_r);
				src_r++;
				fsrc_i = *((mlib_f32 *)src_i);
				src_i++;
				d1 = vis_freg_pair(fsrc_r, fsrc_i);
				((mlib_d64 *)dst)[0] = d1;
				dst += 2;
			}
		}
	}

	return ((n > 0) ? MLIB_SUCCESS : MLIB_FAILURE);
}
/*
 * 3x3 convolution of an 8-bit image with 4 bytes per pixel, writing only
 * the channels selected by cmask.
 *
 * dst		destination image
 * src		source image
 * kernel	convolution coefficients (consumed by LOAD_KERNEL_INTO_FLOAT)
 * scalef_expon	scaling exponent; the GSR scale is set to 31 - scalef_expon
 * cmask	channel mask, low 4 bits used
 *
 * Output starts one row down and one pixel (4 bytes) in from the
 * destination origin and covers (dw - 2) pixels by (dh - 2) rows.
 * Returns MLIB_SUCCESS at the end of the visible path.
 *
 * Most locals (adr_src, adr_dst, slb, dlb, dw, dh, buff_src, sbuf*, dbuf,
 * da, dend, s1..s3, ddst, k1k2..k9k9, ...) are assigned inside macros
 * defined outside this block (GET_SRC_DST_PARAMETERS,
 * LOAD_KERNEL_INTO_FLOAT, PREPARE_INTERM_BUFFERS, LOOP_INI, ...); the
 * comments below describe only what is visible here.
 */
mlib_status
mlib_v_conv3x3_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data */
	mlib_d64 d1, d2, d_1, d_2, d21, d22;

/* data */
	mlib_d64 d3, d_3, d23;
	mlib_f32 k1k2, k3k4, k5k6, k7k8, k9k9;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1, sd00;
	mlib_s32 emask, cmask1;
	mlib_s32 rval, gsr_scale, i, j;

/* pack scale and the matching rounding constant from mlib_round_8 */
	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

/* replicate the 4-bit channel mask into a 16-bit pattern */
	cmask = ((cmask & 0xf) << 4) + (cmask & 0xf);
	cmask = (cmask << 8) + (cmask);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

/* intermediate row stride in mlib_d64 units: 4 bytes/pixel plus padding */
	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

/* from here on dw is the output row length in bytes, dh the output rows */
	dw -= 2;
	dw *= 4;
	dh -= 2;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
/* first output byte: one row down, one pixel in */
	d_a = adr_dst + dlb + 4;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

/* bring in the next source row */
		PREPARE_TO_LOAD_LINE(sbuf3, sa2);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(8);

/* offset 4 = one pixel: vis_faligndata below yields the middle column */
		vis_alignaddr(s1, 4);
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;

/* 8 output bytes per iteration; CONV_* presumably accumulate into out0/out1 */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k3k4);
			CONV_AU(d3, k7k8);
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			d23 = vis_faligndata(d3, d_3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k5k6);
			CONV_AL(d23, k7k8);
			CONV_AU(d_1, k3k4);
			CONV_AL(d_2, k5k6);
			CONV_AU(d_3, k9k9);
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

/* copy the packed row from dbuf to the (possibly unaligned) destination */
		ddst = dbuf;
/* prepare the destination addresses */
		dp = (mlib_d64 *)((mlib_addr)da & (~7));
/* i <= 0: negated misalignment of da within its 8-byte word */
		i = (mlib_addr)dp - (mlib_addr)da;
/* rotate the channel mask to match that misalignment */
		cmask1 = cmask >> (-i);
		ddst = vis_alignaddr(ddst, i);
/* generate edge mask for the start point */
		emask = vis_edge8(da, dend);
		sd1 = ddst[0];

/* partial leading word: store only bytes inside the row and in cmask */
		if (emask != 0xff) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd0 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd0, dp++, emask & cmask1);
			ddst++;
			i += 8;
		}
/* full 8-byte words, channel-masked only */
#pragma pipeloop(0)
		for (; i <= (dw - 8); i += 8) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd00 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd00, dp++, cmask1);
			ddst++;
		}

/* partial trailing word */
		if (i < dw) {
			sd0 = vis_faligndata(sd1, ddst[1]);
			emask = vis_edge8(dp, dend);
			vis_pst_8(sd0, dp, emask & cmask1);
		}

		sa2 = sa2 + slb;
		d_a += dlb;
	}

/* buff_src is presumably allocated by PREPARE_INTERM_BUFFERS -- confirm */
	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
void
mlib_v_ImageChannelExtract_32_32(
    const mlib_f32 *src,
    mlib_s32 slb,
    mlib_f32 *dst,
    mlib_s32 dlb,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 cmask)
{
	mlib_d64 *sp, *dp;
	mlib_f32 *sa, *sl, *da, *dl, *dend;
	mlib_d64 sd0, sd1, sd2, sd3, dd0;
	mlib_s32 soff, xsize, cmask1, emask;
	mlib_s32 i, j;

	if (width <= 0)
		return;

	if ((8 * width == dlb) && (2 * slb == 3 * dlb)) {
		width *= height;
		height = 1;
	}

	width *= 2;

	if (cmask == 3) {
		src += 1;
		cmask = 6;
	}

	sa = sl = (void *)src;
	da = dl = dst;

	for (j = 0; j < height; j++) {
		cmask1 = cmask;
		xsize = width;
		dend = da + width - 1;

		if (((mlib_addr)da & 7) != 0) {
			(*da++) = *sa;
			sa++;
			xsize -= 1;
			cmask1 = ((cmask1 << 1) + 1) & 7;
		}

		dp = (mlib_d64 *)da;
		sp = (mlib_d64 *)((mlib_addr)sa & (~7));
		soff = (sa - (mlib_f32 *)sp) & 1;

		if (cmask1 != 5) {
			if (cmask1 == 3) {
				cmask1 <<= soff;
				sp += soff;
			} else
				cmask1 >>= soff;

			vis_write_bmask(0x456789AB, 0);

			if (cmask1 == 3) {

#pragma pipeloop(0)
				for (i = 0; i < xsize - 3; i += 4) {
					sd0 = (*sp++);
					sd1 = (*sp++);
					(*dp++) = vis_bshuffle(sd0, sd1);
					(*dp++) = (*sp++);
				}

/* end point handling */

				if ((mlib_addr)dp <= (mlib_addr)dend) {
					emask = vis_edge32(dp, dend);
					sd0 = vis_ld_d64_nf(sp);
					sd1 = vis_ld_d64_nf(sp + 1);
					dd0 = vis_bshuffle(sd0, sd1);
					vis_pst_32(dd0, dp++, emask);

					if ((mlib_addr)dp <= (mlib_addr)dend) {
						emask = vis_edge32(dp, dend);
						dd0 = vis_ld_d64_nf(sp + 2);
						vis_pst_32(dd0, dp, emask);
					}
				}
			} else {

#pragma pipeloop(0)
				for (i = 0; i < xsize - 3; i += 4) {
					(*dp++) = (*sp++);
					sd0 = (*sp++);
					sd1 = (*sp++);
					(*dp++) = vis_bshuffle(sd0, sd1);
				}

/* end point handling */

				if ((mlib_addr)dp <= (mlib_addr)dend) {
					emask = vis_edge32(dp, dend);
					dd0 = vis_ld_d64_nf(sp);
					vis_pst_32(dd0, dp++, emask);

					if ((mlib_addr)dp <= (mlib_addr)dend) {
						emask = vis_edge32(dp, dend);
						sd0 = vis_ld_d64_nf(sp + 1);
						sd1 = vis_ld_d64_nf(sp + 2);
						dd0 = vis_bshuffle(sd0, sd1);
						vis_pst_32(dd0, dp, emask);
					}
				}
			}
		} else {
			if (soff == 0) {
				vis_write_bmask(0x012389AB, 0);

#pragma pipeloop(0)
				for (i = 0; i < xsize - 3; i += 4) {
					sd0 = (*sp++);
					sd1 = (*sp++);
					sd2 = (*sp++);
					(*dp++) = vis_bshuffle(sd0, sd1);
					(*dp++) =
					    vis_freg_pair(vis_read_lo(sd1),
					    vis_read_lo(sd2));
				}

/* end point handling */

				if ((mlib_addr)dp <= (mlib_addr)dend) {
					emask = vis_edge32(dp, dend);
					sd0 = vis_ld_d64_nf(sp);
					sd1 = vis_ld_d64_nf(sp + 1);
					dd0 = vis_bshuffle(sd0, sd1);
					vis_pst_32(dd0, dp++, emask);

					if ((mlib_addr)dp <= (mlib_addr)dend) {
						emask = vis_edge32(dp, dend);
						sd2 = vis_ld_d64_nf(sp + 2);
						dd0 =
						    vis_freg_pair(vis_read_lo
						    (sd1), vis_read_lo(sd2));
						vis_pst_32(dd0, dp, emask);
					}
				}
			} else {
				vis_write_bmask(0x4567CDEF, 0);
				sd0 = vis_ld_d64_nf(sp);
				sp++;

#pragma pipeloop(0)
				for (i = 0; i < xsize - 3; i += 4) {
					sd1 = (*sp++);
					sd2 = (*sp++);
					sd3 = (*sp++);
					(*dp++) = vis_bshuffle(sd0, sd1);
					(*dp++) =
					    vis_freg_pair(vis_read_hi(sd2),
					    vis_read_hi(sd3));
					sd0 = sd3;
				}

/* end point handling */

				if ((mlib_addr)dp <= (mlib_addr)dend) {
					emask = vis_edge32(dp, dend);
					sd1 = vis_ld_d64_nf(sp);
					dd0 = vis_bshuffle(sd0, sd1);
					vis_pst_32(dd0, dp++, emask);

					if ((mlib_addr)dp <= (mlib_addr)dend) {
						emask = vis_edge32(dp, dend);
						sd2 = vis_ld_d64_nf(sp + 1);
						sd3 = vis_ld_d64_nf(sp + 2);
						dd0 =
						    vis_freg_pair(vis_read_hi
						    (sd2), vis_read_hi(sd3));
						vis_pst_32(dd0, dp, emask);
					}
				}
			}
		}

		sa = sl = (mlib_f32 *)((mlib_u8 *)sl + slb);
		da = dl = (mlib_f32 *)((mlib_u8 *)dl + dlb);
	}
Example #10
0
/*
 * Matrix product z = x * y with 16-bit results.
 *
 * z	result, m rows of n mlib_s16 values
 * x	left operand, m rows of l bytes (element type STYPE)
 * y	right operand, l rows of n bytes
 * m, l, n	dimensions; all must be positive or MLIB_FAILURE is returned
 *
 * y is first transposed into a scratch buffer (one zero-padded line of
 * 8 * l8 bytes per column, each byte XORed with 0x80). For every row of
 * x the row is widened into 16-bit lanes and combined with two transposed
 * columns at a time through the LOAD / MUL / SUM macros, which are defined
 * outside this block.
 *
 * The scratch buffer lives on the stack when it fits in MAX_SIZE doubles
 * and is heap-allocated otherwise.
 */
mlib_status
__mlib_MatrixMul_S16_S8_Mod(
	mlib_s16 *z,
	const STYPE * x,
	const STYPE * y,
	mlib_s32 m,
	mlib_s32 l,
	mlib_s32 n)
{
	mlib_d64 *px, *buff_x, *buff_y, *pbuff_x, *pbuff_y;
	mlib_d64 array[MAX_SIZE];
	mlib_d64 xx, x0, x1, y0, y1, ds0, ds1, dr0, dr1, dr2, dr3;
	mlib_s32 size, i, j, k, l8;

	if (!((m > 0) && (l > 0) && (n > 0))) {
		return (MLIB_FAILURE);
	}

/* l rounded up to whole 8-byte words */
	l8 = (l + 7) / 8;
/* transposed y (l8 * n) + widened x row (2 * l8) + 4 spare doubles */
	size = l8 * n + 2 * l8 + 4;

	if (size <= MAX_SIZE) {
		buff_y = array;
	} else {
		buff_y = (mlib_d64 *)__mlib_malloc(size * sizeof (mlib_d64));

/*
 * Out of memory: delegate to the generic implementation.
 * NOTE(review): the fallback is called with type_U8 / mode_Sat although
 * this entry point is the S8 / Mod variant -- looks inconsistent, confirm
 * against mlib_MatrixMul_type's definition.
 */
		if (buff_y == NULL) {
			return mlib_MatrixMul_type(type_U8, type_U8, mode_Sat,
				x, y, m, l, n, n, z);
		}
	}

	buff_x = buff_y + l8 * n;
	pbuff_y = buff_y;

/* transpose y matrix */
	for (i = 0; i < n; i++) {
		mlib_u8 *py = (mlib_u8 *)y + i;
		mlib_u8 *pp = (mlib_u8 *)pbuff_y;

/* four column elements per step, stored as two biased 16-bit pairs */
		for (j = 0; j <= (l - 4); j += 4) {
			((mlib_s16 *)pp)[0] = ((py[0] << 8) | py[n]) ^ 0x8080;
			((mlib_s16 *)pp)[1] =
				((py[2 * n] << 8) | py[3 * n]) ^ 0x8080;
			py += 4 * n;
			pp += 4;
		}

/* remaining column elements, one byte at a time */
		for (; j < l; j++) {
			(*pp++) = *py ^ 0x80;
			py += n;
		}

/* zero padding up to the 8-byte line length */
		for (; j < 8 * l8; j++) {
			(*pp++) = 0;
		}

		pbuff_y += l8;
	}

	for (j = 0; j < m; j++) {
		mlib_s32 x_sum = 0;

		for (i = 0; i < l; i++) {
			x_sum += x[i];
		}

/*
 * 128 * sum(x row); subtracted from every result below, presumably to
 * cancel the 0x80 bias applied to y during the transpose -- confirm
 * against the MUL macro.
 */
		x_sum <<= 7;

		pbuff_x = buff_x;
		pbuff_y = buff_y;

/* copy x line */
		px = vis_alignaddr((void *)x, 0);
		x1 = vis_ld_d64_nf(px);
		px++;
		xx = 0;
/* each source byte is merged with a zero byte into a 16-bit lane */
		for (i = 0; i < l8; i++) {
			x0 = x1;
			x1 = vis_ld_d64_nf(px);
			px++;
			xx = vis_faligndata(x0, x1);
			pbuff_x[2 * i] =
				vis_fpmerge(vis_read_hi(xx), vis_fzeros());
			pbuff_x[2 * i + 1] =
				vis_fpmerge(vis_read_lo(xx), vis_fzeros());
		}

/* loop on y lines */
		for (i = 0; i < n; i += 2) {
/* this px shadows the outer px for the duration of the iteration */
			mlib_d64 *px = pbuff_x;
			mlib_d64 *py0 = pbuff_y;
/* for an odd last column both accumulators read the same line */
			mlib_d64 *py1 = (i + 1 < n) ? (py0 + l8) : py0;

			ds0 = ds1 = vis_fzero();

			LOAD;
			MUL;
			LOAD;

#pragma pipeloop(0)
			for (k = 0; k < l8; k++) {
				SUM;
				MUL;
				LOAD;
			}

/* fold each accumulator's high and low halves together */
			ds0 = vis_freg_pair(vis_fpadd16s(vis_read_hi(ds0),
				vis_read_lo(ds0)),
				vis_fpadd16s(vis_read_hi(ds1),
				vis_read_lo(ds1)));

/* lanes 0,1 hold column i; lanes 2,3 hold column i + 1 */
			z[i] = ((mlib_s16 *)&ds0)[0] + ((mlib_s16 *)&ds0)[1] -
				x_sum;

			if (i + 1 < n) {
				z[i + 1] =
					((mlib_s16 *)&ds0)[2] +
					((mlib_s16 *)&ds0)[3] - x_sum;
			}

			pbuff_y += 2 * l8;
		}

		z += n;
		x += l;
	}

/* release the scratch buffer only when it came from the heap */
	if (size > MAX_SIZE) {
		__mlib_free(buff_y);
	}

	return (MLIB_SUCCESS);
}
Example #11
0
/*
 * Converts n 32-bit signed values to 16-bit signed values, clamping each
 * one to [MLIB_S16_MIN, MLIB_S16_MAX].
 *
 * z	destination, n mlib_s16 values
 * x	source, n mlib_s32 values
 * n	number of elements
 *
 * Returns MLIB_SUCCESS at the end of the vector path.
 *
 * NOTE(review): for n < 16 the PACK_S_S macro (defined elsewhere) is
 * invoked; it presumably does the scalar conversion and returns. If it
 * does not return, execution continues into the vector path -- confirm.
 */
mlib_status
__mlib_VectorConvert_S16_S32_Sat(
	mlib_s16 *z,
	const mlib_s32 *x,
	mlib_s32 n)
{
	mlib_s32 *src = (void *)x;
	mlib_s16 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7, d8;
	mlib_s32 c;
	mlib_s32 len_64, even_length, rest_64, length = n, i;

	if (n < 16) {
		PACK_S_S(mlib_s32, mlib_s16, MLIB_S16_MAX, MLIB_S16_MIN);
	}

/*
 * First try to align destination address for 8 bytes.
 * src and dst both advance here, so all later indexing is relative to
 * the aligned position.
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = *src) > MLIB_S16_MAX ? MLIB_S16_MAX
			: (c < MLIB_S16_MIN ? MLIB_S16_MIN : c);
		src++;
		length--;
	}

/* pack scale 16 for vis_fpackfix */
	vis_write_gsr(16 << 3);

/* len_64: whole output doubles (4 values each); rest_64: leftover values */
	rest_64 = length & 3;
	len_64 = length >> 2;
	even_length = len_64 << 2;
	ddst = (mlib_d64 *)dst;

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 * The assignment in the condition is intentional: i becomes 1 when
 * len_64 is odd, so the main loop always runs over an even count.
 */

		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d1),
				vis_fpackfix(d2));
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = (*dsrc++);
			d4 = (*dsrc++);
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d1),
				vis_fpackfix(d2));
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d3),
				vis_fpackfix(d4));
		}
	} else {

/*
 * Source address is arbitrary aligned. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
/* d4 always carries the last loaded double into the next step */
		d4 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		if (i = (len_64 & 1)) {
			d1 = d4;
			d2 = (*dsrc++);
/* look-ahead load uses vis_ld_d64_nf since it may lie past the data */
			d4 = vis_ld_d64_nf(dsrc); dsrc++;
			d5 = vis_faligndata(d1, d2);
			d6 = vis_faligndata(d2, d4);
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d5),
				vis_fpackfix(d6));
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d0 = d4;
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = (*dsrc++);
			d4 = vis_ld_d64_nf(dsrc); dsrc++;
			d5 = vis_faligndata(d0, d1);
			d6 = vis_faligndata(d1, d2);
			d7 = vis_faligndata(d2, d3);
			d8 = vis_faligndata(d3, d4);
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d5),
				vis_fpackfix(d6));
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d7),
				vis_fpackfix(d8));
		}
	}

/* scalar tail: the 0..3 values that did not fill a whole output double */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];
		dst[even_length + i] = c > MLIB_S16_MAX ? MLIB_S16_MAX
			: (c < MLIB_S16_MIN ? MLIB_S16_MIN : c);
	}

	return (MLIB_SUCCESS);
}
Example #12
0
/*
 * Run the emphasize filter described by `filter` over the 16-bit samples
 * in `src` and store the packed 16-bit results in `dst`.
 *
 *   dst    - destination buffer; elements dst[0] .. dst[2 * n - 1] are
 *            written
 *   src    - source buffer; src[2 * n - 1] is the last element read for
 *            the saved state
 *   filter - filter state; must point to an mlib_emphasize_struct whose
 *            type field is MLIB_EMPH
 *   n      - number of sample pairs (presumably stereo frames, judging by
 *            the S16S suffix - TODO confirm)
 *
 * Returns MLIB_FAILURE when filter, src or dst is NULL, when n <= 0 or
 * when the filter type is not MLIB_EMPH; MLIB_SUCCESS otherwise.
 *
 * On success the last two source samples are saved into the filter state
 * (v16_last0 / v16_last1), where the next call picks them up.
 *
 * Nothing is read through `filter` until the argument check has passed,
 * so a NULL filter is rejected instead of being dereferenced.
 */
mlib_status
__mlib_SignalEmphasize_S16S_S16S_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
	mlib_emphasize_struct *fist = filter;
	mlib_d64 w_maskand0, w_maskor0, w_maskand1, w_maskor1;
	mlib_f32 v_mask, v_alpha;
	mlib_s16 *fdst;
	mlib_d64 *dpd, *dps, *dsrct1;
	mlib_d64 w_dst, w_src, w_src0, w_src1, w_src2, w_lsrc;
	mlib_d64 dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7;
	mlib_s32 i, times, t1, t2;

/* check for obvious errors */

	if ((fist == NULL) || (n <= 0) || (src == 0) || (dst == 0) ||
	    (fist->type != MLIB_EMPH)) {
		return (MLIB_FAILURE);
	}

/*
 * Arguments are valid: only now read the filter state and build the
 * masks used to splice the saved samples into the first source words.
 */
	w_maskand0 = vis_to_double(0xFFFFFFFF, 0xFFFF);
	w_maskor0 = vis_freg_pair(0.f, fist->v16_last0);
	w_maskand1 = vis_to_double(0xFFFFFFFF, 0xFFFF0000);
	w_maskor1 = vis_freg_pair(0.f, fist->v16_last1);
	v_mask = vis_to_float(0x80008000);
	v_alpha = fist->v_alpha;

/* address of the last destination element */
	fdst = dst + n + n - 1;

	vis_write_gsr(1 << 3);
	w_maskor0 = vis_fand(w_maskor0, w_maskand1);
	w_maskor1 = vis_fand(w_maskor1, w_maskand0);

/* rotate the masks by the negated source offset within its 8-byte word */
	vis_alignaddr((void *)(-(mlib_addr)src), 0);
	w_maskand0 = vis_faligndata(w_maskand0, w_maskand0);
	w_maskor0 = vis_faligndata(w_maskor0, w_maskor0);
	w_maskand1 = vis_faligndata(w_maskand1, w_maskand1);
	w_maskor1 = vis_faligndata(w_maskor1, w_maskor1);

/*
 * dpd   - 8-byte aligned destination pointer
 * times - number of 8-byte destination words between the first and the
 *         last one
 * t1    - offset that lines the source up with the aligned destination
 * t2    - same, 4 bytes (one sample pair) earlier
 */
	dpd = vis_alignaddr(dst, 0);
	times = (mlib_d64 *)vis_alignaddr(fdst, 0) - dpd;
	t1 = -((mlib_addr)(dst) & 7);
	t2 = t1 - 4;
	dps = vis_alignaddr((void *)src, t2);
	w_src0 = vis_ld_d64_nf(dps);
	dps++;
	w_src1 = vis_ld_d64_nf(dps);
	dps++;

	if ((((mlib_addr)dst ^ (mlib_addr)src) & 7)) {

/* dst and src have different offsets inside their 8-byte words */

		if (((mlib_addr)dps - (mlib_addr)src) >= 6) {
			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand0, w_src1);
			w_src1 = vis_for(w_maskor0, w_src1);
		}

		if (((mlib_addr)dps - (mlib_addr)src) >= 8) {
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
		}

		w_lsrc = vis_faligndata(w_src0, w_src1);
		dsrct1 = vis_alignaddr((void *)src, t1);

		if (dps - 2 != dsrct1) {

/* current and delayed data start in different source words */

			w_src2 = *dps;
			dps++;
			w_src = vis_faligndata(w_src1, w_src2);

			MLIB_MUL8;

			if ((mlib_addr)dst & 7) {

/* unaligned destination: first word goes out as a partial store */

				times--;
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = *dps;
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dps++;

				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);

			MLIB_MIX;

			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);
			dps++;

/* each pass stores the previous result and computes the next one */
			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dpd++;
				dps++;
			}
		} else {

/* current and delayed data come from the same pair of source words */

			w_src = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;

			if ((mlib_addr)dst & 7) {

/* unaligned destination: first word goes out as a partial store */

				times--;
				w_src0 = w_src1;
				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;

				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			w_src0 = w_src1;

			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			MLIB_MIX;
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			dps++;

/* each pass stores the previous result and computes the next one */
			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;
				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;

				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;
				dpd++;
			}
		}
	} else {

/* dst and src share the same offset inside their 8-byte words */

		w_src = w_src1;

		if ((mlib_addr)src & 7) {

/* shared but non-zero offset: first word goes out as a partial store */

			times--;

			if (((mlib_addr)src & 7) == 2) {
				w_src0 = vis_fand(w_maskand0, w_src0);
				w_src0 = vis_for(w_maskor0, w_src0);
			} else {
				w_src1 = vis_fand(w_maskand0, w_src1);
				w_src1 = vis_for(w_maskor0, w_src1);
			}

			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
			w_lsrc = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;

			w_src0 = w_src1;
			w_src1 = *dps;
			w_src = w_src1;
			w_lsrc = vis_faligndata(w_src0, w_src1);
			dps++;

			MLIB_MIX;

			w_dst = vis_fpackfix_pair(dr2, dr3);
			vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
			dpd++;
		} else {

/* both buffers 8-byte aligned: saved samples go into the word before src */

			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
			w_lsrc = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;
		}

		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);

		MLIB_MIX;

		w_src1 = w_src;
		w_dst = vis_fpackfix_pair(dr2, dr3);
		dps++;
		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);
		dps++;

/* each pass stores the previous result and computes the next one */
		for (i = 0; i < times; i++) {
			*dpd = w_dst;

			MLIB_MIX;

			w_src1 = w_src;
			w_src = vis_ld_d64_nf(dps);
			w_lsrc = vis_faligndata(w_src1, w_src);
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			dpd++;

		}
	}

/* last destination word: partial store limited to fdst */
	if (times >= 0) {
		vis_pst_16(w_dst, dpd, vis_edge16(dpd, fdst));
	}

/* remember the final sample pair for the next call */
	((mlib_s16 *)&fist->v16_last0)[0] = src[2 * n - 2];
	((mlib_s16 *)&fist->v16_last1)[1] = src[2 * n - 1];

	return (MLIB_SUCCESS);
}
/*
 * 5x5 convolution of an 8-bit image, one channel at a time, for the
 * channels selected by cmask.  Edge pixels are not written: the output
 * starts at row 2, column 2 of dst and covers (width - 4) x (height - 4)
 * pixels.
 *
 *   dst          - destination image
 *   src          - source image (channel count and stride are taken from it)
 *   kernel       - 25 kernel coefficients, loaded by LOAD_KERNEL_INTO_FLOAT()
 *                  (macro defined elsewhere)
 *   scalef_expon - scaling exponent of the kernel; the GSR scale field is
 *                  set to 31 - scalef_expon
 *   cmask        - channel mask; bit 0 selects the last channel, bit 1 the
 *                  one before it, and so on
 *
 * Returns MLIB_FAILURE if the intermediate buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 *
 * The selected channel is gathered into five contiguous row buffers
 * (sbuf1..sbuf5).  Each output row is computed in two passes: the first
 * accumulates kernel rows 1-3 into dbuf as 16-bit partial sums, the second
 * adds kernel rows 4-5 and packs the result to 8 bits.
 */
mlib_status
mlib_v_conv5x5_8nw_mask(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *dend, *adr_src;

/* pointers to src rows */
	mlib_u8 *sa, *sa2, *sa3, *sa4, *sa5, *sa6, *sa_6, *prow;

/* pointers to rows in interm. src buf */
	mlib_u8 *buff_src, *sbuf1, *sbuf2, *sbuf3, *sbuf4, *sbuf5, *s_buf1;

/* pointers to row in interm. dst buf */
	mlib_u8 *dbuf, *d_buf;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3, *s4, *s5;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst, *ddst1;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1, tmp0, tmp1, rnd;

/* data */
	mlib_d64 d1, d2, d3, d4, d5, d_1, d_2, d_3, d_4, d_5;

/* temp. data, used in faligndata */
	mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;

/* shifted data */
	mlib_d64 d21, d22, d23, d24, d25;
	mlib_f32 k1k2, k17k18, k19k20, k21k22, k23k24, k25;
	mlib_f32 k3k4, k5k6, k7k8, k9k10, k11k12, k13k14, k15k16;
	mlib_s32 rval, gsr_scale, i, j, nchannel, nchannel1, chan, testchan;

/* temp, used in load-store */
	mlib_s32 t1, t2, t3, t4, t5, t6, t7, t8, tt1, tt2, tt3, tt4, tt5, tt6,
	    tt7, tt8;

	adr_src = mlib_ImageGetData(src);
	adr_dst = mlib_ImageGetData(dst);
	nchannel = mlib_ImageGetChannels(src);
	slb = mlib_ImageGetStride(src);
	dlb = mlib_ImageGetStride(dst);
	dh = mlib_ImageGetHeight(dst);
	dw = mlib_ImageGetWidth(dst);

/* buf_slb - 8-byte aligned */
	buf_slb = (dw + 16) & (~7);
/*
 * alloc. interm. src and dst buffer: five source rows plus two rows'
 * worth of space starting at dbuf, plus 8 bytes of alignment slack
 */
	buff_src = (mlib_u8 *)__mlib_malloc(7 * buf_slb * sizeof (mlib_u8) + 8);

	if (buff_src == NULL)
		return (MLIB_FAILURE);
/* edge - no write */
	dw -= 4;
	dh -= 4;

/*
 * The 8x16 mult has built-in 8-bit R shift, and fpack16 has 7-bit
 * fixed R shift (preceded by variable-bit L shift controlled by GSR
 * scalefactor field). Thus net R shift = (8+7)-(GSR.scalefactor_field),
 * so GSR.scalefactor_field = 15-(net R shift):
 *
 * NOTE(review): the code below uses 31 - scalef_expon rather than 15 - ...;
 * presumably the kernel loaded by LOAD_KERNEL_INTO_FLOAT() carries the
 * remaining 16 bits of scaling - confirm against the macro.
 */
	gsr_scale = 31 - scalef_expon;
/* "+ 1" sets the GSR align offset used by the vis_faligndata() calls below */
	vis_write_gsr((gsr_scale << 3) + 1);
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));
/* first row buffer starts at the next 8-byte boundary inside buff_src */
	sbuf1 = (mlib_u8 *)((mlib_addr)(buff_src + 8) & (~7));
	sbuf2 = sbuf1 + buf_slb;
	sbuf3 = sbuf2 + buf_slb;
	sbuf4 = sbuf3 + buf_slb;
	sbuf5 = sbuf4 + buf_slb;
	dbuf = sbuf5 + buf_slb;

	LOAD_KERNEL_INTO_FLOAT();

	testchan = 1;

/* testchan walks cmask from bit 0 upward while chan counts down */
	for (chan = nchannel - 1; chan >= 0; chan--) {
		if ((cmask & testchan) == 0) {
			testchan <<= 1;
			continue;
		}

		testchan <<= 1;
		sa = adr_src + chan;
		sa2 = sa + slb;
		sa3 = sa2 + slb;
		sa4 = sa3 + slb;
		sa5 = sa4 + slb;
		sa_6 = sa6 = sa5 + slb;
/* first output pixel: row 2, column 2 of this channel */
		d_a = adr_dst + (dlb << 1) + (nchannel << 1) + chan;

/*
 * load interm. src buff: source row 4 goes to sbuf1 and rows 0..3 to
 * sbuf2..sbuf5, so that the rotation at the top of the row loop leaves
 * rows 0..4 in sbuf1..sbuf5
 */
		for (i = 0, j = 0; j < (dw + 4); i += nchannel, j++) {
			sbuf1[j] = sa5[i];
			sbuf2[j] = sa[i];
			sbuf3[j] = sa2[i];
			sbuf4[j] = sa3[i];
			sbuf5[j] = sa4[i];
		}

		for (j = 0; j < dh - 1; j++) {
			ddst1 = ddst = (mlib_d64 *)(dbuf);
/* d_buf trails the packed output by 8 bytes (one loop iteration) */
			d_buf = (dbuf - 8);
			da = d_a;
			dend = da + (dw - 1) * nchannel;
/* rotate the five row buffers by one position */
			prow = sbuf1;
			sbuf1 = sbuf2;
			sbuf2 = sbuf3;
			sbuf3 = sbuf4;
			sbuf4 = sbuf5;
			sbuf5 = prow;
			s1 = (mlib_d64 *)sbuf1;
			s2 = (mlib_d64 *)sbuf2;
			s3 = (mlib_d64 *)sbuf3;
			s4 = (mlib_d64 *)sbuf4;
			s5 = (mlib_d64 *)sbuf5;
/*
 * the buffer read as kernel row 1 (sbuf1) is refilled behind the read
 * position with the next source row taken from sa_6
 */
			s_buf1 = sbuf1;
			d1 = *s1;
			d2 = *s2;
			d3 = *s3;
/* 0 on the first pass of the second loop, see below */
			nchannel1 = 0;

/* pass 1: kernel rows 1-3 into dbuf, interleaved with loading the next row */
#pragma pipeloop(0)
			for (i = 0; i < dw; i += 8) {
				d_1 = *(s1 + 1);
				d_2 = *(s2 + 1);
				d_3 = *(s3 + 1);
				out0 = out1 = rnd;
				t1 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				CONV_AU(d1, k1k2);
				t2 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				CONV_AL(d2, k5k6);
				t3 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				CONV_AU(d3, k11k12);
				t4 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				d21 = vis_faligndata(d1, d_1);
				dt_1 = vis_faligndata(d_1, d1);
				t5 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				d22 = vis_faligndata(d2, d_2);
				dt_2 = vis_faligndata(d_2, d2);
				t6 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				d23 = vis_faligndata(d3, d_3);
				t7 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				dt_3 = vis_faligndata(d_3, d3);
				t8 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				CONV_AL(d21, k1k2);
				(*s_buf1++) = t1;
				CONV_AU(d22, k7k8);
				(*s_buf1++) = t2;
				CONV_AL(d23, k11k12);
				(*s_buf1++) = t3;
				SHIFT_U8_1;
				CONV_AU(d21, k3k4);
				(*s_buf1++) = t4;
				CONV_AL(d22, k7k8);
				CONV_AU(d23, k13k14);
				d21 = vis_faligndata(d21, dt_1);
				d22 = vis_faligndata(d22, dt_2);
				(*s_buf1++) = t5;
				d23 = vis_faligndata(d23, dt_3);
				CONV_AL(d21, k3k4);
				(*s_buf1++) = t6;
				CONV_AU(d22, k9k10);
				(*s_buf1++) = t7;
				CONV_AL(d23, k13k14);
				d21 =
				    vis_freg_pair(vis_read_lo(d1),
				    vis_read_hi(d_1));
				CONV_AU(d21, k5k6);
				d22 =
				    vis_freg_pair(vis_read_lo(d2),
				    vis_read_hi(d_2));
				CONV_AL(d22, k9k10);
				d23 =
				    vis_freg_pair(vis_read_lo(d3),
				    vis_read_hi(d_3));
				CONV_AU(d23, k15k16);
				(*s_buf1++) = t8;
				ddst[0] = out0;
				ddst[1] = out1;
				ddst += 2;
				d1 = d_1;
				d2 = d_2;
				d3 = d_3;
				s1++;
				s2++;
				s3++;
			}

			ddst = (mlib_d64 *)(dbuf);
			d4 = *s4;
			d5 = *s5;
/*
 * in each iteration store result from prev. iterat.
 * and load data for processing next row
 *
 * On the first iteration nothing has been packed yet: nchannel1 is 0, so
 * the eight stores all land on the first output pixel without advancing
 * da; that pixel is rewritten with real data on the next iteration (or
 * in the tail below).
 */
#pragma pipeloop(0)
			for (i = 0; i < dw; i += 8) {
				d_4 = *(s4 + 1);
				d_5 = *(s5 + 1);
				out0 = ddst[0];
				out1 = ddst[1];
				ddst += 2;
				tt1 = (*d_buf++);
				CONV_AL(d4, k15k16);
				tt2 = (*d_buf++);
				CONV_AU(d5, k21k22);
				d24 = vis_faligndata(d4, d_4);
				tt3 = (*d_buf++);
				dt_4 = vis_faligndata(d_4, d4);
				d25 = vis_faligndata(d5, d_5);
				tt4 = (*d_buf++);
				dt_5 = vis_faligndata(d_5, d5);
				tt5 = (*d_buf++);
				CONV_AU(d24, k17k18);
				tt6 = (*d_buf++);
				CONV_AL(d25, k21k22);
				tt7 = (*d_buf++);
				SHIFT_U8_2;
				tt8 = (*d_buf++);
				CONV_AL(d24, k17k18);
				*da = tt1;
				da += nchannel1;
				CONV_AU(d25, k23k24);
				*da = tt2;
				da += nchannel1;
				d24 = vis_faligndata(d24, dt_4);
				*da = tt3;
				da += nchannel1;
				d25 = vis_faligndata(d25, dt_5);
				*da = tt4;
				da += nchannel1;
				CONV_AU(d24, k19k20);
				*da = tt5;
				da += nchannel1;
				CONV_AL(d25, k23k24);
				*da = tt6;
				da += nchannel1;
				d24 =
				    vis_freg_pair(vis_read_lo(d4),
				    vis_read_hi(d_4));
				CONV_AL(d24, k19k20);
				*da = tt7;
				da += nchannel1;
				d25 =
				    vis_freg_pair(vis_read_lo(d5),
				    vis_read_hi(d_5));
				CONV_AU(d25, k25);
				*da = tt8;
				da += nchannel1;
				(*ddst1++) = vis_fpack16_pair(out0, out1);
				d4 = d_4;
				d5 = d_5;
				s4++;
				s5++;
				nchannel1 = nchannel;
			}

/* load eight more pixels of the next source row (non-faulting loads) */
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);

/* flush the last packed group (up to 8 pixels), never writing past dend */
			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
			}

/* advance to the next source and destination rows */
			sa_6 = sa6 = sa6 + slb;
			d_a += dlb;
		}

/* process last row - no need to load data */
		ddst1 = ddst = (mlib_d64 *)(dbuf);
		d_buf = (dbuf - 8);
		da = d_a;
		dend = da + (dw - 1) * nchannel;
		prow = sbuf1;
		sbuf1 = sbuf2;
		sbuf2 = sbuf3;
		sbuf3 = sbuf4;
		sbuf4 = sbuf5;
		sbuf5 = prow;
		s1 = (mlib_d64 *)sbuf1;
		s2 = (mlib_d64 *)sbuf2;
		s3 = (mlib_d64 *)sbuf3;
		s4 = (mlib_d64 *)sbuf4;
		s5 = (mlib_d64 *)sbuf5;
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;
		nchannel1 = 0;

/* pass 1 for the last row: kernel rows 1-3 into dbuf */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k5k6);
			CONV_AU(d3, k11k12);
			d21 = vis_faligndata(d1, d_1);
			dt_1 = vis_faligndata(d_1, d1);
			d22 = vis_faligndata(d2, d_2);
			dt_2 = vis_faligndata(d_2, d2);
			d23 = vis_faligndata(d3, d_3);
			dt_3 = vis_faligndata(d_3, d3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k7k8);
			CONV_AL(d23, k11k12);
			SHIFT_U8_1;
			CONV_AU(d21, k3k4);
			CONV_AL(d22, k7k8);
			CONV_AU(d23, k13k14);
			d21 = vis_faligndata(d21, dt_1);
			d22 = vis_faligndata(d22, dt_2);
			d23 = vis_faligndata(d23, dt_3);
			CONV_AL(d21, k3k4);
			CONV_AU(d22, k9k10);
			CONV_AL(d23, k13k14);
			d21 = vis_freg_pair(vis_read_lo(d1), vis_read_hi(d_1));
			CONV_AU(d21, k5k6);
			d22 = vis_freg_pair(vis_read_lo(d2), vis_read_hi(d_2));
			CONV_AL(d22, k9k10);
			d23 = vis_freg_pair(vis_read_lo(d3), vis_read_hi(d_3));
			CONV_AU(d23, k15k16);
			ddst[0] = out0;
			ddst[1] = out1;
			ddst += 2;
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

		ddst = (mlib_d64 *)(dbuf);
		d4 = *s4;
		d5 = *s5;
/*
 * in each iteration store result from prev. iterat.
 * and load data for processing next row
 */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_4 = *(s4 + 1);
			d_5 = *(s5 + 1);
			out0 = ddst[0];
			out1 = ddst[1];
			ddst += 2;
			tt1 = (*d_buf++);
			CONV_AL(d4, k15k16);
			tt2 = (*d_buf++);
			CONV_AU(d5, k21k22);
			d24 = vis_faligndata(d4, d_4);
			tt3 = (*d_buf++);
			dt_4 = vis_faligndata(d_4, d4);
			d25 = vis_faligndata(d5, d_5);
			tt4 = (*d_buf++);
			dt_5 = vis_faligndata(d_5, d5);
			tt5 = (*d_buf++);
			CONV_AU(d24, k17k18);
			tt6 = (*d_buf++);
			CONV_AL(d25, k21k22);
			tt7 = (*d_buf++);
			SHIFT_U8_2;
			tt8 = (*d_buf++);
			CONV_AL(d24, k17k18);
			*da = tt1;
			da += nchannel1;
			CONV_AU(d25, k23k24);
			*da = tt2;
			da += nchannel1;
			d24 = vis_faligndata(d24, dt_4);
			*da = tt3;
			da += nchannel1;
			d25 = vis_faligndata(d25, dt_5);
			*da = tt4;
			da += nchannel1;
			CONV_AU(d24, k19k20);
			*da = tt5;
			da += nchannel1;
			CONV_AL(d25, k23k24);
			*da = tt6;
			da += nchannel1;
			d24 = vis_freg_pair(vis_read_lo(d4), vis_read_hi(d_4));
			CONV_AL(d24, k19k20);
			*da = tt7;
			da += nchannel1;
			d25 = vis_freg_pair(vis_read_lo(d5), vis_read_hi(d_5));
			CONV_AU(d25, k25);
			*da = tt8;
			da += nchannel1;
			(*ddst1++) = vis_fpack16_pair(out0, out1);
			d4 = d_4;
			d5 = d_5;
			s4++;
			s5++;
			nchannel1 = nchannel;
		}

/* flush the last packed group (up to 8 pixels), never writing past dend */
		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
		}
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * 5x5 convolution of an 8-bit image with four interleaved channels.
 * Edge pixels are not written: output starts at row 2, column 2 of dst
 * (byte offset 2 * dlb + 8) and covers (width - 4) x (height - 4) pixels.
 *
 *   dst          - destination image
 *   src          - source image
 *   kernel       - 25 kernel coefficients, loaded by LOAD_KERNEL_INTO_FLOAT()
 *   scalef_expon - scaling exponent of the kernel; the GSR scale field is
 *                  set to 31 - scalef_expon
 *
 * Image geometry (adr_src, adr_dst, slb, dlb, dw, dh) and the intermediate
 * buffers (buff_src, sbuf1..sbuf5, dbuf) are set up by the
 * GET_SRC_DST_PARAMETERS() and PREPARE_INTERM_BUFFERS() macros, which are
 * defined elsewhere - presumably the latter allocates buff_src and returns
 * MLIB_FAILURE on allocation failure (TODO confirm).
 *
 * Returns MLIB_SUCCESS after all rows have been processed.
 */
mlib_status
mlib_v_conv5x5_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2, *sa3, *sa4;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3, *sbuf4, *sbuf5;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf, *dbuf1;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3, *s4, *s5;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data: current 8-byte word of each of the five rows */
	mlib_d64 d1, d2, d3, d4, d5;

/* data: next 8-byte word of each row */
	mlib_d64 d11, d12, d13, d14, d15;

/* data: 8-byte word after next of each row */
	mlib_d64 d21, d22, d23, d24, d25;

/* data: words realigned with vis_faligndata() */
	mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;
	mlib_f32 k1k2, k3k4, k5k6, k7k8;
	mlib_f32 k9k10, k11k12, k13k14, k15k16;
	mlib_f32 k17k18, k19k20, k21k22, k23k24, k25;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1;
	mlib_s32 emask;
	mlib_s32 rval, gsr_scale, i, j;

/* GSR scale field from the kernel exponent; rnd is the rounding term */
	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

/* row length of the intermediate buffers, in 8-byte units */
	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

/* edge - no write; dw becomes the number of output bytes per row */
	dw -= 4;
	dw *= 4;
	dh -= 4;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	sa3 = sa2 + slb;
	sa4 = sa3 + slb;
/* first output pixel: row 2, column 2 (2 pixels * 4 channels = 8 bytes) */
	d_a = adr_dst + 2 * dlb + 8;

/* load interm. src buff: source row 0 */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff: source row 1 */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff: source row 2 */
	PREPARE_TO_LOAD_LINE(sbuf4, sa2);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff: source row 3 */
	PREPARE_TO_LOAD_LINE(sbuf5, sa3);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
/*
 * LOOP_INI() is defined elsewhere; presumably it rotates the row buffers
 * and sets s1..s5 and ddst for this row (TODO confirm)
 */
		LOOP_INI();

/* bring in the fifth source row needed for this output row */
		PREPARE_TO_LOAD_LINE(sbuf5, sa4);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER_NF(16);

/* align offset 4: vis_faligndata() below shifts by one 4-channel pixel */
		vis_alignaddr(s1, 4);
		dbuf1 = dbuf;
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;
		d11 = *(s1 + 1);
		d12 = *(s2 + 1);
		d13 = *(s3 + 1);

/* pass 1: kernel rows 1-3 accumulated into dbuf */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d21 = *(s1 + 2);
			d22 = *(s2 + 2);
			d23 = *(s3 + 2);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k5k6);
			CONV_AU(d3, k11k12);
			dt_1 = vis_faligndata(d1, d11);
			dt_2 = vis_faligndata(d2, d12);
			dt_3 = vis_faligndata(d3, d13);
			CONV_AL(dt_1, k1k2);
			CONV_AU(dt_2, k7k8);
			CONV_AL(dt_3, k11k12);
			CONV_AU(d11, k3k4);
			CONV_AL(d12, k7k8);
			CONV_AU(d13, k13k14);
			dt_1 = vis_faligndata(d11, d21);
			dt_2 = vis_faligndata(d12, d22);
			dt_3 = vis_faligndata(d13, d23);
			CONV_AL(dt_1, k3k4);
			CONV_AU(dt_2, k9k10);
			CONV_AL(dt_3, k13k14);
			CONV_AU(d21, k5k6);
			CONV_AL(d22, k9k10);
			CONV_AU(d23, k15k16);
			dbuf1[0] = out0;
			dbuf1[1] = out1;
			dbuf1 += 2;
			d1 = d11;
			d2 = d12;
			d3 = d13;
			d11 = d21;
			d12 = d22;
			d13 = d23;
			s1++;
			s2++;
			s3++;
		}

		dbuf1 = dbuf;
		d4 = *s4;
		d5 = *s5;
		d14 = *(s4 + 1);
		d15 = *(s5 + 1);

/* pass 2: add kernel rows 4-5 to the partial sums and pack to 8 bits */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d24 = *(s4 + 2);
			d25 = *(s5 + 2);
			out0 = dbuf1[0];
			out1 = dbuf1[1];
			CONV_AL(d4, k15k16);
			CONV_AU(d5, k21k22);
			dt_4 = vis_faligndata(d4, d14);
			dt_5 = vis_faligndata(d5, d15);
			CONV_AU(dt_4, k17k18);
			CONV_AL(dt_5, k21k22);
			CONV_AL(d14, k17k18);
			CONV_AU(d15, k23k24);
			dt_4 = vis_faligndata(d14, d24);
			dt_5 = vis_faligndata(d15, d25);
			CONV_AU(dt_4, k19k20);
			CONV_AL(dt_5, k23k24);
			CONV_AL(d24, k19k20);
			CONV_AU(d25, k25);
			dbuf1 += 2;
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d4 = d14;
			d5 = d15;
			d14 = d24;
			d15 = d25;
			s4++;
			s5++;
		}

/* copy the packed row to the destination (macros defined elsewhere) */
		PREPARE_TO_COPY_INTERM_BUF_TO_DST();

#pragma pipeloop(0)
		COPY_INTERM_BUF_TO_DST();
		COPY_TAIL();

/* advance to the next source and destination rows */
		sa4 = sa4 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
					DIV_ALPHA_3CH();
				}

				if (i < ww) {
					GET_ALPHA_3CH_2_NF();
					DIV_ALPHA_3CH_NF();
				}
			}

		} else {	/* if (channel == 2) */

#pragma pipeloop(0)
			for (i = 0; i < ww; i++) {
				ss = *sp;
				a0 = vis_freg_pair(*(mlib_f32 *)(p_tbl + ap[0]),
				    *(mlib_f32 *)(p_tbl +
				    vis_ld_u8_nf(ap + 2)));
				a1 = vis_freg_pair(*(mlib_f32 *)(p_tbl +
					vis_ld_u8_nf(ap + 4)),
				    *(mlib_f32 *)(p_tbl +
				    vis_ld_u8_nf(ap + 6)));
				DIV_ALPHA(d0, vis_read_hi(ss), a0);
				DIV_ALPHA(d1, vis_read_lo(ss), a1);
				*dp = vis_fpack16_pair(d0, d1);
				ap += 8;
				sp++;
				dp++;
			}
		}

		if (dflag) {