Example #1
0
/*
 * Adds an 8x8 block of 16-bit values to an 8x8 block of 8-bit pixels,
 * in place.
 *
 *   curr_block  8-bit block, read and overwritten. Each row is accessed
 *               as a single mlib_d64, so every row start is assumed to
 *               be 8-byte aligned (not checked here).
 *   mc_block    64 contiguous 16-bit values, consumed two mlib_d64 per
 *               row; assumed 8-byte aligned (not checked here).
 *   stride      distance in bytes between rows of curr_block.
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoAddBlock_U8_S16(
	mlib_u8 *curr_block,
	const mlib_s16 *mc_block,
	mlib_s32 stride)
{
	mlib_d64 *pix_row, *delta;
	mlib_d64 pixels, wide_hi, wide_lo, sum_hi, sum_lo;
	mlib_s32 row;
	mlib_f32 fzero = vis_fzeros();

	/* lower 16 bits = 0x100: used with fmul8x16al to widen bytes */
	mlib_f32 fscale = vis_to_float(0x100);

	/* GSR scale factor 7 is what the fpack16 step below relies on */
	vis_write_gsr(7 << 3);

	pix_row = (mlib_d64 *)curr_block;
	delta = (mlib_d64 *)mc_block;

#pragma pipeloop(0)
	for (row = 0; row < 8; row++) {

		pixels = *pix_row;
		sum_hi = delta[0];
		sum_lo = delta[1];
		delta += 2;

		/* widen the eight pixels of this row to 16-bit lanes */
		wide_hi = vis_fpmerge(fzero, vis_read_hi(pixels));
		wide_lo = vis_fmul8x16al(vis_read_lo(pixels), fscale);

		sum_hi = vis_fpadd16(sum_hi, wide_hi);
		sum_lo = vis_fpadd16(sum_lo, wide_lo);

		/* pack the sums back to eight bytes and store the row */
		*pix_row = vis_fpack16_pair(sum_hi, sum_lo);
		pix_row = (mlib_d64 *)((mlib_u8 *)pix_row + stride);
	}

	return (MLIB_SUCCESS);
}
/*
 * Horizontal 2:1 down-sampling of a row of bytes: each pair of adjacent
 * source bytes is summed together with a rounding term of 1 and packed
 * back to one byte (GSR scale factor 6 is set for the pack step).
 *
 *   dst  output row; n / 2 bytes are written. Accessed through
 *        mlib_d64 stores, so 8-byte alignment is assumed (not checked).
 *   src  input row of n bytes; accessed through mlib_d64 loads, so
 *        8-byte alignment is assumed (not checked).
 *   n    number of source bytes.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in64 = (mlib_d64 *)src;
	mlib_d64 *out64 = (mlib_d64 *)dst;
	mlib_d64 chunk, split;
	mlib_d64 even16, odd16, first4, second4, packed;
	mlib_d64 rounding = vis_to_double_dup(0x1);
	mlib_f32 unit = vis_to_float(0x1000000);
	mlib_s32 pos, tail_mask;

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(6 << 3);

	/* byte selector: bytes 0,2,4,6 go to the upper half, 1,3,5,7 lower */
	vis_write_bmask(0x02461357, 0);

	/* full groups: 16 source bytes in, 8 destination bytes out */
#pragma pipeloop(0)
	for (pos = 0; pos <= n - 16; pos += 16) {
		chunk = in64[0];
		split = vis_bshuffle(chunk, chunk);
		even16 = vis_fmul8x16au(vis_read_hi(split), unit);
		odd16 = vis_fmul8x16au(vis_read_lo(split), unit);
		even16 = vis_fpadd16(even16, odd16);
		first4 = vis_fpadd16(even16, rounding);

		chunk = in64[1];
		split = vis_bshuffle(chunk, chunk);
		even16 = vis_fmul8x16au(vis_read_hi(split), unit);
		odd16 = vis_fmul8x16au(vis_read_lo(split), unit);
		even16 = vis_fpadd16(even16, odd16);
		second4 = vis_fpadd16(even16, rounding);

		in64 += 2;
		*out64 = vis_fpack16_pair(first4, second4);
		out64++;
	}

	if (pos >= n)
		return (MLIB_SUCCESS);

	/*
	 * Remainder: one more normal load, then a non-faulting load for the
	 * second half, and a masked store limited to dst + n / 2 - 1.
	 */
	chunk = *in64;
	in64++;
	split = vis_bshuffle(chunk, chunk);
	even16 = vis_fmul8x16au(vis_read_hi(split), unit);
	odd16 = vis_fmul8x16au(vis_read_lo(split), unit);
	even16 = vis_fpadd16(even16, odd16);
	first4 = vis_fpadd16(even16, rounding);

	chunk = vis_ld_d64_nf(in64);
	split = vis_bshuffle(chunk, chunk);
	even16 = vis_fmul8x16au(vis_read_hi(split), unit);
	odd16 = vis_fmul8x16au(vis_read_lo(split), unit);
	even16 = vis_fpadd16(even16, odd16);
	second4 = vis_fpadd16(even16, rounding);

	tail_mask = vis_edge8(out64, (dst + (n / 2) - 1));
	packed = vis_fpack16_pair(first4, second4);
	vis_pst_8(packed, out64, tail_mask);

	return (MLIB_SUCCESS);
}
/*
 * JFIF YCbCr -> RGB conversion for 4:4:4 data (one cb and one cr sample
 * per y sample), producing 3 output bytes per pixel.
 *
 *   rgb   destination, 3 * size bytes; written through mlib_f32 stores
 *         in the main loop, so 4-byte alignment is assumed (not checked)
 *   y     luma samples, size bytes; read as mlib_f32 (4-byte alignment
 *         assumed, not checked)
 *   cb    chroma samples, same layout and assumptions as y
 *   cr    chroma samples, same layout and assumptions as y
 *   size  number of pixels
 *
 * The input is processed in chunks of at most 2 * BUFF_SIZE pixels.
 * Within a chunk, all groups of four pixels except the last one are
 * stored directly; the last 1..4 pixels go through a small aligned
 * scratch buffer and masked stores so nothing past the chunk's final
 * byte is written.
 *
 * Returns MLIB_FAILURE when size <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
    mlib_u8 *last_byte;
    mlib_f32 *py, *pcb, *pcr, *pout;
    mlib_f32 zero32 = vis_fzeros();
    mlib_s32 grp, chunk, full_groups, edge;
    mlib_d64 spill[2];

    /* per-lane multipliers applied to cb (cbk*) and cr (crk*) */
    mlib_d64 cbk0 = vis_to_double_dup(0x0000f4fd);
    mlib_d64 crk0 = vis_to_double_dup(0x2cdde926);
    mlib_d64 cbk1 = vis_to_double_dup(0xf4fd38b4);
    mlib_d64 crk1 = vis_to_double_dup(0xe9260000);
    mlib_d64 cbk2 = vis_to_double_dup(0x38b40000);
    mlib_d64 crk2 = vis_to_double_dup(0x00002cdd);

    /* per-lane additive constants */
    mlib_d64 bias0 = vis_to_double_dup(0xe9a110ff);
    mlib_d64 bias1 = vis_to_double_dup(0x10ffe3b6);
    mlib_d64 bias2 = vis_to_double_dup(0xe3b6e9a1);

    /* multiplier applied to y in every lane */
    mlib_d64 yk = vis_to_double_dup(0x20002000);

    if (size <= 0)
        return (MLIB_FAILURE);

    vis_write_gsr((2 << 3) + 2);
    vis_write_bmask(0x0489AB37, 0);

    do {
        /* size of this chunk, capped at 2 * BUFF_SIZE pixels */
        chunk = (size > 2 * BUFF_SIZE) ? 2 * BUFF_SIZE : size;

        /* groups of four stored directly; the last group is the tail */
        full_groups = (chunk - 1) >> 2;
        py = (mlib_f32 *)y;
        pcb = (mlib_f32 *)cb;
        pcr = (mlib_f32 *)cr;
        last_byte = rgb + 3 * chunk - 1;
        pout = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
        for (grp = 0; grp < full_groups; grp++) {
            mlib_d64 lum, cbp, crp, acc0, acc1, acc2;
            mlib_d64 pk01, pk2, mix;
            mlib_f32 vy, vcb, vcr;

            vy = py[grp];
            vcb = pcb[grp];
            vcr = pcr[grp];

            lum = vis_fmul8x16(vy, yk);

            /* first accumulator */
            cbp = vis_fmul8x16(vcb, cbk0);
            crp = vis_fmul8x16(vcr, crk0);
            acc0 = vis_fpadd16(lum, cbp);
            crp = vis_fpadd16(crp, bias0);
            acc0 = vis_fpadd16(acc0, crp);

            /* second accumulator */
            cbp = vis_fmul8x16(vcb, cbk1);
            crp = vis_fmul8x16(vcr, crk1);
            acc1 = vis_fpadd16(lum, cbp);
            crp = vis_fpadd16(crp, bias1);
            acc1 = vis_fpadd16(acc1, crp);

            /* third accumulator */
            cbp = vis_fmul8x16(vcb, cbk2);
            crp = vis_fmul8x16(vcr, crk2);
            acc2 = vis_fpadd16(lum, cbp);
            crp = vis_fpadd16(crp, bias2);
            acc2 = vis_fpadd16(acc2, crp);

            /* pack to bytes and arrange into three 4-byte words */
            pk01 = vis_fpack16_pair(acc0, acc1);
            pk2 = vis_freg_pair(vis_fpack16(acc2), zero32);

            mix = vis_bshuffle(pk01, pk2);
            pk01 = vis_fpack32(pk01, pk01);
            pk01 = vis_fpmerge(vis_read_hi(pk01), vis_read_lo(pk01));

            pout[0] = vis_read_hi(mix);
            pout[1] = vis_read_hi(pk01);
            pout[2] = vis_read_lo(mix);

            pout += 3;
        }

        /*
         * Tail of the chunk: same arithmetic, but the result is staged
         * in spill[] and written with edge-masked partial stores.
         */
        if ((mlib_u8 *)pout <= last_byte) {
            mlib_d64 lum, cbp, crp, acc0, acc1, acc2;
            mlib_d64 pk01, pk2, mix;
            mlib_f32 vy, vcb, vcr;
            mlib_f32 *spill32 = (mlib_f32 *)spill;

            vy = py[full_groups];
            vcb = pcb[full_groups];
            vcr = pcr[full_groups];

            lum = vis_fmul8x16(vy, yk);

            cbp = vis_fmul8x16(vcb, cbk0);
            crp = vis_fmul8x16(vcr, crk0);
            acc0 = vis_fpadd16(lum, cbp);
            crp = vis_fpadd16(crp, bias0);
            acc0 = vis_fpadd16(acc0, crp);

            cbp = vis_fmul8x16(vcb, cbk1);
            crp = vis_fmul8x16(vcr, crk1);
            acc1 = vis_fpadd16(lum, cbp);
            crp = vis_fpadd16(crp, bias1);
            acc1 = vis_fpadd16(acc1, crp);

            cbp = vis_fmul8x16(vcb, cbk2);
            crp = vis_fmul8x16(vcr, crk2);
            acc2 = vis_fpadd16(lum, cbp);
            crp = vis_fpadd16(crp, bias2);
            acc2 = vis_fpadd16(acc2, crp);

            pk01 = vis_fpack16_pair(acc0, acc1);
            pk2 = vis_freg_pair(vis_fpack16(acc2), zero32);

            mix = vis_bshuffle(pk01, pk2);
            pk01 = vis_fpack32(pk01, pk01);
            pk01 = vis_fpmerge(vis_read_hi(pk01), vis_read_lo(pk01));

            /* mask is taken before pout is moved back to an 8-byte edge */
            edge = vis_edge8(pout, last_byte);

            if ((mlib_addr)pout & 7) {
                /* shift the staged data by one word to match */
                pout--;
                spill32++;
            }

            spill32[0] = vis_read_hi(mix);
            spill32[1] = vis_read_hi(pk01);
            spill32[2] = vis_read_lo(mix);

            vis_pst_8(spill[0], pout, edge);

            pout += 2;
            edge = vis_edge8(pout, last_byte);

            if ((mlib_u8 *)pout <= last_byte)
                vis_pst_8(spill[1], pout, edge);
        }

        /* advance every stream past the chunk just converted */
        y += chunk;
        cb += chunk;
        cr += chunk;
        rgb += 3 * chunk;
        size -= chunk;

    } while (size);

    return (MLIB_SUCCESS);
}
/*
 * YUV 4:1:1 -> 24-bit RGB conversion for inputs with no alignment
 * guarantee.
 *
 *   rgb         destination image, 3 bytes per pixel
 *   y, u, v     source planes; 16 y samples are consumed for every
 *               4 u and 4 v samples
 *   width       pixels per row
 *   height      number of rows
 *   rgb_stride  bytes between destination rows
 *   y_stride    bytes between rows of y
 *   uv_stride   bytes between rows of u and of v
 *
 * Each row is converted into an 8-byte aligned intermediate buffer and
 * then copied to the destination with __mlib_VectorCopy_U8. The
 * intermediate buffer is the on-stack BUFF array unless width * 3
 * exceeds 16 * 1024, in which case it is allocated with __mlib_malloc.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE if that allocation fails.
 *
 * NOTE(review): STORE_PIXEL is defined outside this block and is
 * presumed to use the locals dp, dr, dg and db by name; do not rename
 * them without checking the macro.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB411_nonalign(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 rgb_stride,
    mlib_s32 y_stride,
    mlib_s32 uv_stride)
{
    /* pointers to src address */
    mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;

    /* pointers to dst address */
    mlib_u8 *dp, *dl;

    /* aligned pointer to y */
    mlib_d64 *spy;

    /* aligned pointers to u, v */
    mlib_d64 *dfu, *dfv;

    /* u, v data */
    mlib_f32 fu, fv;

    /* y data */
    mlib_d64 dy0, dy1, dy2, dy3;
    mlib_d64 ddy1, ddy2, ddy3, ddy4;
    mlib_d64 du0, du1, fu0, fu1;
    mlib_d64 dv1, dv2, fv0, fv1;
    mlib_d64 dr, dr1, dr2, dr3, dr4;
    mlib_d64 dg, dg1, dg2, dg3, dg4;
    mlib_d64 db, db1, db2, db3, db4;
    mlib_d64 dtmp;

    /* 1.1644  * 4096 */
    mlib_f32 f0 = vis_to_float(0x12a1);

    /* 2.0184  * 8192 */
    mlib_f32 f1 = vis_to_float(0x4097);

    /* -0.3920 * 8192 */
    mlib_f32 f4 = vis_to_float(0xf375);

    /* -0.8132 * 8192 */
    mlib_f32 f5 = vis_to_float(0xe5fa);

    /* 1.5966  * 8192 */
    mlib_f32 f8 = vis_to_float(0x3317);

    /* -276.9856 * 32 */
    mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

    /* 135.6352  * 32 */
    mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

    /* -222.9952 * 32 */
    mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
    mlib_f32 fscale = vis_to_float(0x80808080);

    /* loop variable */
    mlib_s32 i, j;
    mlib_d64 *buf, BUFF[16 * 1024];
    mlib_d64 *ddp, dd01, dd11, dd21, dd02, dd12, dd22;
    mlib_u8 *tmp;

    if (width * 3 > 16 * 1024) {
        /* +7 leaves room to round the start up to an 8-byte boundary */
        tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

        /* allocation failure must not be dereferenced below */
        if (!tmp)
            return (MLIB_FAILURE);

        buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
    } else {
        buf = (mlib_d64 *)BUFF;
    }

    /*
     * initialize GSR scale factor
     */
    vis_write_gsr(3 << 3);

    sp1 = sl1 = (mlib_u8 *)y;
    sp2 = sl2 = (mlib_u8 *)u;
    sp3 = sl3 = (mlib_u8 *)v;

    dp = (mlib_u8 *)buf;
    dl = rgb;
    ddp = (mlib_d64 *)dp;

    /*
     * row loop
     */
    for (j = 0; j < height; j++) {
        spy = (mlib_d64 *)vis_alignaddr(sp1, 0);

        /* first 4 u samples of the row */
        dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
        fu0 = (*dfu++);
        fu1 = vis_ld_d64_nf(dfu);
        dfu++;
        fu = vis_read_hi(vis_faligndata(fu0, fu1));
        sp2 += 4;

        /* first 4 v samples of the row */
        dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
        fv0 = (*dfv++);
        fv1 = vis_ld_d64_nf(dfv);
        dfv++;
        fv = vis_read_hi(vis_faligndata(fv0, fv1));
        sp3 += 4;

        /* first 16 y samples of the row */
        dy0 = (*spy++);
        dy3 = vis_ld_d64_nf(spy);
        spy++;
        vis_alignaddr(sp1, 0);
        dy1 = vis_faligndata(dy0, dy3);
        dy0 = vis_ld_d64_nf(spy);
        spy++;
        dy2 = vis_faligndata(dy3, dy0);

        /* chroma contributions plus offsets */
        du0 = vis_fmul8x16al(fu, f1);
        db = vis_fpadd16(du0, doff0);

        du1 = vis_fmul8x16al(fu, f4);
        dv1 = vis_fmul8x16al(fv, f5);
        dtmp = vis_fpadd16(du1, dv1);
        dg = vis_fpadd16(dtmp, doff1);

        dv2 = vis_fmul8x16al(fv, f8);
        dr = vis_fpadd16(dv2, doff2);

        /* scaled luma for the 16 pixels */
        ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
        ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

        ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
        ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

        /* each chroma lane is combined with four luma lanes */
        db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
        db1 = vis_fpadd16(ddy1, db1);

        db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
        db2 = vis_fpadd16(ddy2, db2);

        db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
        db3 = vis_fpadd16(ddy3, db3);

        db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
        db4 = vis_fpadd16(ddy4, db4);

        dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
        dg1 = vis_fpadd16(ddy1, dg1);

        dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
        dg2 = vis_fpadd16(ddy2, dg2);

        dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
        dg3 = vis_fpadd16(ddy3, dg3);

        dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
        dg4 = vis_fpadd16(ddy4, dg4);

        dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
        dr1 = vis_fpadd16(ddy1, dr1);

        dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
        dr2 = vis_fpadd16(ddy2, dr2);

        dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
        dr3 = vis_fpadd16(ddy3, dr3);

        dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
        dr4 = vis_fpadd16(ddy4, dr4);

        /* pack to bytes: (db, db1), (dr, dr1), (dg, dg1) hold 16 pixels */
        db = vis_fpack16_pair(db1, db2);
        db1 = vis_fpack16_pair(db3, db4);

        dr = vis_fpack16_pair(dr1, dr2);
        dr1 = vis_fpack16_pair(dr3, dr4);

        dg = vis_fpack16_pair(dg1, dg2);
        dg1 = vis_fpack16_pair(dg3, dg4);

        /* pre-load chroma for the next group with non-faulting loads */
        dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
        fu0 = vis_ld_d64_nf(dfu);
        dfu++;
        fu1 = vis_ld_d64_nf(dfu);
        dfu++;
        fu = vis_read_hi(vis_faligndata(fu0, fu1));
        sp2 += 4;

        dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
        fv0 = vis_ld_d64_nf(dfv);
        dfv++;
        fv1 = vis_ld_d64_nf(dfv);
        dfv++;
        fv = vis_read_hi(vis_faligndata(fv0, fv1));
        sp3 += 4;

        /*
         * 16-pixel column loop: store the group computed previously
         * (48 bytes), then compute the next group.
         */
#pragma pipeloop(0)
        for (i = 0; i <= width - 16; i += 16) {

            vis_write_bmask(0x0801902A, 0);
            dd01 = vis_bshuffle(dr, dg);
            dd02 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0x03B04C05, 0);
            dd11 = vis_bshuffle(dr, dg);
            dd12 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0xD06E07F0, 0);
            dd21 = vis_bshuffle(dr, dg);
            dd22 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0x01834967, 0);
            ddp[0] = vis_bshuffle(dd01, db);
            ddp[3] = vis_bshuffle(dd02, db1);
            vis_write_bmask(0xA12B45C7, 0);
            ddp[1] = vis_bshuffle(dd11, db);
            ddp[4] = vis_bshuffle(dd12, db1);
            vis_write_bmask(0x0D23E56F, 0);
            ddp[2] = vis_bshuffle(dd21, db);
            ddp[5] = vis_bshuffle(dd22, db1);

            dy3 = vis_ld_d64_nf(spy);
            spy++;
            vis_alignaddr(sp1, 0);
            dy1 = vis_faligndata(dy0, dy3);
            dy0 = vis_ld_d64_nf(spy);
            spy++;
            dy2 = vis_faligndata(dy3, dy0);

            du0 = vis_fmul8x16al(fu, f1);
            db = vis_fpadd16(du0, doff0);

            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);

            dv2 = vis_fmul8x16al(fv, f8);
            dr = vis_fpadd16(dv2, doff2);

            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

            ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
            ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);

            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);

            db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
            db3 = vis_fpadd16(ddy3, db3);

            db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
            db4 = vis_fpadd16(ddy4, db4);

            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);

            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);

            dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
            dg3 = vis_fpadd16(ddy3, dg3);

            dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
            dg4 = vis_fpadd16(ddy4, dg4);

            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);

            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);

            dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
            dr3 = vis_fpadd16(ddy3, dr3);

            dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
            dr4 = vis_fpadd16(ddy4, dr4);

            db = vis_fpack16_pair(db1, db2);
            db1 = vis_fpack16_pair(db3, db4);

            dr = vis_fpack16_pair(dr1, dr2);
            dr1 = vis_fpack16_pair(dr3, dr4);

            dg = vis_fpack16_pair(dg1, dg2);
            dg1 = vis_fpack16_pair(dg3, dg4);

            dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
            fu0 = vis_ld_d64_nf(dfu);
            dfu++;
            fu1 = vis_ld_d64_nf(dfu);
            dfu++;
            fu = vis_read_hi(vis_faligndata(fu0, fu1));
            sp2 += 4;

            dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
            fv0 = vis_ld_d64_nf(dfv);
            dfv++;
            fv1 = vis_ld_d64_nf(dfv);
            dfv++;
            fv = vis_read_hi(vis_faligndata(fv0, fv1));
            sp3 += 4;

            ddp += 6;
        }

        /* one more group of 8 pixels (24 bytes) if it fits */
        if (i <= width - 8) {
            vis_write_bmask(0x0801902A, 0);
            dd01 = vis_bshuffle(dr, dg);
            vis_write_bmask(0x03B04C05, 0);
            dd11 = vis_bshuffle(dr, dg);
            vis_write_bmask(0xD06E07F0, 0);
            dd21 = vis_bshuffle(dr, dg);
            vis_write_bmask(0x01834967, 0);
            ddp[0] = vis_bshuffle(dd01, db);
            vis_write_bmask(0xA12B45C7, 0);
            ddp[1] = vis_bshuffle(dd11, db);
            vis_write_bmask(0x0D23E56F, 0);
            ddp[2] = vis_bshuffle(dd21, db);
            db = db1;
            dr = dr1;
            dg = dg1;
            ddp += 3;
            i += 8;
        }

        dp = (mlib_u8 *)ddp;

        /*
         * Remaining (fewer than 8) pixels: rotate the packed channel
         * registers by (width - i) bytes, then emit pixels one at a
         * time from the last one backwards via STORE_PIXEL.
         */
        vis_alignaddr((void *)(width - i), 0);
        db = vis_faligndata(db, db);
        dg = vis_faligndata(dg, dg);
        dr = vis_faligndata(dr, dr);
        dp += ((width - i - 1) * 3);

        vis_alignaddr((void *)7, 0);
        for (; i < width; i++) {
            STORE_PIXEL(0, 1, 2);
            dp -= 3;
        }

        /* next source row, then flush the converted row to rgb */
        sp1 = sl1 = sl1 + y_stride;
        sp2 = sl2 = sl2 + uv_stride;
        sp3 = sl3 = sl3 + uv_stride;
        __mlib_VectorCopy_U8(dl, (mlib_u8 *)buf, width * 3);

        /* next destination row; restart at the head of the buffer */
        dl += rgb_stride;
        dp = (mlib_u8 *)buf;
        ddp = (mlib_d64 *)dp;
    }

    /* tmp is only valid when the heap buffer was chosen above */
    if (width * 3 > 16 * 1024)
        __mlib_free(tmp);

    return (MLIB_SUCCESS);
}
/*
 * 3x3 convolution, 8-bit data, 4 channels (per the function name; the
 * code below multiplies the width by 4 and starts the destination 4
 * bytes into the second row, i.e. the one-pixel border is not written).
 *
 *   dst           destination image
 *   src           source image
 *   kernel        convolution coefficients
 *   scalef_expon  scaling exponent; the GSR scale factor is set to
 *                 31 - scalef_expon
 *   cmask         channel mask; only its low 4 bits are used, replicated
 *                 into a 16-bit pattern for the partial stores
 *
 * Returns MLIB_SUCCESS at the end of the visible code path.
 *
 * NOTE(review): GET_SRC_DST_PARAMETERS, LOAD_KERNEL_INTO_FLOAT,
 * PREPARE_INTERM_BUFFERS, PREPARE_TO_LOAD_LINE, LOAD_LINE_INTO_BUFFER,
 * LOOP_INI, CONV_AU and CONV_AL are macros defined outside this block.
 * Several locals (dh, dw, slb, dlb, adr_src, adr_dst, the k* kernel
 * floats, buff_src, sbuf1..3, dbuf, da, dend, s1..s3, ddst) are never
 * assigned in the visible code and are presumably set by those macros;
 * the macros may also contain early returns. Confirm against their
 * definitions before changing any local name or statement order.
 */
mlib_status
mlib_v_conv3x3_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data */
	mlib_d64 d1, d2, d_1, d_2, d21, d22;

/* data */
	mlib_d64 d3, d_3, d23;
	mlib_f32 k1k2, k3k4, k5k6, k7k8, k9k9;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1, sd00;
	mlib_s32 emask, cmask1;
	mlib_s32 rval, gsr_scale, i, j;

/* GSR scale factor and the matching rounding term from mlib_round_8 */
	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

/* replicate the low 4 mask bits four times (16-bit pattern) */
	cmask = ((cmask & 0xf) << 4) + (cmask & 0xf);
	cmask = (cmask << 8) + (cmask);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

/* intermediate row size in mlib_d64 units: 4 bytes per pixel + 24 spare */
	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

/* drop two columns and two rows, then convert the width to bytes */
	dw -= 2;
	dw *= 4;
	dh -= 2;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
/* first output position: second row, 4 bytes in */
	d_a = adr_dst + dlb + 4;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

/* one iteration per output row */
#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

/* bring the next source row (sa2) into the intermediate buffer */
		PREPARE_TO_LOAD_LINE(sbuf3, sa2);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(8);

/* alignment offset 4 is used by the vis_faligndata calls below */
		vis_alignaddr(s1, 4);
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;

/* 8 output bytes per iteration, accumulated in out0/out1 from rnd */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k3k4);
			CONV_AU(d3, k7k8);
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			d23 = vis_faligndata(d3, d_3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k5k6);
			CONV_AL(d23, k7k8);
			CONV_AU(d_1, k3k4);
			CONV_AL(d_2, k5k6);
			CONV_AU(d_3, k9k9);
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

/* copy the finished row from dbuf to the destination row at da */
		ddst = dbuf;
/* prepare the destination addresses */
		dp = (mlib_d64 *)((mlib_addr)da & (~7));
/* i is zero or negative: offset from da back to the aligned address */
		i = (mlib_addr)dp - (mlib_addr)da;
		cmask1 = cmask >> (-i);
		ddst = vis_alignaddr(ddst, i);
/* generate edge mask for the start point */
		emask = vis_edge8(da, dend);
		sd1 = ddst[0];

/* partial first store when da is not at the start of its 8-byte word */
		if (emask != 0xff) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd0 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd0, dp++, emask & cmask1);
			ddst++;
			i += 8;
		}
/* whole 8-byte words, masked by channel only */
#pragma pipeloop(0)
		for (; i <= (dw - 8); i += 8) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd00 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd00, dp++, cmask1);
			ddst++;
		}

/* partial last store, limited by dend */
		if (i < dw) {
			sd0 = vis_faligndata(sd1, ddst[1]);
			emask = vis_edge8(dp, dend);
			vis_pst_8(sd0, dp, emask & cmask1);
		}

		sa2 = sa2 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * Converts one run of YUV 4:1:1 data plus a per-pixel alpha array into
 * 32-bit pixels.
 *
 *   abgr     destination; stored as mlib_d64 when 8-byte aligned,
 *            otherwise as pairs of mlib_f32
 *   y        luma, read as mlib_d64; 16 bytes per iteration
 *   u, v     chroma, read as mlib_f32; 4 bytes each per iteration
 *   a_array  alpha bytes, 16 per iteration; may be unaligned (read via
 *            vis_alignaddr / vis_faligndata)
 *   count    number of full iterations; each writes 16 output pixels
 *            (64 bytes)
 *   left     number of 32-bit pixels to store from one extra, partial
 *            iteration (0 for none)
 *   isrgb    non-zero selects the alternate coefficient set, in which
 *            the u and v multipliers and the first/third offsets are
 *            exchanged
 *
 * Output bytes of a pixel are, in memory order, the alpha byte, then the
 * bytes derived from db, dg and dr (see the vis_fpmerge sequence).
 *
 * NOTE(review): no GSR scale factor is written here although
 * vis_fpack16_pair depends on it; presumably the caller sets it.
 * NOTE(review): with isrgb set the "u" input is multiplied by the
 * coefficient commented as 1.5966; presumably the caller swaps u and v
 * in that mode. Confirm against the callers.
 *
 * The loops are software-pipelined: inputs for iteration i + 1 are
 * fetched with non-faulting loads in the middle of iteration i, so the
 * data used at the top of each iteration was loaded earlier.
 */
static void
mlib_v_VideoYUV2ABGR_aarray_411(
	mlib_u32 *abgr,
	const mlib_d64 *y,
	const mlib_f32 *u,
	const mlib_f32 *v,
	const mlib_d64 *a_array,
	mlib_s32 count,
	mlib_s32 left,
	mlib_s32 isrgb)
{
/* pointer to dst, viewed as mlib_d64 */
	mlib_d64 *dpp = (mlib_d64 *)abgr;

/* u, v data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy1, dy2;
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
	mlib_d64 *dpa, da0, da1, da2, da3, da4;
	mlib_d64 dtmp;

/* 1.1644  * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);

/* 2.0184  * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);

/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);

/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);

/* 1.5966  * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);

/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

/* 135.6352  * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
	mlib_f32 fscale = vis_to_float(0x80808080);

/* loop variables */
	mlib_s32 i;

/* alternate set: f1/f8, f4/f5 and doff0/doff2 trade places; f0 is same */
	if (isrgb) {
		f0 = vis_to_float(0x12a1);
		f1 = vis_to_float(0x3317);
		f4 = vis_to_float(0xe5fa);
		f5 = vis_to_float(0xf375);
		f8 = vis_to_float(0x4097);
		doff0 = vis_to_double_dup(0xe420e420);
		doff1 = vis_to_double_dup(0x10f410f4);
		doff2 = vis_to_double_dup(0xdd60dd60);
	}

/* aligned base of the alpha array; sets the offset for vis_faligndata */
	dpa = vis_alignaddr((void *)a_array, 0);

/* prime the pipeline with the inputs of the first iteration */
	dy1 = (*y++);
	dy2 = vis_ld_d64_nf((mlib_d64 *)y); y++;
	fu = (*u++);
	fv = (*v++);
	da2 = (*dpa++);
	da3 = vis_ld_d64_nf(dpa); dpa++;
	da4 = vis_ld_d64_nf(dpa); dpa++;

	du0 = vis_fmul8x16al(fu, f1);
	du1 = vis_fmul8x16al(fu, f4);
	dv1 = vis_fmul8x16al(fv, f5);
	dv2 = vis_fmul8x16al(fv, f8);

/* destination 8-byte aligned: full mlib_d64 stores */
	if (!((mlib_addr)abgr & 7)) {
#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

			db = vis_fpadd16(du0, doff0);

			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dr = vis_fpadd16(dv2, doff2);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

/* fetch inputs of the next iteration (y, u, v, dpa already advanced) */
			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

			dpp[8 * i] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 1] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dpp[8 * i + 2] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 3] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dpp[8 * i + 4] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 5] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dpp[8 * i + 6] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 7] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* chroma products for the next iteration */
			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	} else {
/* destination not 8-byte aligned: same math, stored as mlib_f32 halves */
		mlib_d64 dd;

#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

			db = vis_fpadd16(du0, doff0);

			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dr = vis_fpadd16(dv2, doff2);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

/* fetch inputs of the next iteration */
			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 1] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 2] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 3] = vis_read_lo(dd);

			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 4] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 5] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 6] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 7] = vis_read_lo(dd);

			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 8] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 9] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 10] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 11] = vis_read_lo(dd);

			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 12] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 13] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 14] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 15] = vis_read_lo(dd);

/* chroma products for the next iteration */
			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	}

/*
 * Partial last group: compute a full 16-pixel group into a local
 * buffer from the inputs already loaded, then copy only `left`
 * 32-bit pixels to the destination.
 */
	if (left) {
		mlib_d64 res_buf[8];

		da0 = vis_faligndata(da2, da3);
		da1 = vis_faligndata(da3, da4);

		ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
		ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

		ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
		ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

		db = vis_fpadd16(du0, doff0);

		dtmp = vis_fpadd16(du1, dv1);
		dg = vis_fpadd16(dtmp, doff1);

		dr = vis_fpadd16(dv2, doff2);

		db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
		db1 = vis_fpadd16(ddy1, db1);

		db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
		db2 = vis_fpadd16(ddy2, db2);

		db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
		db3 = vis_fpadd16(ddy3, db3);

		db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
		db4 = vis_fpadd16(ddy4, db4);

		dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
		dg1 = vis_fpadd16(ddy1, dg1);

		dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
		dg2 = vis_fpadd16(ddy2, dg2);

		dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
		dg3 = vis_fpadd16(ddy3, dg3);

		dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
		dg4 = vis_fpadd16(ddy4, dg4);

		dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
		dr1 = vis_fpadd16(ddy1, dr1);

		dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
		dr2 = vis_fpadd16(ddy2, dr2);

		dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
		dr3 = vis_fpadd16(ddy3, dr3);

		dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
		dr4 = vis_fpadd16(ddy4, dr4);

		dr = vis_fpack16_pair(dr1, dr2);
		dr1 = vis_fpack16_pair(dr3, dr4);

		dg = vis_fpack16_pair(dg1, dg2);
		dg1 = vis_fpack16_pair(dg3, dg4);

		db = vis_fpack16_pair(db1, db2);
		db1 = vis_fpack16_pair(db3, db4);

		dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
		dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

		res_buf[0] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[1] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
		dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

		res_buf[2] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[3] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
		dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

		res_buf[4] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[5] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
		dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

		res_buf[6] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[7] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* output continues right after the `count` full groups */
		for (i = 0; i < left; i++)
			((mlib_f32 *)dpp)[16 * count + i] =
				((mlib_f32 *)res_buf)[i];
	}
}
Example #7
0
/*
 * Converts a vector of signed 16-bit values to unsigned 8-bit values,
 * clamping each element to the range 0 .. MLIB_U8_MAX.
 *
 *   z  destination vector, n elements
 *   x  source vector, n elements
 *   n  number of elements
 *
 * Vectors shorter than 16 elements are handed to the PACK_S_U_DF macro.
 * Longer ones are processed in three stages: scalar conversion until the
 * destination is 8-byte aligned, 8 elements per step through
 * vis_fpack16_pair (GSR scale factor 7), and scalar conversion of the
 * remaining (fewer than 8) elements.
 *
 * Returns MLIB_SUCCESS at the end of the visible code path.
 *
 * NOTE(review): PACK_S_U_DF is defined outside this block; it is
 * presumed to return from the function and may reference locals by
 * name, which is why the declarations below are left as they were.
 */
mlib_status
__mlib_VectorConvert_U8_S16_Sat(
	mlib_u8 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	mlib_s16 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6, d7;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_s16 c;

	if (n < 16) {
		PACK_S_U_DF(mlib_s16, mlib_u8, MLIB_U8_MAX, 0);
	}

	/* Stage 1: scalar steps until dst sits on an 8-byte boundary. */
	while ((mlib_addr)dst & 7) {
		c = *src;
		src++;

		if (c < 0)
			*dst = 0;
		else if (c > MLIB_U8_MAX)
			*dst = MLIB_U8_MAX;
		else
			*dst = c;

		dst++;
		length--;
	}

	/* Split what is left into 8-element groups and a short tail. */
	len_64 = length >> 3;
	rest_64 = length & 7;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	vis_write_gsr(7 << 3);

	/* Stage 2: vector part; path depends on source alignment. */
	if (((mlib_addr)src & 7) != 0) {

		/*
		 * Source is not 8-byte aligned: read aligned words and
		 * realign each neighbouring pair with vis_faligndata().
		 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = *dsrc;
		dsrc++;

		/* Odd group count: handle one group ahead of the loop. */
		i = len_64 & 1;

		if (i) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d4 = vis_faligndata(d1, d2);
			d3 = vis_fpack16_pair(d3, d4);
			*ddst = d3;
			ddst++;
		}

		/* Two groups (16 elements) per iteration. */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = dsrc[0];
			d3 = vis_faligndata(d2, d1);
			d2 = dsrc[1];
			d4 = vis_faligndata(d1, d2);
			d1 = dsrc[2];
			d5 = vis_faligndata(d2, d1);
			d2 = dsrc[3];
			d6 = vis_faligndata(d1, d2);
			dsrc += 4;
			d3 = vis_fpack16_pair(d3, d4);
			d5 = vis_fpack16_pair(d5, d6);
			ddst[0] = d3;
			ddst[1] = d5;
			ddst += 2;
		}
	} else {

		/* Source is 8-byte aligned as well: direct loads. */
		dsrc = (mlib_d64 *)src;

		/* Odd group count: handle one group ahead of the loop. */
		i = len_64 & 1;

		if (i) {
			d4 = dsrc[0];
			d5 = dsrc[1];
			dsrc += 2;
			d3 = vis_fpack16_pair(d4, d5);
			*ddst = d3;
			ddst++;
		}

		/* Two groups (16 elements) per iteration. */
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = dsrc[0];
			d2 = dsrc[1];
			d5 = dsrc[2];
			d6 = dsrc[3];
			dsrc += 4;
			d3 = vis_fpack16_pair(d1, d2);
			d7 = vis_fpack16_pair(d5, d6);
			ddst[0] = d3;
			ddst[1] = d7;
			ddst += 2;
		}
	}

	/* Stage 3: scalar conversion of the last rest_64 elements. */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];

		if (c < 0)
			dst[even_length + i] = 0;
		else if (c > MLIB_U8_MAX)
			dst[even_length + i] = MLIB_U8_MAX;
		else
			dst[even_length + i] = c;
	}

	return (MLIB_SUCCESS);
}
		} else {	/* if (channel == 2) */

#pragma pipeloop(0)
			for (i = 0; i < ww; i++) {
				ss = *sp;
				a0 = vis_freg_pair(*(mlib_f32 *)(p_tbl + ap[0]),
				    *(mlib_f32 *)(p_tbl +
				    vis_ld_u8_nf(ap + 2)));
				a1 = vis_freg_pair(*(mlib_f32 *)(p_tbl +
					vis_ld_u8_nf(ap + 4)),
				    *(mlib_f32 *)(p_tbl +
				    vis_ld_u8_nf(ap + 6)));
				DIV_ALPHA(d0, vis_read_hi(ss), a0);
				DIV_ALPHA(d1, vis_read_lo(ss), a1);
				*dp = vis_fpack16_pair(d0, d1);
				ap += 8;
				sp++;
				dp++;
			}
		}

		if (dflag) {
			MEM_COPY(buffd, dl, width * sizeof (mlib_u8));
		}

		sl += sstride;
		dl += dstride;
	}

	__mlib_free(buffs);
Example #9
0
/*
 * Convert n unsigned 8-bit elements of x into signed 8-bit elements of z
 * with saturation: the scalar paths store min(x[k], MLIB_S8_MAX).
 *
 * The vector paths widen eight source bytes to 16-bit lanes, add 0x0080 to
 * every lane, pack back to bytes with vis_fpack16_pair() (GSR written with
 * 7 << 3) and XOR every byte with 0x80.
 * NOTE(review): this presumably relies on vis_fpack16 clamping to
 * [0, 255] so that the result equals the scalar formula - confirm against
 * the VIS manual.
 *
 *   z  destination vector (mlib_s8)
 *   x  source vector (mlib_u8)
 *   n  number of elements
 *
 * Returns MLIB_SUCCESS from the vector path. Vectors shorter than 16
 * elements are handed to the PACK_U_S macro, which is defined outside this
 * block.
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *src = (void *)x;
	mlib_s8 *dst = z;

/*
 * vis_fzeros() yields an mlib_f32 and fzero is only ever used as the
 * mlib_f32 operand of vis_fpmerge(), so keep it as mlib_f32 instead of
 * round-tripping the value through mlib_d64.
 */
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_u8 c;

/* 0x0080 in each 16-bit lane: bias added before packing */
	mlib_d64 dsp = vis_to_double_dup(0x800080);

/* 0x80 in each byte: XOR-ed onto the packed result */
	mlib_d64 rst = vis_to_double_dup(0x80808080);

/* multiplier used with vis_fmul8x16al() to widen bytes to 16-bit lanes */
	mlib_f32 fm = vis_to_float(0x100);

/*
 * Short input: use the scalar macro.
 * NOTE(review): the macro body is not visible here; presumably it returns.
 */

	if (length < 16) {
		PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
	}

/*
 * First, try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	vis_write_gsr(7 << 3);

/*
 * i is 1 when one 8-byte output must be peeled off so that the main
 * loops can always handle two outputs per iteration.
 */

	i = len_64 & 1;

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */

		if (i != 0) {
			d1 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2: two 8-byte outputs per iteration.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d4 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			d2 = vis_fpmerge(fzero, vis_read_hi(d4));
			d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
			(*ddst++) = vis_fxor(d4, rst);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions. d2 always holds the most recently loaded
 * aligned word; the look-ahead loads use the non-faulting form.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		if (i != 0) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d1 = vis_faligndata(d1, d2);
			d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
			d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpadd16(dsp, d4);
			d1 = vis_fpack16_pair(d3, d4);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d6 = vis_faligndata(d1, d2);
			d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d3 = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d6 = vis_fpack16_pair(d4, d5);
			(*ddst++) = vis_fxor(d3, rst);
			(*ddst++) = vis_fxor(d6, rst);
		}
	}

/*
 * Scalar epilogue for the rest_64 elements after the vectorised part.
 */

	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c =
			src[even_length + i]) > MLIB_S8_MAX ? MLIB_S8_MAX : c;

	return (MLIB_SUCCESS);
}
Example #10
0
/*
 * M x N convolution of the selected channels of a U8 image, interior only
 * (no edge handling): for every processed channel (wid - m + 1) pixels on
 * each of (hgt - n + 1) rows are produced and stored into dst starting at
 * pixel (dm, dn).
 *
 *   dst    destination image (2, 3 or 4 channels are dispatched on below)
 *   src    source image; width, height and stride are taken from it
 *   m, n   kernel width and height
 *   dm, dn offset of the first written destination pixel
 *   kern   m * n kernel words, row-major; each 32-bit word is used
 *          unchanged as the mlib_f32 operand of vis_fmul8x16au()
 *   scale  selects the GSR scale (31 - scale) and the rounding constant
 *          mlib_round_8[31 - scale]
 *   cmask  channel mask; channel number `chan` is processed when bit
 *          (1 << chan) is set
 *
 * Returns MLIB_FAILURE when a work buffer cannot be allocated, otherwise
 * MLIB_SUCCESS.
 *
 * Method: each selected channel is extracted row by row into a ring of
 * n + 1 line buffers. Products are accumulated as 16-bit lanes in buffd
 * (pre-loaded with the rounding constant). Kernel rows are consumed in
 * groups of 1..4 rows (jk_size); for each kernel column the source words
 * are shifted with vis_faligndata() by (column & 7) bytes. The last kernel
 * column of the first row group is deferred to a final pass that also
 * packs the sums to bytes and re-arms buffd for the next row.
 */
mlib_status mlib_convMxN_8nw_mask(mlib_image       *dst,
                                  const mlib_image *src,
                                  mlib_s32         m,
                                  mlib_s32         n,
                                  mlib_s32         dm,
                                  mlib_s32         dn,
                                  const mlib_s32   *kern,
                                  mlib_s32         scale,
                                  mlib_s32         cmask)
{
  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
  mlib_d64 dd, d0, d1;
  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
  mlib_u8 *sl, *sp, *dl;
  mlib_s32 hgt = mlib_ImageGetHeight(src);
  mlib_s32 wid = mlib_ImageGetWidth(src);
  mlib_s32 sll = mlib_ImageGetStride(src);
  mlib_s32 dll = mlib_ImageGetStride(dst);
  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
  mlib_s32 ssize, xsize, dsize, esize, buff_ind;
  mlib_d64 *pbuff, *dp;
  mlib_f32 *karr = (mlib_f32 *) kern;
  mlib_s32 gsr_scale = (31 - scale) << 3;
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
  mlib_s32 i, j, l, chan, testchan;
  mlib_s32 nchan = mlib_ImageGetChannels(dst);
  void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
  void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);

  /* the on-stack pointer table only covers kernels up to MAX_N rows */
  if (n > MAX_N) {
    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));

    if (buffs == NULL)
      return MLIB_FAILURE;
  }

  /* third part of the table: the n row pointers of the current window */
  buff = buffs + 2 * (n + 1);

  adr_dst += dn * dll + dm * nchan;

  ssize = wid;
  dsize = (ssize + 7) / 8;
  esize = dsize + 4;

  /*
   * One allocation: n + 1 line buffers (esize words each), then buffd
   * (2 * esize words of 16-bit sums), then buffe (esize words of packed
   * output bytes).
   */
  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));

  if (pbuff == NULL) {
    if (buffs != buffs_local)
      mlib_free(buffs);
    return MLIB_FAILURE;
  }

  /*
   * The ring of line buffers is listed twice so that any window of n + 1
   * consecutive entries starting at buff_ind can be addressed linearly.
   */
  for (i = 0; i < (n + 1); i++)
    buffs[i] = pbuff + i * esize;
  for (i = 0; i < (n + 1); i++)
    buffs[(n + 1) + i] = buffs[i];
  buffd = buffs[n] + esize;
  buffe = buffd + 2 * esize;

  hgt -= (n - 1);
  xsize = ssize - (m - 1);

  vis_write_gsr(gsr_scale + 7);

  /* channel extract / insert routines matching the channel count */
  if (nchan == 2) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1;
  }
  else if (nchan == 3) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1;
  }
  else {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1;
  }

  testchan = 1;
  for (chan = 0; chan < nchan; chan++) {
    buff_ind = 0;
    sl = adr_src;
    dl = adr_dst;

    if ((cmask & testchan) == 0) {
      testchan <<= 1;
      continue;
    }

    /* prime the ring with the first n source rows of this channel */
    for (l = 0; l < n; l++) {
      buffn = buffs[l];
      sp = sl + l * sll;

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);
    }

    /* init buffer */
#pragma pipeloop(0)
    for (i = 0; i < (xsize + 7) / 8; i++) {
      buffd[2 * i] = drnd;
      buffd[2 * i + 1] = drnd;
    }

    for (j = 0; j < hgt; j++) {
      mlib_d64 **buffc = buffs + buff_ind;
      mlib_f32 *pk = karr, k0, k1, k2, k3;
      sp = sl + n * sll;

      for (l = 0; l < n; l++) {
        buff[l] = buffc[l];
      }

      buffn = buffc[n];

      /*
       * Load the row that enters the window for the NEXT output row.
       * On the last output row that would be source row (height), one
       * past the image, and it is never consumed - so do not read it.
       */
      if (j < hgt - 1)
        (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);

      /* this column of the first row group is left for the final pass */
      ik_last = (m - 1);

      for (jk = 0; jk < n; jk += jk_size) {
        /* take 4 kernel rows at a time; 5 remaining rows split as 3 + 2 */
        jk_size = n - jk;

        if (jk_size >= 6)
          jk_size = 4;

        if (jk_size == 5)
          jk_size = 3;

        coff = 0;

        if (jk_size == 1) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];

            /* word offset and byte shift of kernel column coff */
            doff = coff / 8;
            buff0 = buff[jk] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s01 = buff0[i + 1];
              s0 = vis_faligndata(s00, s01);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d1 = vis_fpadd16(d01, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += m;
        }
        else if (jk_size == 2) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
            s11 = buff1[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s10 = s11;
              s01 = buff0[i + 1];
              s11 = buff1[i + 1];
              s0 = vis_faligndata(s00, s01);
              s1 = vis_faligndata(s10, s11);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d0 = vis_fpadd16(d10, d0);
              d1 = vis_fpadd16(d01, d1);
              d1 = vis_fpadd16(d11, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += 2 * m;
        }
        else if (jk_size == 3) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {
              /* no byte shift: use the words as they are */
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {
              /* half-word shift: low half of one word, high half of next */
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {
              /* general shift through vis_faligndata() */
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 3 * m;
        }
        else {                              /* jk_size == 4 */

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];
            k3 = pk[ik + 3 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;
            buff3 = buff[jk + 3] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {

#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];
                s3 = buff3[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
                d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
                d31 = vis_fmul8x16au(vis_read_hi(s31), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);
                s3 = vis_faligndata(s30, s31);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 4 * m;
        }
      }

      /*****************************************
       *****************************************
       **          Final iteration            **
       *****************************************
       *****************************************/

      /*
       * Add the deferred column ik_last of the first row group, pack the
       * sums into buffe and reset buffd to the rounding constant.
       * jk_size is recomputed exactly as for jk == 0 above.
       */
      jk_size = n;

      if (jk_size >= 6)
        jk_size = 4;

      if (jk_size == 5)
        jk_size = 3;

      off = ik_last;
      doff = off / 8;
      off &= 7;

      /*
       * Only kernel rows and row buffers 0 .. jk_size - 1 exist for this
       * group. Reading karr[ik_last + r * m] or buff[r] for r >= jk_size
       * would run past the kernel / past the initialised row pointers when
       * n < 4, so the unused slots simply alias row 0 (they are never
       * referenced by the branch selected below).
       */
      k0 = karr[ik_last];
      k1 = k2 = k3 = k0;
      buff0 = buff[0] + doff;
      buff1 = buff2 = buff3 = buff0;

      if (jk_size >= 2) {
        k1 = karr[ik_last + m];
        buff1 = buff[1] + doff;
      }

      if (jk_size >= 3) {
        k2 = karr[ik_last + 2 * m];
        buff2 = buff[2] + doff;
      }

      if (jk_size >= 4) {
        k3 = karr[ik_last + 3 * m];
        buff3 = buff[3] + doff;
      }

      vis_write_gsr(gsr_scale + off);

      if (jk_size == 1) {
        dp = buffe;

        s01 = buff0[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s01 = buff0[i + 1];
          s0 = vis_faligndata(s00, s01);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d1 = vis_fpadd16(d1, d01);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 2) {
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 3) {

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else {                                /* if (jk_size == 4) */

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
        s31 = buff3[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s30 = s31;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s31 = buff3[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);
          s3 = vis_faligndata(s30, s31);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
          d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
          d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d0 = vis_fpadd16(d0, d30);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);
          d1 = vis_fpadd16(d1, d31);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }

      /* scatter the packed row back into the selected channel of dst */
      (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan);

      sl += sll;
      dl += dll;

      /* advance the ring of line buffers by one row */
      buff_ind++;

      if (buff_ind >= (n + 1))
        buff_ind = 0;
    }

    testchan <<= 1;
  }

  mlib_free(pbuff);

  if (buffs != buffs_local)
    mlib_free(buffs);

  return MLIB_SUCCESS;
}
/*
 * Produce two output rows of 2 * n bytes (dst0, dst1) from three input
 * rows of n bytes (src0, src1, src2).
 *
 * Per the scalar fix-up at the end, the first and last output bytes are
 *   dst0: (4 * (3 * src1[k] + src0[k]) + r) >> 4
 *   dst1: (4 * (3 * src1[k] + src2[k]) + r) >> 4
 * with r = 8 for the first byte and r = 7 for the last byte. The vector
 * code builds the same column sums (src1 weighted with fthree, the outer
 * row with fone) and then blends each column sum with a neighbouring one
 * using the fone1 / fthree1 weights and the rounding constants 8 and 7.
 * NOTE(review): exact lane semantics of the vis_fmul8x16* / vis_fpack16
 * intrinsics are not visible here - confirm against the VIS manual.
 *
 * All five pointers are accessed through mlib_d64 pointers.
 * NOTE(review): presumably they are expected to be 8-byte aligned.
 *
 * Returns MLIB_FAILURE when n <= 0, otherwise MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoUpSample420(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src0,
	const mlib_u8 *src1,
	const mlib_u8 *src2,
	mlib_s32 n)
{
/* last valid byte of dst0; used to build edge masks for partial stores */
	mlib_u8 *dend0 = dst0 + 2 * n - 1;
	mlib_d64 *dp0 = (mlib_d64 *)dst0;
	mlib_d64 *dp1 = (mlib_d64 *)dst1;
	mlib_d64 *sp0 = (mlib_d64 *)src0;
	mlib_d64 *sp1 = (mlib_d64 *)src1;
	mlib_d64 *sp2 = (mlib_d64 *)src2;
	mlib_d64 d00, d01, d10, d11, d20, d21;
	mlib_d64 thiscolsum0_hi, thiscolsum0_lo, lastcolsum0_hi, lastcolsum0_lo;
	mlib_d64 shiftcolsum0_hi, shiftcolsum0_lo;
	mlib_d64 thiscolsum1_hi, thiscolsum1_lo, lastcolsum1_hi, lastcolsum1_lo;
	mlib_d64 shiftcolsum1_hi, shiftcolsum1_lo;
	mlib_d64 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
	mlib_d64 ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7;
	mlib_d64 data0, data1, data2, data3, tmp0, tmp1;
/* row weights (outer row / middle row) for vis_fmul8x16au() */
	mlib_f32 fone = vis_to_float(0x4000000);
	mlib_f32 fthree = vis_to_float(0xC000000);
/* column weights for vis_fmul8x16() */
	mlib_f32 fone1 = vis_to_float(0x40404040);
	mlib_f32 fthree1 = vis_to_float(0xC0C0C0C0);
/* rounding terms 7 and 8 in every 16-bit lane */
	mlib_d64 dseven = vis_to_double_dup(0x70007);
	mlib_d64 deight = vis_to_double_dup(0x80008);
	mlib_s32 i, emask;

	if (n <= 0)
		return (MLIB_FAILURE);

/*
 * NOTE(review): presumably pack scale 3 and an align offset of 2 bytes,
 * i.e. vis_faligndata() below shifts by one 16-bit lane - confirm.
 */
	vis_write_gsr((3 << 3) + 2);

/*
 * Column sums of the first eight source pixels:
 * lastcolsum0 from src0 and src1 (for dst0),
 * lastcolsum1 from src2 and src1 (for dst1).
 */
	d00 = vis_ld_d64_nf(sp0);
	d10 = vis_ld_d64_nf(sp1);
	d20 = vis_ld_d64_nf(sp2);
	sp0++;
	sp1++;
	sp2++;
	lastcolsum0_hi = vis_fmul8x16au(vis_read_hi(d00), fone);
	lastcolsum0_lo = vis_fmul8x16au(vis_read_lo(d00), fone);
	lastcolsum1_hi = vis_fmul8x16au(vis_read_hi(d20), fone);
	lastcolsum1_lo = vis_fmul8x16au(vis_read_lo(d20), fone);
	tmp0 = vis_fmul8x16au(vis_read_hi(d10), fthree);
	tmp1 = vis_fmul8x16au(vis_read_lo(d10), fthree);
	lastcolsum0_hi = vis_fpadd16(lastcolsum0_hi, tmp0);
	lastcolsum0_lo = vis_fpadd16(lastcolsum0_lo, tmp1);
	lastcolsum1_hi = vis_fpadd16(lastcolsum1_hi, tmp0);
	lastcolsum1_lo = vis_fpadd16(lastcolsum1_lo, tmp1);

/*
 * Main loop: 8 source pixels -> 16 output bytes per row per iteration.
 * The column sums of the NEXT eight pixels (thiscolsum*) are computed
 * first because the shifted sums need their first lane.
 */
#pragma pipeloop(0)
	for (i = 0; i < n - 8; i += 8) {
		d01 = *sp0;
		d11 = *sp1;
		d21 = *sp2;
		sp0++;
		sp1++;
		sp2++;

		thiscolsum0_hi = vis_fmul8x16au(vis_read_hi(d01), fone);
		thiscolsum0_lo = vis_fmul8x16au(vis_read_lo(d01), fone);
		thiscolsum1_hi = vis_fmul8x16au(vis_read_hi(d21), fone);
		thiscolsum1_lo = vis_fmul8x16au(vis_read_lo(d21), fone);

		tmp0 = vis_fmul8x16au(vis_read_hi(d11), fthree);
		tmp1 = vis_fmul8x16au(vis_read_lo(d11), fthree);

		thiscolsum0_hi = vis_fpadd16(thiscolsum0_hi, tmp0);
		thiscolsum0_lo = vis_fpadd16(thiscolsum0_lo, tmp1);
		thiscolsum1_hi = vis_fpadd16(thiscolsum1_hi, tmp0);
		thiscolsum1_lo = vis_fpadd16(thiscolsum1_lo, tmp1);

/* current column sums weighted with fone1 (acc0/1/4/5) and fthree1 */
		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

/* column sums shifted towards the following column */
		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, thiscolsum0_hi);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, thiscolsum1_hi);

/* rounding: 8 for the fone1-weighted sums, 7 for the fthree1-weighted */
		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

/* shifted sums get the complementary weight */
		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

/* interleave the two result streams byte by byte */
		dp0[0] = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		dp0[1] = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0));
		dp1[0] = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));
		dp1[1] = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2));

		dp0 += 2;
		dp1 += 2;
		lastcolsum0_hi = thiscolsum0_hi;
		lastcolsum0_lo = thiscolsum0_lo;
		lastcolsum1_hi = thiscolsum1_hi;
		lastcolsum1_lo = thiscolsum1_lo;
	}

/*
 * Last (possibly partial) group of up to eight source pixels. No further
 * source word is read: the low shifted sum is built from lastcolsum*_lo
 * alone, and stores go through edge masks derived from dend0.
 */
	if (i < n) {

		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, lastcolsum0_lo);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, lastcolsum1_lo);

		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

/* first output word (source pixels i .. i + 3) */
		acc0 = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		acc1 = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));

/* the mask computed for dst0 is reused for dst1 */
		emask = vis_edge8(dp0, dend0);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
		i += 4;
		dp0++;
		dp1++;

/* second output word only when more than four source pixels remained */
		if (i < n) {
			acc0 = vis_fpmerge(vis_read_lo(data1),
				vis_read_lo(data0));
			acc1 = vis_fpmerge(vis_read_lo(data3),
				vis_read_lo(data2));

			emask = vis_edge8(dp0, dend0);
			vis_pst_8(acc0, dp0, emask);
			vis_pst_8(acc1, dp1, emask);
		}
	}

/*
 * Second pass, in place over both output rows: every word is replaced by
 * vis_faligndata(previous word, current word) with 7 written to the GSR.
 * NOTE(review): presumably this moves every output byte one position
 * towards higher addresses - confirm against the VIS manual.
 * ac0 / ac2 carry the previous ORIGINAL word, so each word is read before
 * it is overwritten.
 */
	vis_write_gsr(7);

	dp0 = (mlib_d64 *)dst0;
	dp1 = (mlib_d64 *)dst1;

	ac0 = *dp0;
	ac2 = *dp1;

#pragma pipeloop(0)
	for (i = 0; i < 2 * n - 8; i += 8) {
		ac1 = *dp0;
		ac3 = *dp1;
		*dp0 = vis_faligndata(ac0, ac1);
		*dp1 = vis_faligndata(ac2, ac3);
		dp0++;
		dp1++;
		ac0 = ac1;
		ac2 = ac3;
	}

/* last word of the second pass: non-faulting load, edge-masked store */
	if (i < 2 * n) {
		ac1 = vis_ld_d64_nf(dp0);
		ac3 = vis_ld_d64_nf(dp1);
		emask = vis_edge8(dp0, dend0);
		acc0 = vis_faligndata(ac0, ac1);
		acc1 = vis_faligndata(ac2, ac3);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
	}

/*
 * The first and the last byte of each output row have only one source
 * column to draw from; they are computed in scalar code and overwrite
 * whatever the vector passes left there.
 */
	dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4;
	dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4;
	dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4;
	dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4;

	return (MLIB_SUCCESS);
}
/*
 * 5x5 convolution of an 8-bit image, restricted to the channels selected
 * by cmask. A 2-pixel border of dst is never written ("nw").
 *
 *   dst           destination image
 *   src           source image
 *   kernel        convolution coefficients; consumed by
 *                 LOAD_KERNEL_INTO_FLOAT() (macro defined elsewhere)
 *   scalef_expon  scaling exponent; GSR scale factor is 31 - scalef_expon
 *   cmask         channel mask; bit 0 selects the last channel
 *                 (nchannel - 1), bit 1 the one before it, and so on
 *
 * Returns MLIB_FAILURE if the intermediate buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 *
 * Each selected channel is de-interleaved into five row buffers. Every
 * output row is produced in two passes: the first accumulates part of
 * the window into 16-bit partial sums kept in dbuf, the second adds the
 * rest, packs to 8 bits and scatters the bytes back to dst.
 * NOTE(review): CONV_AU/CONV_AL/SHIFT_U8_1/SHIFT_U8_2 are defined
 * elsewhere; presumably they multiply-accumulate into out0/out1 using
 * tmp0/tmp1 and shift d21..d25 by one byte - confirm.
 */
mlib_status
mlib_v_conv5x5_8nw_mask(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *dend, *adr_src;

/* pointers to src rows */
	mlib_u8 *sa, *sa2, *sa3, *sa4, *sa5, *sa6, *sa_6, *prow;

/* pointers to rows in interm. src buf */
	mlib_u8 *buff_src, *sbuf1, *sbuf2, *sbuf3, *sbuf4, *sbuf5, *s_buf1;

/* pointers to row in interm. dst buf */
	mlib_u8 *dbuf, *d_buf;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3, *s4, *s5;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst, *ddst1;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1, tmp0, tmp1, rnd;

/* data */
	mlib_d64 d1, d2, d3, d4, d5, d_1, d_2, d_3, d_4, d_5;

/* temp. data, used in faligndata */
	mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;

/* shifted data */
	mlib_d64 d21, d22, d23, d24, d25;
	mlib_f32 k1k2, k17k18, k19k20, k21k22, k23k24, k25;
	mlib_f32 k3k4, k5k6, k7k8, k9k10, k11k12, k13k14, k15k16;
	mlib_s32 rval, gsr_scale, i, j, nchannel, nchannel1, chan, testchan;

/* temp, used in load-store */
	mlib_s32 t1, t2, t3, t4, t5, t6, t7, t8, tt1, tt2, tt3, tt4, tt5, tt6,
	    tt7, tt8;

	adr_src = mlib_ImageGetData(src);
	adr_dst = mlib_ImageGetData(dst);
	nchannel = mlib_ImageGetChannels(src);
	slb = mlib_ImageGetStride(src);
	dlb = mlib_ImageGetStride(dst);
	dh = mlib_ImageGetHeight(dst);
	dw = mlib_ImageGetWidth(dst);

/* buf_slb - 8-byte aligned */
	buf_slb = (dw + 16) & (~7);
/*
 * alloc. interm. src and dst buffer: seven rows of buf_slb bytes
 * (sbuf1..sbuf5 plus the area starting at dbuf) and 8 bytes of slack
 * for alignment
 */
	buff_src = (mlib_u8 *)__mlib_malloc(7 * buf_slb * sizeof (mlib_u8) + 8);

	if (buff_src == NULL)
		return (MLIB_FAILURE);
/* edge - no write */
	dw -= 4;
	dh -= 4;

/*
 * The 8x16 mult has built-in 8-bit R shift, and fpack16 has 7-bit
 * fixed R shift (preceded by variable-bit L shift controlled by GSR
 * scalefactor field). Thus net R shift = (8+7)-(GSR.scalefactor_field),
 * so GSR.scalefactor_field = 15-(net R shift):
 * NOTE(review): the code uses 31 - scalef_expon, which matches the
 * formula above only if the net R shift is scalef_expon - 16 - confirm
 * against LOAD_KERNEL_INTO_FLOAT().
 */
	gsr_scale = 31 - scalef_expon;
/* scale factor in bits 3 and up, faligndata byte offset = 1 */
	vis_write_gsr((gsr_scale << 3) + 1);
	rval = mlib_round_8[gsr_scale];
/* every partial sum starts from this rounding value */
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));
/* round up to an 8-byte boundary inside the allocation */
	sbuf1 = (mlib_u8 *)((mlib_addr)(buff_src + 8) & (~7));
	sbuf2 = sbuf1 + buf_slb;
	sbuf3 = sbuf2 + buf_slb;
	sbuf4 = sbuf3 + buf_slb;
	sbuf5 = sbuf4 + buf_slb;
	dbuf = sbuf5 + buf_slb;

	LOAD_KERNEL_INTO_FLOAT();

	testchan = 1;

/* walk channels from last to first while testchan walks cmask from bit 0 */
	for (chan = nchannel - 1; chan >= 0; chan--) {
		if ((cmask & testchan) == 0) {
			testchan <<= 1;
			continue;
		}

		testchan <<= 1;
/* sa .. sa6 address this channel in six consecutive source rows */
		sa = adr_src + chan;
		sa2 = sa + slb;
		sa3 = sa2 + slb;
		sa4 = sa3 + slb;
		sa5 = sa4 + slb;
		sa_6 = sa6 = sa5 + slb;
/* first written pixel of this channel: row 2, column 2 of dst */
		d_a = adr_dst + (dlb << 1) + (nchannel << 1) + chan;

/*
 * load interm. src buff: de-interleave source rows 0..4 of this channel.
 * Row 4 goes into sbuf1 so that the buffer rotation at the top of the
 * row loop leaves sbuf1..sbuf5 holding rows 0..4 in order.
 */
		for (i = 0, j = 0; j < (dw + 4); i += nchannel, j++) {
			sbuf1[j] = sa5[i];
			sbuf2[j] = sa[i];
			sbuf3[j] = sa2[i];
			sbuf4[j] = sa3[i];
			sbuf5[j] = sa4[i];
		}

/* all output rows except the last one */
		for (j = 0; j < dh - 1; j++) {
			ddst1 = ddst = (mlib_d64 *)(dbuf);
/*
 * the store loop below runs one step behind the packing, so its first
 * step reads the 8 bytes that precede dbuf (placeholder data)
 */
			d_buf = (dbuf - 8);
			da = d_a;
/* address of the last dst pixel written in this row */
			dend = da + (dw - 1) * nchannel;
/* rotate the five row buffers by one position */
			prow = sbuf1;
			sbuf1 = sbuf2;
			sbuf2 = sbuf3;
			sbuf3 = sbuf4;
			sbuf4 = sbuf5;
			sbuf5 = prow;
			s1 = (mlib_d64 *)sbuf1;
			s2 = (mlib_d64 *)sbuf2;
			s3 = (mlib_d64 *)sbuf3;
			s4 = (mlib_d64 *)sbuf4;
			s5 = (mlib_d64 *)sbuf5;
/*
 * sbuf1 is refilled through s_buf1 with the source row at sa_6 while it
 * is being read through s1; each 8-byte group is overwritten only after
 * it has been loaded into d1/d_1
 */
			s_buf1 = sbuf1;
			d1 = *s1;
			d2 = *s2;
			d3 = *s3;
/* stride 0: the first step of the store loop keeps da in place */
			nchannel1 = 0;

/*
 * pass 1: contributions of the first three window rows (taps k1..k15)
 * go into 16-bit partial sums in dbuf, 8 pixels per iteration; the
 * interleaved byte loads/stores fetch 8 bytes of the next source row
 */
#pragma pipeloop(0)
			for (i = 0; i < dw; i += 8) {
				d_1 = *(s1 + 1);
				d_2 = *(s2 + 1);
				d_3 = *(s3 + 1);
				out0 = out1 = rnd;
				t1 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				CONV_AU(d1, k1k2);
				t2 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				CONV_AL(d2, k5k6);
				t3 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				CONV_AU(d3, k11k12);
				t4 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				d21 = vis_faligndata(d1, d_1);
				dt_1 = vis_faligndata(d_1, d1);
				t5 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				d22 = vis_faligndata(d2, d_2);
				dt_2 = vis_faligndata(d_2, d2);
				t6 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				d23 = vis_faligndata(d3, d_3);
				t7 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				dt_3 = vis_faligndata(d_3, d3);
				t8 = vis_ld_u8_nf(sa_6);
				sa_6 += nchannel;
				CONV_AL(d21, k1k2);
				(*s_buf1++) = t1;
				CONV_AU(d22, k7k8);
				(*s_buf1++) = t2;
				CONV_AL(d23, k11k12);
				(*s_buf1++) = t3;
				SHIFT_U8_1;
				CONV_AU(d21, k3k4);
				(*s_buf1++) = t4;
				CONV_AL(d22, k7k8);
				CONV_AU(d23, k13k14);
				d21 = vis_faligndata(d21, dt_1);
				d22 = vis_faligndata(d22, dt_2);
				(*s_buf1++) = t5;
				d23 = vis_faligndata(d23, dt_3);
				CONV_AL(d21, k3k4);
				(*s_buf1++) = t6;
				CONV_AU(d22, k9k10);
				(*s_buf1++) = t7;
				CONV_AL(d23, k13k14);
/* data shifted by 4 bytes: low half of current, high half of next */
				d21 =
				    vis_freg_pair(vis_read_lo(d1),
				    vis_read_hi(d_1));
				CONV_AU(d21, k5k6);
				d22 =
				    vis_freg_pair(vis_read_lo(d2),
				    vis_read_hi(d_2));
				CONV_AL(d22, k9k10);
				d23 =
				    vis_freg_pair(vis_read_lo(d3),
				    vis_read_hi(d_3));
				CONV_AU(d23, k15k16);
				(*s_buf1++) = t8;
/* two 8-byte partial sums per 8 pixels */
				ddst[0] = out0;
				ddst[1] = out1;
				ddst += 2;
				d1 = d_1;
				d2 = d_2;
				d3 = d_3;
				s1++;
				s2++;
				s3++;
			}

			ddst = (mlib_d64 *)(dbuf);
			d4 = *s4;
			d5 = *s5;
/*
 * pass 2: add the last two window rows (taps k16..k25) to the partial
 * sums and pack them to 8 bits in place in dbuf (ddst1 trails ddst).
 * In each iteration the interleaved byte stores write to dst the
 * 8 results packed by the previous iteration.
 */
#pragma pipeloop(0)
			for (i = 0; i < dw; i += 8) {
				d_4 = *(s4 + 1);
				d_5 = *(s5 + 1);
				out0 = ddst[0];
				out1 = ddst[1];
				ddst += 2;
				tt1 = (*d_buf++);
				CONV_AL(d4, k15k16);
				tt2 = (*d_buf++);
				CONV_AU(d5, k21k22);
				d24 = vis_faligndata(d4, d_4);
				tt3 = (*d_buf++);
				dt_4 = vis_faligndata(d_4, d4);
				d25 = vis_faligndata(d5, d_5);
				tt4 = (*d_buf++);
				dt_5 = vis_faligndata(d_5, d5);
				tt5 = (*d_buf++);
				CONV_AU(d24, k17k18);
				tt6 = (*d_buf++);
				CONV_AL(d25, k21k22);
				tt7 = (*d_buf++);
				SHIFT_U8_2;
				tt8 = (*d_buf++);
				CONV_AL(d24, k17k18);
				*da = tt1;
				da += nchannel1;
				CONV_AU(d25, k23k24);
				*da = tt2;
				da += nchannel1;
				d24 = vis_faligndata(d24, dt_4);
				*da = tt3;
				da += nchannel1;
				d25 = vis_faligndata(d25, dt_5);
				*da = tt4;
				da += nchannel1;
				CONV_AU(d24, k19k20);
				*da = tt5;
				da += nchannel1;
				CONV_AL(d25, k23k24);
				*da = tt6;
				da += nchannel1;
				d24 =
				    vis_freg_pair(vis_read_lo(d4),
				    vis_read_hi(d_4));
				CONV_AL(d24, k19k20);
				*da = tt7;
				da += nchannel1;
				d25 =
				    vis_freg_pair(vis_read_lo(d5),
				    vis_read_hi(d_5));
				CONV_AU(d25, k25);
				*da = tt8;
				da += nchannel1;
				(*ddst1++) = vis_fpack16_pair(out0, out1);
				d4 = d_4;
				d5 = d_5;
				s4++;
				s5++;
/* from the second iteration on, the stores advance through dst */
				nchannel1 = nchannel;
			}

/* fetch 8 more bytes of the next source row into the refilled buffer */
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);
			sa_6 += nchannel;
			(*s_buf1++) = vis_ld_u8_nf(sa_6);

/*
 * flush the results packed by the last iteration: up to 8 bytes, each
 * store guarded so nothing is written past dend
 */
			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = (*d_buf++);
			}

/* advance to the next source row to prefetch and the next dst row */
			sa_6 = sa6 = sa6 + slb;
			d_a += dlb;
		}

/*
 * process last row - no need to load data: same two passes as above
 * without the loads from sa_6 / stores through s_buf1
 */
		ddst1 = ddst = (mlib_d64 *)(dbuf);
		d_buf = (dbuf - 8);
		da = d_a;
		dend = da + (dw - 1) * nchannel;
		prow = sbuf1;
		sbuf1 = sbuf2;
		sbuf2 = sbuf3;
		sbuf3 = sbuf4;
		sbuf4 = sbuf5;
		sbuf5 = prow;
		s1 = (mlib_d64 *)sbuf1;
		s2 = (mlib_d64 *)sbuf2;
		s3 = (mlib_d64 *)sbuf3;
		s4 = (mlib_d64 *)sbuf4;
		s5 = (mlib_d64 *)sbuf5;
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;
		nchannel1 = 0;

/* pass 1 for the last row: partial sums of the first three window rows */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k5k6);
			CONV_AU(d3, k11k12);
			d21 = vis_faligndata(d1, d_1);
			dt_1 = vis_faligndata(d_1, d1);
			d22 = vis_faligndata(d2, d_2);
			dt_2 = vis_faligndata(d_2, d2);
			d23 = vis_faligndata(d3, d_3);
			dt_3 = vis_faligndata(d_3, d3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k7k8);
			CONV_AL(d23, k11k12);
			SHIFT_U8_1;
			CONV_AU(d21, k3k4);
			CONV_AL(d22, k7k8);
			CONV_AU(d23, k13k14);
			d21 = vis_faligndata(d21, dt_1);
			d22 = vis_faligndata(d22, dt_2);
			d23 = vis_faligndata(d23, dt_3);
			CONV_AL(d21, k3k4);
			CONV_AU(d22, k9k10);
			CONV_AL(d23, k13k14);
			d21 = vis_freg_pair(vis_read_lo(d1), vis_read_hi(d_1));
			CONV_AU(d21, k5k6);
			d22 = vis_freg_pair(vis_read_lo(d2), vis_read_hi(d_2));
			CONV_AL(d22, k9k10);
			d23 = vis_freg_pair(vis_read_lo(d3), vis_read_hi(d_3));
			CONV_AU(d23, k15k16);
			ddst[0] = out0;
			ddst[1] = out1;
			ddst += 2;
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

		ddst = (mlib_d64 *)(dbuf);
		d4 = *s4;
		d5 = *s5;
/*
 * pass 2 for the last row: finish the sums, pack in place, and in each
 * iteration store to dst the results packed by the previous iteration
 */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_4 = *(s4 + 1);
			d_5 = *(s5 + 1);
			out0 = ddst[0];
			out1 = ddst[1];
			ddst += 2;
			tt1 = (*d_buf++);
			CONV_AL(d4, k15k16);
			tt2 = (*d_buf++);
			CONV_AU(d5, k21k22);
			d24 = vis_faligndata(d4, d_4);
			tt3 = (*d_buf++);
			dt_4 = vis_faligndata(d_4, d4);
			d25 = vis_faligndata(d5, d_5);
			tt4 = (*d_buf++);
			dt_5 = vis_faligndata(d_5, d5);
			tt5 = (*d_buf++);
			CONV_AU(d24, k17k18);
			tt6 = (*d_buf++);
			CONV_AL(d25, k21k22);
			tt7 = (*d_buf++);
			SHIFT_U8_2;
			tt8 = (*d_buf++);
			CONV_AL(d24, k17k18);
			*da = tt1;
			da += nchannel1;
			CONV_AU(d25, k23k24);
			*da = tt2;
			da += nchannel1;
			d24 = vis_faligndata(d24, dt_4);
			*da = tt3;
			da += nchannel1;
			d25 = vis_faligndata(d25, dt_5);
			*da = tt4;
			da += nchannel1;
			CONV_AU(d24, k19k20);
			*da = tt5;
			da += nchannel1;
			CONV_AL(d25, k23k24);
			*da = tt6;
			da += nchannel1;
			d24 = vis_freg_pair(vis_read_lo(d4), vis_read_hi(d_4));
			CONV_AL(d24, k19k20);
			*da = tt7;
			da += nchannel1;
			d25 = vis_freg_pair(vis_read_lo(d5), vis_read_hi(d_5));
			CONV_AU(d25, k25);
			*da = tt8;
			da += nchannel1;
			(*ddst1++) = vis_fpack16_pair(out0, out1);
			d4 = d_4;
			d5 = d_5;
			s4++;
			s5++;
			nchannel1 = nchannel;
		}

/* flush the final (up to 8) packed results, never writing past dend */
		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = (*d_buf++);
		}
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * 5x5 convolution of an 8-bit, 4-channel image with all channels
 * processed. A 2-pixel border of dst is never written ("nw").
 *
 *   dst           destination image
 *   src           source image
 *   kernel        convolution coefficients; consumed by
 *                 LOAD_KERNEL_INTO_FLOAT() (macro defined elsewhere)
 *   scalef_expon  scaling exponent; GSR scale factor is 31 - scalef_expon
 *
 * Returns MLIB_SUCCESS at the end of the function.
 * NOTE(review): adr_src, adr_dst, slb, dlb, dw, dh are presumably set by
 * GET_SRC_DST_PARAMETERS() and buff_src/sbuf1..sbuf5/dbuf allocated by
 * PREPARE_INTERM_BUFFERS() (which may return on allocation failure) -
 * both macros are defined elsewhere; confirm.
 */
mlib_status
mlib_v_conv5x5_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2, *sa3, *sa4;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3, *sbuf4, *sbuf5;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf, *dbuf1;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3, *s4, *s5;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* current 8 bytes (2 pixels) of each of the five window rows */
	mlib_d64 d1, d2, d3, d4, d5;

/* next 8 bytes of each row */
	mlib_d64 d11, d12, d13, d14, d15;

/* 8 bytes after that of each row */
	mlib_d64 d21, d22, d23, d24, d25;

/* faligndata results combining two neighbouring 8-byte groups */
	mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;
	mlib_f32 k1k2, k3k4, k5k6, k7k8;
	mlib_f32 k9k10, k11k12, k13k14, k15k16;
	mlib_f32 k17k18, k19k20, k21k22, k23k24, k25;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1;
	mlib_s32 emask;
	mlib_s32 rval, gsr_scale, i, j;

/* GSR: scale factor in bits 3 and up, faligndata offset 0 for now */
	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
/* every partial sum starts from this rounding value */
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

/* 4 bytes per pixel plus 24 bytes of padding, divided by 8 */
	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

/* edge - no write; from here on dw counts bytes (4 per pixel) */
	dw -= 4;
	dw *= 4;
	dh -= 4;

/* sa .. sa4 address five consecutive source rows */
	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	sa3 = sa2 + slb;
	sa4 = sa3 + slb;
/* first written pixel: row 2, pixel 2 (2 * 4 bytes) of dst */
	d_a = adr_dst + 2 * dlb + 8;

/* load interm. src buff: source row 0 into sbuf2 */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff: source row 1 into sbuf3 */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff: source row 2 into sbuf4 */
	PREPARE_TO_LOAD_LINE(sbuf4, sa2);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff: source row 3 into sbuf5 */
	PREPARE_TO_LOAD_LINE(sbuf5, sa3);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* one iteration per output row */
#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
/*
 * NOTE(review): LOOP_INI() presumably rotates sbuf1..sbuf5 and sets
 * s1..s5 and ddst for this row - macro defined elsewhere; confirm
 */
		LOOP_INI();

/* bring in the newest source row (sa4) for this window */
		PREPARE_TO_LOAD_LINE(sbuf5, sa4);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER_NF(16);

/*
 * set the faligndata byte offset from s1 + 4; with an 8-byte aligned
 * s1 this is 4 bytes, i.e. one 4-channel pixel
 */
		vis_alignaddr(s1, 4);
		dbuf1 = dbuf;
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;
		d11 = *(s1 + 1);
		d12 = *(s2 + 1);
		d13 = *(s3 + 1);

/*
 * pass 1: contributions of the first three window rows (taps k1..k15)
 * go into 16-bit partial sums in dbuf, 8 bytes of output per iteration
 */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d21 = *(s1 + 2);
			d22 = *(s2 + 2);
			d23 = *(s3 + 2);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k5k6);
			CONV_AU(d3, k11k12);
			dt_1 = vis_faligndata(d1, d11);
			dt_2 = vis_faligndata(d2, d12);
			dt_3 = vis_faligndata(d3, d13);
			CONV_AL(dt_1, k1k2);
			CONV_AU(dt_2, k7k8);
			CONV_AL(dt_3, k11k12);
			CONV_AU(d11, k3k4);
			CONV_AL(d12, k7k8);
			CONV_AU(d13, k13k14);
			dt_1 = vis_faligndata(d11, d21);
			dt_2 = vis_faligndata(d12, d22);
			dt_3 = vis_faligndata(d13, d23);
			CONV_AL(dt_1, k3k4);
			CONV_AU(dt_2, k9k10);
			CONV_AL(dt_3, k13k14);
			CONV_AU(d21, k5k6);
			CONV_AL(d22, k9k10);
			CONV_AU(d23, k15k16);
/* two 8-byte partial sums per 8 output bytes */
			dbuf1[0] = out0;
			dbuf1[1] = out1;
			dbuf1 += 2;
/* slide the three-group window forward by 8 bytes */
			d1 = d11;
			d2 = d12;
			d3 = d13;
			d11 = d21;
			d12 = d22;
			d13 = d23;
			s1++;
			s2++;
			s3++;
		}

		dbuf1 = dbuf;
		d4 = *s4;
		d5 = *s5;
		d14 = *(s4 + 1);
		d15 = *(s5 + 1);

/*
 * pass 2: add the last two window rows (taps k16..k25) to the partial
 * sums and store the packed 8-bit results through ddst
 */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d24 = *(s4 + 2);
			d25 = *(s5 + 2);
			out0 = dbuf1[0];
			out1 = dbuf1[1];
			CONV_AL(d4, k15k16);
			CONV_AU(d5, k21k22);
			dt_4 = vis_faligndata(d4, d14);
			dt_5 = vis_faligndata(d5, d15);
			CONV_AU(dt_4, k17k18);
			CONV_AL(dt_5, k21k22);
			CONV_AL(d14, k17k18);
			CONV_AU(d15, k23k24);
			dt_4 = vis_faligndata(d14, d24);
			dt_5 = vis_faligndata(d15, d25);
			CONV_AU(dt_4, k19k20);
			CONV_AL(dt_5, k23k24);
			CONV_AL(d24, k19k20);
			CONV_AU(d25, k25);
			dbuf1 += 2;
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d4 = d14;
			d5 = d15;
			d14 = d24;
			d15 = d25;
			s4++;
			s5++;
		}

/*
 * copy the packed row from the intermediate buffer to dst
 * (macros defined elsewhere)
 */
		PREPARE_TO_COPY_INTERM_BUF_TO_DST();

#pragma pipeloop(0)
		COPY_INTERM_BUF_TO_DST();
		COPY_TAIL();

/* next source row to load and next dst row to write */
		sa4 = sa4 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * YUV 4:1:1 to ABGR conversion for a destination that need not be 8-byte
 * aligned. One u and one v sample are shared by four y samples; each
 * output pixel is 4 bytes and the alpha byte is left untouched (masked
 * out of every partial store).
 *
 *   abgr         destination buffer
 *   y, u, v      source planes
 *   width        number of pixels per row
 *   height       number of rows
 *   abgr_stride  byte step between destination rows
 *   y_stride     byte step between y rows
 *   uv_stride    byte step between u rows and between v rows
 *
 * Returns MLIB_FAILURE if the row buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 *
 * Each y row is first copied into an 8-byte aligned buffer. Converted
 * pixels are then shifted with faligndata to the destination's byte
 * offset and written with masked partial stores; dd0/dd1 carry the
 * previous 8 bytes from one store to the next.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR411_dst_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to src address */
	mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;

/* pointers to dst address */
	mlib_u8 *dp, *dl, *dend;

/* all. pointer to y */
	mlib_d64 *spy;

/* all. pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy0, dy1, dy2;
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
	mlib_d64 dd, dd0, dd1, dtmp;

/* used to load u, v into mlib_f32 */
	mlib_f32 ffu[1], ffv[1];

/* byte views of ffu / ffv, filled one sample at a time */
	mlib_u8 *ufu, *vfu;

/* 1.1644  * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);

/* 2.0184  * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);

/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);

/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);

/* 1.5966  * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);

/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

/* 135.6352  * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
	mlib_f32 fscale = vis_to_float(0x80808080);

/* loop variables */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_d64 *buf;
	mlib_s32 inc;

	ufu = (mlib_u8 *)ffu;
	vfu = (mlib_u8 *)ffv;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(3 << 3);

/* one row of y samples, realigned to 8 bytes */
	buf = (mlib_d64 *)__mlib_malloc((width / 8 + 1) * sizeof (mlib_d64));

	if (buf == NULL)
		return (MLIB_FAILURE);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
/* copy this y row into buf so it can be read with aligned loads */
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = buf;
		dy0 = vis_ld_d64_nf(spy); spy++;

#pragma pipeloop(0)
		for (i = 0; i < width; i += 8) {
			dy1 = vis_ld_d64_nf(spy); spy++;
			(*dpp++) = vis_faligndata(dy0, dy1);
			dy0 = dy1;
		}

		spy = buf;

/* last destination byte of this row, and the mask for the first store */
		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);

/*
 * dpp = dp rounded down to 8 bytes, i = byte offset of dp inside it.
 * The alpha-skipping mask is shifted by the same offset, and the
 * faligndata offset is set from (8 - i) so converted data is shifted
 * to dp's position.
 */
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
		i = dp - (mlib_u8 *)dpp;
		emask >>= i;
		vis_alignaddr((void *)(8 - i), 0);
/* inc is 0 when the edge mask is full, so the first store is not skipped past */
		inc = (emask1 != 0xff);
		emask1 &= emask;

/* first four u and v samples (enough for 16 pixels) */
		ufu[0] = vis_ld_u8_nf(sp2);
		ufu[1] = vis_ld_u8_nf(sp2 + 1);
		ufu[2] = vis_ld_u8_nf(sp2 + 2);
		ufu[3] = vis_ld_u8_nf(sp2 + 3);
		vfu[0] = vis_ld_u8_nf(sp3);
		vfu[1] = vis_ld_u8_nf(sp3 + 1);
		vfu[2] = vis_ld_u8_nf(sp3 + 2);
		vfu[3] = vis_ld_u8_nf(sp3 + 3);
		sp2 += 4;
		sp3 += 4;

		fu = ffu[0];
		fv = ffv[0];

/*
 * 16-pixel column loop; the u, v samples for the next iteration are
 * loaded byte by byte between the arithmetic
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 16; i += 16) {

			dy1 = (*spy++);
			dy2 = (*spy++);

/* chroma terms plus offsets for blue, green and red */
			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

/* scaled luma for the four groups of 4 pixels */
			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ufu[0] = vis_ld_u8_nf(sp2);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);
			ufu[1] = vis_ld_u8_nf(sp2 + 1);

/* au/al replicate one chroma term across 4 pixels before adding luma */
			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			ufu[2] = vis_ld_u8_nf(sp2 + 2);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);
			ufu[3] = vis_ld_u8_nf(sp2 + 3);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);
			vfu[0] = vis_ld_u8_nf(sp3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);
			vfu[1] = vis_ld_u8_nf(sp3 + 1);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);
			vfu[2] = vis_ld_u8_nf(sp3 + 2);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);
			vfu[3] = vis_ld_u8_nf(sp3 + 3);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

/* pack each colour to 8 bits: 16 red, 16 green, 16 blue bytes */
			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

/*
 * interleave to 4-byte pixels (blue in the alpha slot is masked out by
 * emask), shift to the dst offset and store 8 bytes at a time
 */
			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
/* pick up the u, v samples loaded during this iteration */
			fu = ffu[0];
			fv = ffv[0];
			sp2 += 4;
			sp3 += 4;
			emask1 = emask;
		}

/* 8-pixel step: uses the first two u, v samples held in fu, fv */
		if (i <= width - 8) {

			dy1 = (*spy++);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
/* move the third u sample to slot 0 for the trailing 4-pixel step */
			ufu[0] = ufu[2];

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
/* same for the third v sample */
			vfu[0] = vfu[2];

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr = vis_fpack16_pair(dr1, dr2);
			dg = vis_fpack16_pair(dg1, dg2);
			db = vis_fpack16_pair(db1, db2);

			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			fu = ffu[0];
			fv = ffv[0];

			i += 8;
			emask1 = emask;
		}

/* trailing step: converts 4 pixels from one u, v sample */
		if (i < width) {

			dy1 = vis_ld_d64_nf(spy);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

/* fu is reused here as scratch for the packed blue bytes */
			fu = vis_fpack16(db1);

			dg2 = vis_fpmerge(fu, vis_fpack16(dg1));
			dg3 = vis_fpmerge(fu, vis_fpack16(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
		}

/* final partial store: bytes still held in dd0, limited to dend */
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

/* advance all planes and the destination to the next row */
		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
/* restore the unshifted alpha-skipping mask for the next row */
		emask = 0x7777;
	}
	__mlib_free(buf);
	return (MLIB_SUCCESS);
}
DEF_FUNC(mlib_ImageDivAlpha_U8, mlib_u8)
{
	mlib_d64 mask7FFF = vis_to_double_dup(0x7FFF7FFF);
	mlib_d64 *p_tbl;
	mlib_d64 *buffs, *buffd;
	mlib_d64 *sp, *dp;
	mlib_d64 ss, d0, d1, dd, a0, a1;
	mlib_s32 cmask = (1 << (channel - alpha - 1));
	mlib_s32 ww, dflag, i, j;

	vis_write_gsr(7 << 3);
	cmask |= (cmask << channel);
	cmask |= (cmask << 2 * channel);

	if (channel == 3) {
		p_tbl = (mlib_d64 *)mlib_DivAlpha_tbl;
	} else {
		p_tbl = (mlib_d64 *)mlib_DivAlpha_tbl4 + alpha * 256;
	}

	width *= channel;
	ww = (width + 7) / 8;

	if (channel == 3) {
		ww = 3 * ((ww + 2) / 3);
	}

	buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww);

	if (buffs == NULL) {
		return (MLIB_FAILURE);
	}

	buffd = buffs + ww;

	for (j = 0; j < height; j++) {
		mlib_u8 *ap = sl + alpha;

		if (((int)sl & 7)) {
			MEM_COPY(sl, buffs, width * sizeof (mlib_u8));
			sp = buffs;
		} else {
			sp = (mlib_d64 *)sl;
		}

		dflag = 0;

		if (((int)dl | width) & 7) {
			dp = buffd;
			dflag = 1;
		} else {
			dp = (mlib_d64 *)dl;
		}

		if (channel == 4) {
#pragma pipeloop(0)
			for (i = 0; i < ww; i++) {
				ss = *sp;
				GET_ALPHA(a0, sp, alpha);
				GET_ALPHA(a1, sp, alpha + 4);
				DIV_ALPHA(d0, vis_read_hi(ss), a0);
				DIV_ALPHA(d1, vis_read_lo(ss), a1);
				*dp = vis_fpack16_pair(d0, d1);
				sp++;
				dp++;
			}

		} else if (channel == 3) {
			mlib_d64 a0, a1, a2, aa;
			mlib_d64 b0, b1, b2, bb;
			mlib_d64 s0, s1, s2;
			mlib_d64 d0, d1;
			mlib_s32 cmask0, cmask1, cmask2;

			cmask0 = 0x492 >> alpha;
			cmask1 = 0x492 >> (alpha + 1);
			cmask2 = 0x492 >> (alpha + 2);

			vis_alignaddr((void *)0, 4);

			if (alpha == 0) {
#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					GET_ALPHA_3CH_0();
					DIV_ALPHA_3CH();
				}

				if (i < ww) {
					GET_ALPHA_3CH_0_NF();
					DIV_ALPHA_3CH_NF();
				}

			} else if (alpha == 1) {
#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					GET_ALPHA_3CH_1();
					DIV_ALPHA_3CH();
				}


				if (i < ww) {
					GET_ALPHA_3CH_1_NF();
					DIV_ALPHA_3CH_NF();
				}

			} else {	/* if (alpha == 2) */

#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					GET_ALPHA_3CH_2();
					DIV_ALPHA_3CH();
				}

				if (i < ww) {
					GET_ALPHA_3CH_2_NF();
					DIV_ALPHA_3CH_NF();
				}
			}

		} else {	/* if (channel == 2) */

#pragma pipeloop(0)
			for (i = 0; i < ww; i++) {
static mlib_status
mlib_v_VideoColorYUV2RGB444_nonalign(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 rgb_stride,
	mlib_s32 yuv_stride)
{
/* all. pointer to y, u, v */
	mlib_d64 *spy, *dfu, *dfv;

/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv, du0, du1, dv0, dv1;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
	mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2;

/* loop variable */
	mlib_s32 i, j;
	mlib_d64 *buf, BUFF[16 * 1024];
	mlib_u8 *tmp, *dp;

	if (width * 3 > 16 * 1024) {
		tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

		if (tmp == NULL)
			return (MLIB_FAILURE);
		buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
	} else {
		buf = (mlib_d64 *)BUFF;
	}

	dp = (mlib_u8 *)buf;
	ddp = (mlib_d64 *)dp;

	for (j = 0; j < height; j++) {

		dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0);
		du0 = (*dfu++);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

		dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0);
		dv0 = (*dfv++);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		spy = (mlib_d64 *)vis_alignaddr((void *)y, 0);
		dy0 = (*spy++);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);

		vis_alignaddr((void *)u, 0);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* Y*1.1644 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

		vis_alignaddr((void *)v, 0);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* Y*1.1644 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		g_hi = vis_fpadd16(g_hi, y_11644_hi);

		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
		g_lo = vis_fpadd16(g_lo, y_11644_lo);

		green = vis_fpack16_pair(g_hi, g_lo);
		b_hi = vis_fpadd16(b_hi, y_11644_hi);
		b_lo = vis_fpadd16(b_lo, y_11644_lo);

		blue = vis_fpack16_pair(b_hi, b_lo);
		r_hi = vis_fpadd16(r_hi, y_11644_hi);
		r_lo = vis_fpadd16(r_lo, y_11644_lo);

		red = vis_fpack16_pair(r_hi, r_lo);

		vis_alignaddr((void *)y, 0);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

			vis_write_bmask(0x0801902A, 0);
			dd0 = vis_bshuffle(red, green);
			vis_write_bmask(0x03B04C05, 0);
			dd1 = vis_bshuffle(red, green);
			vis_write_bmask(0xD06E07F0, 0);
			dd2 = vis_bshuffle(red, green);
			vis_write_bmask(0x01834967, 0);
			ddp[0] = vis_bshuffle(dd0, blue);
			vis_write_bmask(0xA12B45C7, 0);
			ddp[1] = vis_bshuffle(dd1, blue);
			vis_write_bmask(0x0D23E56F, 0);
			ddp[2] = vis_bshuffle(dd2, blue);

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);
			vis_alignaddr((void *)u, 0);
			du1 = vis_ld_d64_nf(dfu); dfu++;
			du = vis_faligndata(du0, du1);
			du0 = du1;

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
			vis_alignaddr((void *)v, 0);
			dv1 = vis_ld_d64_nf(dfv); dfv++;
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			g_hi = vis_fpadd16(g_hi, y_11644_hi);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			g_lo = vis_fpadd16(g_lo, y_11644_lo);

			green = vis_fpack16_pair(g_hi, g_lo);
			b_hi = vis_fpadd16(b_hi, y_11644_hi);
			b_lo = vis_fpadd16(b_lo, y_11644_lo);

			blue = vis_fpack16_pair(b_hi, b_lo);
			r_hi = vis_fpadd16(r_hi, y_11644_hi);
			r_lo = vis_fpadd16(r_lo, y_11644_lo);

			red = vis_fpack16_pair(r_hi, r_lo);

			vis_alignaddr((void *)y, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;

			ddp += 3;
		}

		dp = (mlib_u8 *)ddp;

		vis_alignaddr((void *)(width - i), 0);
		blue = vis_faligndata(blue, blue);
		green = vis_faligndata(green, green);
		red = vis_faligndata(red, red);
		dp += ((width - i - 1) * 3);

		vis_alignaddr((void *)spy, 7);
		for (; i < width; i++) {
			STORE_PIXEL(0, 1, 2);
			dp -= 3;
		}

		__mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3);

		rgb += rgb_stride;
		dp = (mlib_u8 *)buf;
		ddp = (mlib_d64 *)dp;
		y += yuv_stride;
		u += yuv_stride;
		v += yuv_stride;
	}

	if (width * 3 > 16 * 1024)
		__mlib_free(tmp);
	return (MLIB_SUCCESS);
}
Example #17
0
/*
 * __mlib_VectorConvert_U8_S8_Sat
 *
 * Converts n signed 8-bit elements of x into unsigned 8-bit elements
 * of z with saturation: negative inputs are stored as 0, non-negative
 * inputs are copied unchanged.
 *
 *	z	destination vector
 *	x	source vector
 *	n	number of elements
 *
 * The vector path below returns MLIB_SUCCESS.  Vectors shorter than
 * 16 elements are handed to the PACK_S_U macro, which is defined
 * outside this function (presumably it converts element by element
 * and returns - confirm against its definition).
 */
mlib_status
__mlib_VectorConvert_U8_S8_Sat(
	mlib_u8 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i, off;
	mlib_s8 c;
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 zero = vis_fzeros();

	if (length < 16) {
		PACK_S_U(mlib_s8, mlib_u8);
	}

/*
 * First, align the destination address to 8 bytes with scalar stores.
 * length is at least 16 here, so the (at most 7) iterations cannot
 * exhaust the vector.
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) < 0 ? 0 : c;
		length--;
	}

/*
 * Split the remainder into len_64 whole 8-byte groups (even_length
 * elements) and a scalar tail of rest_64 elements.
 */

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;

/*
 * GSR setting used by the vis_fpack16_pair() calls below, which
 * perform the saturating 16-bit to unsigned 8-bit packing.
 */
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peel one iteration when the group count is odd, so that the main
 * loop can always process two groups per pass.
 */

		i = len_64 & 1;
		if (i) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}

/*
 * Then loop with step==2: each pass converts two 8-byte groups.
 * Each source byte is merged with a zero byte, turned into a signed
 * 16-bit value by vis_fmul8sux16() and packed back to 8 bits.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}
	} else {

/*
 * Source address has arbitrary alignment.  The byte-shuffle mask is
 * built from the source offset (off), so one vis_bshuffle() both
 * realigns the data taken from two consecutive aligned words and
 * interleaves it; vis_faligndata(d3, d3) with the align offset set
 * to 1 then rotates the shuffled word by one byte to expose the
 * other four source bytes to vis_fmul8sux16().
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		off = (mlib_addr)src & 7;
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		d2 = (*dsrc++);

/*
 * Peel one iteration when the group count is odd.
 */

		i = len_64 & 1;
		if (i) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}

/*
 * Then loop with step==2.  d2 always carries the most recently
 * loaded aligned word into the next shuffle.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d6 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8sux16(d6, four_16_ones);
			d6 = vis_faligndata(d6, d6);
			d5 = vis_fmul8sux16(d6, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}
	}

/*
 * Scalar tail.  The vector loops advance dsrc/ddst only, so src and
 * dst still point at the start of the aligned region and even_length
 * indexes the first element that has not been converted yet.
 */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c = src[even_length + i]) < 0 ? 0 : c;

	return (MLIB_SUCCESS);
}
/*
 * FUNC(MxN) - ordered dithering of an 8-bit source image through an
 * m x n dither mask, followed by a per-line conversion to the
 * destination with mlib_ImageColorTrue2IndexLine_U8_BIT_1().
 *
 *	dst		destination image
 *	src		source image
 *	dmask		dither masks; only dmask[0] is used here
 *	m, n		mask width and height
 *	scale		mask values are multiplied by 2**(-scale)
 *	colormap	colormap handle passed on to the line converter
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when m or n is not positive
 * or when a work buffer cannot be allocated.
 *
 * A table of n rows of 16-bit offsets is built once: for mask element
 * (j, i) the offset is half_step0 - (mlib_s32)(dmask0[j * m + i] *
 * dscale0), repeated along the row every mstep elements.  Source row
 * j is widened to 16 bits, added to table row (j mod n), packed back
 * to 8 bits with saturation into pbuff and then converted.
 *
 * NOTE(review): the table fill writes elements i + k * mstep for
 * i < m only, which covers a whole row only when NCHAN == 1 - confirm
 * NCHAN for this instantiation.
 */
mlib_status FUNC(
    MxN) (
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 **dmask,
    mlib_s32 m,
    mlib_s32 n,
    mlib_s32 scale,
    const void *colormap)
{
	mlib_type stype, dtype;
	const mlib_s32 *dmask0 = dmask[0];
	mlib_u8 *sl, *dl;
	mlib_s32 schan, dchan, sll, dll, sw, sh, dw, dh, num_blk;
	mlib_s32 mstep, line_size, kern_size, xsize8, i, j, k;
	mlib_d64 *pbuff;
	mlib_s16 *kern, *pkern;
	mlib_d64 *dkern;
	mlib_d64 dscale, dscale0;
	mlib_d64 ss, d0, d1;
	mlib_f32 fzeros = vis_fzeros();
	mlib_s32 step0, half_step0, v0;
	mlib_s32 bit_offset = mlib_ImageGetBitOffset(dst);
	mlib_u8 *p_lut;

/*
 * m is used as a divisor below and n sizes the offset table, so
 * reject non-positive mask dimensions up front.
 */
	if (m <= 0 || n <= 0)
		return (MLIB_FAILURE);

	MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl);
	MLIB_IMAGE_GET_ALL_PARAMS(src, stype, schan, sw, sh, sll, sl);

/* distance between the first two entries of the inverse LUT */
	p_lut = (mlib_u8 *)mlib_ImageGetLutInversTable(colormap);
	step0 = abs(p_lut[1] - p_lut[0]);

/* number of mask repetitions needed to cover one source row */
	num_blk = (sw + (m - 1)) / m;
	mstep = m * NCHAN;
/* table row length in 16-bit elements, rounded up to a multiple of 8 */
	line_size = (mstep * num_blk + 7) & ~7;
/* number of 8-byte groups per source row */
	xsize8 = (NCHAN * sw + 7) / 8;

/* dscale = 2**(-scale), built in steps of 30 bits to keep shifts small */
	dscale = 1.0;
	while (scale > 30) {
		dscale *= 1.0 / (1 << 30);
		scale -= 30;
	}

	dscale /= (1 << scale);

	dscale0 = dscale * step0;
	half_step0 = (step0 - 1) >> 1;

	kern_size = n * line_size;
	kern = __mlib_malloc(kern_size * sizeof (mlib_s16));

	if (kern == NULL)
		return (MLIB_FAILURE);

/* fill the offset table: one row per mask row, mask repeated num_blk times */
	for (j = 0; j < n; j++) {
		for (i = 0; i < m; i++) {
			pkern = kern + j * line_size + i;
			v0 = half_step0 - (mlib_s32)(dmask0[j * m +
			    i] * dscale0);
			for (k = 0; k < num_blk; k++) {
				pkern[k * mstep] = v0;
			}
		}
	}

	pbuff = __mlib_malloc(xsize8 * sizeof (mlib_d64) + 16);

	if (pbuff == NULL) {
		__mlib_free(kern);
		return (MLIB_FAILURE);
	}

	pkern = kern;

/* GSR setting used by the vis_fpack16_pair() calls below */
	vis_write_gsr(7 << 3);

	for (j = 0; j < sh; j++) {
		dkern = (mlib_d64 *)pkern;

/*
 * Test the source row alignment through mlib_addr rather than
 * mlib_s32, so that the pointer is not converted to a narrower
 * integer type.
 */
		if ((mlib_addr)sl & 7) {
			mlib_u8 *sp = sl;

/* unaligned source row: fetch each 8-byte group with LOAD_NA_NF */
#pragma pipeloop(0)
			for (i = 0; i < xsize8; i++) {
				LOAD_NA_NF(ss, sp);
				d0 = vis_fpadd16(vis_fpmerge(fzeros,
				    vis_read_hi(ss)), dkern[2 * i]);
				d1 = vis_fpadd16(vis_fpmerge(fzeros,
				    vis_read_lo(ss)), dkern[2 * i + 1]);
				pbuff[i] = vis_fpack16_pair(d0, d1);
				sp += 8;
			}

		} else {
			mlib_d64 *sp = (mlib_d64 *)sl;

/* aligned source row: plain 8-byte loads */
#pragma pipeloop(0)
			for (i = 0; i < xsize8; i++) {
				ss = sp[i];
				d0 = vis_fpadd16(vis_fpmerge(fzeros,
				    vis_read_hi(ss)), dkern[2 * i]);
				d1 = vis_fpadd16(vis_fpmerge(fzeros,
				    vis_read_lo(ss)), dkern[2 * i + 1]);
				pbuff[i] = vis_fpack16_pair(d0, d1);
			}
		}

/* advance to the next table row, wrapping after the n-th row */
		pkern += line_size;

		if (pkern >= kern + kern_size)
			pkern = kern;

		mlib_ImageColorTrue2IndexLine_U8_BIT_1((mlib_u8 *)pbuff, dl,
		    bit_offset, sw, colormap);

		sl += sll;
		dl += dll;
	}

	__mlib_free(pbuff);
	__mlib_free(kern);

	return (MLIB_SUCCESS);
}