/*
 * Computes one line of 16-bit intermediate results into buff. Judging by
 * the name this is the table-driven affine kernel for mlib_u8 sources with
 * a 3-row filter; the tail loop below does combine three source rows.
 *
 * The main loop handles two pixels per iteration entirely through macros;
 * the tail loop handles any remaining pixels one at a time.
 *
 * buff     - output buffer. The main loop stores through buff1, the tail
 *            stores 16-bit values through dstPixelPtr.
 * filterX, filterY, lineAddr, ws
 *          - not referenced directly in this body; presumably consumed by
 *            the macros -- confirm against their definitions.
 *
 * NOTE(review): DECLAREVAR, DECLAREVAR2, CALC_2_SRC_PTR, LOAD_3x2,
 * FILTER_MERGE_4x2, MAKE_4x2, CALC_SRC_PTR, LOAD_FILTERS, LOAD_PIXEL_3 and
 * vis_scale are defined outside this excerpt. They presumably declare and
 * update i, size, buff1, res, res1, sum, v0..v3, fx0, xFilter, yFilter,
 * row00, row10, sPtr and dstPixelPtr -- confirm.
 */
void
mlib_v_ImageAffineTableLine_8nw_3_2_1(
    mlib_d64 *buff,
    const mlib_d64 *filterX,
    const mlib_d64 *filterY,
    const mlib_u8 **lineAddr,
    mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;
	/*
	 * The locals below are not all used by the statements visible here;
	 * presumably the macros reference them by name -- confirm before
	 * removing any of them.
	 */
	mlib_d64 yFilter2;
	mlib_d64 yFilter3;
	mlib_d64 row20, row30;
	mlib_d64 *dpSrc;
	mlib_d64 data0, data1, zero;

	/*
	 * One 64-bit GSR write: 0x0145ABEF goes to the upper word and 4 to the
	 * low bits. NOTE(review): per the VIS2 GSR layout these would be the
	 * byte-shuffle mask and the align offset -- confirm.
	 */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;

	zero = vis_to_double_dup(0);

	/* Main loop: two pixels per iteration, one res1 store per pair. */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_3x2;
		FILTER_MERGE_4x2;
		MAKE_4x2;
		*buff1 = res1;
		buff1++;
	}

	/* Continue 16-bit output right after the last pair written above. */
	dstPixelPtr = (mlib_s16 *)buff1;

	/* Tail loop: remaining pixels, one at a time. */
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, yFilter);
		xFilter = vis_write_hi(xFilter, fx0);
		LOAD_PIXEL_3;

		/*
		 * Vertical pass: scale rows 0, 1 and 2 by three 16-bit
		 * coefficients taken from yFilter (upper and lower half of its
		 * high word, upper half of its low word) and accumulate.
		 */
		v0 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter));
		v1 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter));
		sum = vis_fpadd16(v0, v1);
		v0 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter));
		sum = vis_fpadd16(v0, sum);

		/*
		 * Horizontal pass: the fmul8sux16/fmul8ulx16 pair added together
		 * is the usual VIS way to form 16x16 products of sum and xFilter.
		 */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);
		/*
		 * Widen the high half of v3 with vis_scale, then add the two
		 * 32-bit halves so both column contributions end up in one value.
		 */
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_hi(v3));
		res =
		    vis_write_lo(res, vis_fpadd32s(vis_read_hi(v2),
		    vis_read_lo(v2)));

		/* Store one 16-bit result and advance the output pointer. */
		vis_st_u16(res, dstPixelPtr++);
	}
}
/*
 * Example no. 2
 * 0
 */
/*
 * Averages adjacent byte pairs of src into dst.
 *
 * Each output byte is (src[2k] + src[2k + 1] + r) >> 1, where the rounding
 * term r alternates 0, 1, 0, 1, ... from one output byte to the next.
 *
 * dst - destination; accessed as mlib_f32 in the vector loop, so it is
 *       assumed to be 4-byte aligned.
 * src - source; accessed as mlib_d64 in the vector loop, so it is assumed
 *       to be 8-byte aligned.
 * n   - number of source bytes. For odd n the last tail iteration reads
 *       src[n].
 *
 * Returns MLIB_FAILURE when n <= 0, otherwise MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in64 = (mlib_d64 *)src;
	mlib_f32 *out32 = (mlib_f32 *)dst;
	/* 16-bit lanes 0, 1, 0, 1: the alternating rounding term. */
	mlib_d64 rnd = vis_to_double_dup(0x1);
	/* 0x0100 in the upper half: multiplier used to widen bytes to 16 bits. */
	mlib_f32 unit = vis_to_float(0x1000000);
	mlib_d64 chunk, mixed, split, left16, right16;
	mlib_u8 *out8;
	mlib_s32 pos, carry;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* GSR scale factor 6, used by vis_fpack16 below. */
	vis_write_gsr(6 << 3);

	/* Vector part: 8 source bytes in, 4 destination bytes out. */
#pragma pipeloop(0)
	for (pos = 0; pos <= n - 8; pos += 8) {
		chunk = *in64++;

		/*
		 * Two merge passes reorder the eight bytes so that the
		 * even-indexed ones land in the high word and the odd-indexed
		 * ones in the low word.
		 */
		mixed = vis_fpmerge(vis_read_hi(chunk), vis_read_lo(chunk));
		split = vis_fpmerge(vis_read_hi(mixed), vis_read_lo(mixed));

		left16 = vis_fmul8x16au(vis_read_hi(split), unit);
		right16 = vis_fmul8x16au(vis_read_lo(split), unit);

		/* Sum each pair, add the rounding term, pack back to bytes. */
		*out32++ = vis_fpack16(
		    vis_fpadd16(vis_fpadd16(left16, right16), rnd));
	}

	/* Scalar part: remaining pairs, rounding term restarts at 0. */
	out8 = (mlib_u8 *)out32;
	carry = 0;

	while (pos < n) {
		*out8++ = (src[pos] + src[pos + 1] + carry) >> 1;
		/* toggle 0 <-> 1 */
		carry ^= 1;
		pos += 2;
	}

	return (MLIB_SUCCESS);
}
/*
 * Computes one line of 16-bit intermediate results into buff. Judging by
 * the name this is the table-driven affine kernel for mlib_u8 sources with
 * a 2x2 filter; the tail loop below does read a 2x2 pixel neighbourhood.
 *
 * The main loop handles two pixels per iteration entirely through macros;
 * the tail loop handles any remaining pixels one at a time.
 *
 * buff     - output buffer. The main loop stores through buff1, the tail
 *            stores 16-bit values through dstPixelPtr.
 * filterX, filterY, lineAddr, ws
 *          - not referenced directly in this body; presumably consumed by
 *            the macros -- confirm against their definitions.
 *
 * NOTE(review): DECLAREVAR, DECLAREVAR2, CALC_2_SRC_PTR, LOAD_2x2,
 * FILTER_MERGE, MAKE_2x2, CALC_SRC_PTR, LOAD_FILTERS, LD_U8 and vis_scale
 * are defined outside this excerpt. They presumably declare and update i,
 * size, buff1, res, res1, sum, v0..v3, fx0, fy0, xFilter, row00, row10,
 * sPtr, srcStride and dstPixelPtr -- confirm.
 */
void
mlib_v_ImageAffineTableLine_8nw_2_2_1(
    mlib_d64 *buff,
    const mlib_d64 *filterX,
    const mlib_d64 *filterY,
    const mlib_u8 **lineAddr,
    mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;

	/*
	 * One 64-bit GSR write: 0x0145ABEF goes to the upper word and 4 to the
	 * low bits. NOTE(review): per the VIS2 GSR layout these would be the
	 * byte-shuffle mask and the align offset -- confirm.
	 */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;

	/* Main loop: two pixels per iteration, one res1 store per pair. */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_2x2(row00, row10);
		FILTER_MERGE;
		MAKE_2x2;
		*buff1 = res1;
		buff1++;
	}

	/* Continue 16-bit output right after the last pair written above. */
	dstPixelPtr = (mlib_s16 *)buff1;

	/* Tail loop: remaining pixels, one at a time. */
#pragma pipeloop(0)
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, fy0);
		xFilter = vis_write_lo(xFilter, fx0);

		/*
		 * Gather the 2x2 neighbourhood: two horizontally adjacent bytes
		 * from the current row and two from the row srcStride below.
		 */
		row00 = vis_fpmerge(LD_U8(sPtr, 0), LD_U8(sPtr, 1));
		row10 =
		    vis_fpmerge(LD_U8(sPtr, srcStride), LD_U8(sPtr,
		    srcStride + 1));

		/*
		 * Vertical pass: row 0 times the upper 16 bits of fy0 plus
		 * row 1 times the lower 16 bits of fy0.
		 */
		v0 = vis_fmul8x16au(vis_read_lo(row00), fy0);
		v1 = vis_fmul8x16al(vis_read_lo(row10), fy0);
		sum = vis_fpadd16(v0, v1);
		/*
		 * Horizontal pass: the fmul8sux16/fmul8ulx16 pair added together
		 * is the usual VIS way to form 16x16 products of sum and xFilter.
		 */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);
		/*
		 * Widen the low half of v3 with vis_scale, then add the two
		 * 32-bit halves so both column contributions end up in one value.
		 */
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_lo(v3));
		res =
		    vis_write_lo(res, vis_fpadd32s(vis_read_hi(v2),
		    vis_read_lo(v2)));

		/* Store one 16-bit result and advance the output pointer. */
		vis_st_u16(res, dstPixelPtr++);
	}
}
/*
 * Example no. 4
 * 0
 */
/*
 * YUV 4:4:4 to packed RGB conversion for arbitrarily aligned inputs.
 *
 * Each row is converted 8 pixels at a time into an 8-byte-aligned scratch
 * line and then copied to rgb with __mlib_VectorCopy_U8 (width * 3 bytes).
 * The y, u and v planes are read through vis_alignaddr/vis_faligndata, so
 * they need no particular alignment. The loop is software-pipelined: colour
 * values for the next 8 pixels are computed while the current 8 are stored.
 *
 * rgb        - destination, 3 bytes per pixel in R, G, B order
 * y, u, v    - source planes, one byte per pixel each
 * width      - pixels per row
 * height     - number of rows
 * rgb_stride - byte step between destination rows
 * yuv_stride - byte step between rows, applied to all three source planes
 *
 * Returns MLIB_FAILURE if the scratch line cannot be allocated, otherwise
 * MLIB_SUCCESS.
 *
 * NOTE(review): this function never sets the GSR scale factor that
 * vis_fpack16_pair depends on; presumably the caller does -- confirm.
 * NOTE(review): STORE_PIXEL is defined outside this excerpt; it presumably
 * stores one R, G, B triple at dp from red/green/blue and rotates them.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB444_nonalign(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 rgb_stride,
	mlib_s32 yuv_stride)
{
/* aligned read pointers into y, u, v */
	mlib_d64 *spy, *dfu, *dfv;

/* y data */
	mlib_d64 dy0, dy1, dy3;
/* u, v data: current 8 samples (du, dv) and the two raw words they span */
	mlib_d64 du, dv, du0, du1, dv0, dv1;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 (low half; the high half is 0x0100) */
	mlib_f32 k5 = vis_to_float(0x1004097);
/* constant offsets 222.9952, 135.6352 and 276.9856, each times 32 */
	mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2;

/* loop variable */
	mlib_s32 i, j;
/* scratch line: on-stack BUFF, or heap when width * 3 exceeds 16 * 1024 */
	mlib_d64 *buf, BUFF[16 * 1024];
	mlib_u8 *tmp, *dp;

	if (width * 3 > 16 * 1024) {
		/* 7 extra bytes leave room to round the pointer up to 8 bytes */
		tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

		if (tmp == NULL)
			return (MLIB_FAILURE);
		buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
	} else {
		buf = (mlib_d64 *)BUFF;
	}

	dp = (mlib_u8 *)buf;
	ddp = (mlib_d64 *)dp;

	for (j = 0; j < height; j++) {

/* pipeline prologue: fetch the first 8 u, v and y samples of the row */
		dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0);
		du0 = (*dfu++);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

		dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0);
		dv0 = (*dfv++);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		spy = (mlib_d64 *)vis_alignaddr((void *)y, 0);
		dy0 = (*spy++);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);

/* fetch the next 8 u samples; the alignment offset must be restored first */
		vis_alignaddr((void *)u, 0);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* Y*1.1644 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* fetch the next 8 v samples */
		vis_alignaddr((void *)v, 0);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* Y*1.1644 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		g_hi = vis_fpadd16(g_hi, y_11644_hi);

		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
		g_lo = vis_fpadd16(g_lo, y_11644_lo);

/* pack the first 8 pixels' 16-bit sums down to bytes, one plane each */
		green = vis_fpack16_pair(g_hi, g_lo);
		b_hi = vis_fpadd16(b_hi, y_11644_hi);
		b_lo = vis_fpadd16(b_lo, y_11644_lo);

		blue = vis_fpack16_pair(b_hi, b_lo);
		r_hi = vis_fpadd16(r_hi, y_11644_hi);
		r_lo = vis_fpadd16(r_lo, y_11644_lo);

		red = vis_fpack16_pair(r_hi, r_lo);

/* fetch the next 8 y samples */
		vis_alignaddr((void *)y, 0);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/*
 * 8-pixel column loop: store the 8 pixels already held in red/green/blue,
 * then compute the following 8.
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/*
 * Interleave to R G B R G B ...: the first three shuffles place red and
 * green bytes, the last three drop the blue bytes into the gaps, giving
 * 24 output bytes in ddp[0..2].
 */
			vis_write_bmask(0x0801902A, 0);
			dd0 = vis_bshuffle(red, green);
			vis_write_bmask(0x03B04C05, 0);
			dd1 = vis_bshuffle(red, green);
			vis_write_bmask(0xD06E07F0, 0);
			dd2 = vis_bshuffle(red, green);
			vis_write_bmask(0x01834967, 0);
			ddp[0] = vis_bshuffle(dd0, blue);
			vis_write_bmask(0xA12B45C7, 0);
			ddp[1] = vis_bshuffle(dd1, blue);
			vis_write_bmask(0x0D23E56F, 0);
			ddp[2] = vis_bshuffle(dd2, blue);

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);
			vis_alignaddr((void *)u, 0);
			du1 = vis_ld_d64_nf(dfu); dfu++;
			du = vis_faligndata(du0, du1);
			du0 = du1;

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
			vis_alignaddr((void *)v, 0);
			dv1 = vis_ld_d64_nf(dfv); dfv++;
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			g_hi = vis_fpadd16(g_hi, y_11644_hi);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			g_lo = vis_fpadd16(g_lo, y_11644_lo);

			green = vis_fpack16_pair(g_hi, g_lo);
			b_hi = vis_fpadd16(b_hi, y_11644_hi);
			b_lo = vis_fpadd16(b_lo, y_11644_lo);

			blue = vis_fpack16_pair(b_hi, b_lo);
			r_hi = vis_fpadd16(r_hi, y_11644_hi);
			r_lo = vis_fpadd16(r_lo, y_11644_lo);

			red = vis_fpack16_pair(r_hi, r_lo);

			vis_alignaddr((void *)y, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;

			ddp += 3;
		}

		dp = (mlib_u8 *)ddp;

/*
 * Tail: fewer than 8 pixels remain, already computed in red/green/blue.
 * Rotate each by (width - i) bytes, point dp at the last remaining pixel
 * and store the pixels back to front.
 */
		vis_alignaddr((void *)(width - i), 0);
		blue = vis_faligndata(blue, blue);
		green = vis_faligndata(green, green);
		red = vis_faligndata(red, red);
		dp += ((width - i - 1) * 3);

/* spy is 8-byte aligned, so this sets the alignment offset to 7 */
		vis_alignaddr((void *)spy, 7);
		for (; i < width; i++) {
			STORE_PIXEL(0, 1, 2);
			dp -= 3;
		}

/* copy the finished scratch line to the (possibly unaligned) destination */
		__mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3);

/* advance to the next row and rewind the scratch-line pointers */
		rgb += rgb_stride;
		dp = (mlib_u8 *)buf;
		ddp = (mlib_d64 *)dp;
		y += yuv_stride;
		u += yuv_stride;
		v += yuv_stride;
	}

/* same condition as the allocation above, so tmp is valid here */
	if (width * 3 > 16 * 1024)
		__mlib_free(tmp);
	return (MLIB_SUCCESS);
}
/*
 * Example no. 5
 * 0
 */
/*
 * Averages adjacent byte pairs of src into dst (VIS2 byte-shuffle version).
 *
 * Sixteen source bytes produce eight destination bytes per main-loop
 * iteration. A final partial group is written with a partial store whose
 * edge mask ends at dst + n / 2 - 1, so n / 2 bytes are written in total.
 *
 * dst - destination; accessed as mlib_d64, so it is assumed to be 8-byte
 *       aligned.
 * src - source; accessed as mlib_d64, so it is assumed to be 8-byte
 *       aligned.
 * n   - number of source bytes.
 *
 * Returns MLIB_FAILURE when n <= 0, otherwise MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in64 = (mlib_d64 *)src;
	mlib_d64 *out64 = (mlib_d64 *)dst;
	mlib_d64 raw, split, even16, odd16, first4, last4;
	/* 16-bit lanes 0, 1, 0, 1: alternating rounding term. */
	mlib_d64 rnd = vis_to_double_dup(0x1);
	/* 0x0100 in the upper half: multiplier used to widen bytes to 16 bits. */
	mlib_f32 unit = vis_to_float(0x1000000);
	mlib_s32 pos, mask;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* GSR scale factor 6, used by vis_fpack16_pair below. */
	vis_write_gsr(6 << 3);
	/* Shuffle mask: bytes 0,2,4,6 to the high word, 1,3,5,7 to the low. */
	vis_write_bmask(0x02461357, 0);

#pragma pipeloop(0)
	for (pos = 0; pos <= n - 16; pos += 16) {
		/* first 8 source bytes -> 4 sums */
		raw = *in64++;
		split = vis_bshuffle(raw, raw);
		even16 = vis_fmul8x16au(vis_read_hi(split), unit);
		odd16 = vis_fmul8x16au(vis_read_lo(split), unit);
		first4 = vis_fpadd16(vis_fpadd16(even16, odd16), rnd);

		/* second 8 source bytes -> 4 sums */
		raw = *in64++;
		split = vis_bshuffle(raw, raw);
		even16 = vis_fmul8x16au(vis_read_hi(split), unit);
		odd16 = vis_fmul8x16au(vis_read_lo(split), unit);
		last4 = vis_fpadd16(vis_fpadd16(even16, odd16), rnd);

		*out64++ = vis_fpack16_pair(first4, last4);
	}

	/* Nothing left over: done. */
	if (pos >= n)
		return (MLIB_SUCCESS);

	/* Leftover group: same arithmetic, but only part of it is stored. */
	raw = *in64++;
	split = vis_bshuffle(raw, raw);
	even16 = vis_fmul8x16au(vis_read_hi(split), unit);
	odd16 = vis_fmul8x16au(vis_read_lo(split), unit);
	first4 = vis_fpadd16(vis_fpadd16(even16, odd16), rnd);

	/* Second word is fetched with the non-faulting load variant. */
	raw = vis_ld_d64_nf(in64);
	split = vis_bshuffle(raw, raw);
	even16 = vis_fmul8x16au(vis_read_hi(split), unit);
	odd16 = vis_fmul8x16au(vis_read_lo(split), unit);
	last4 = vis_fpadd16(vis_fpadd16(even16, odd16), rnd);

	/* Byte mask covering out64 up to the last valid destination byte. */
	mask = vis_edge8(out64, (dst + (n / 2) - 1));
	vis_pst_8(vis_fpack16_pair(first4, last4), out64, mask);

	return (MLIB_SUCCESS);
}
/*
 * Example no. 6
 * 0
 */
/*
 * YUV 4:1:1 to packed RGB conversion for arbitrarily aligned inputs.
 *
 * Each row is converted into an 8-byte-aligned scratch line (16 pixels per
 * main-loop iteration, then an optional 8-pixel step, then a per-pixel
 * tail) and copied to the destination with __mlib_VectorCopy_U8
 * (width * 3 bytes). The y, u and v planes are read through
 * vis_alignaddr/vis_faligndata, so they need no particular alignment.
 * Four u and four v samples are consumed per 16 y samples; each chroma
 * sample is replicated over four consecutive pixels.
 *
 * rgb        - destination, 3 bytes per pixel in R, G, B order
 * y, u, v    - source planes
 * width      - pixels per row
 * height     - number of rows
 * rgb_stride - byte step between destination rows
 * y_stride   - byte step between rows of y
 * uv_stride  - byte step between rows of u and of v
 *
 * Returns MLIB_FAILURE if the scratch line cannot be allocated (previously
 * the NULL result was used unchecked), otherwise MLIB_SUCCESS.
 *
 * NOTE(review): STORE_PIXEL is defined outside this excerpt; it presumably
 * stores one R, G, B triple at dp from dr/dg/db and rotates them.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB411_nonalign(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 rgb_stride,
    mlib_s32 y_stride,
    mlib_s32 uv_stride)
{
    /* pointers to src address */
    mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;

    /* pointers to dst address */
    mlib_u8 *dp, *dl;

    /* aligned read pointer into y */
    mlib_d64 *spy;

    /* aligned read pointers into u, v */
    mlib_d64 *dfu, *dfv;

    /* u, v data */
    mlib_f32 fu, fv;

    /* y data */
    mlib_d64 dy0, dy1, dy2, dy3;
    mlib_d64 ddy1, ddy2, ddy3, ddy4;
    mlib_d64 du0, du1, fu0, fu1;
    mlib_d64 dv1, dv2, fv0, fv1;
    mlib_d64 dr, dr1, dr2, dr3, dr4;
    mlib_d64 dg, dg1, dg2, dg3, dg4;
    mlib_d64 db, db1, db2, db3, db4;
    mlib_d64 dtmp;

    /* 1.1644  * 4096 */
    mlib_f32 f0 = vis_to_float(0x12a1);

    /* 2.0184  * 8192 */
    mlib_f32 f1 = vis_to_float(0x4097);

    /* -0.3920 * 8192 */
    mlib_f32 f4 = vis_to_float(0xf375);

    /* -0.8132 * 8192 */
    mlib_f32 f5 = vis_to_float(0xe5fa);

    /* 1.5966  * 8192 */
    mlib_f32 f8 = vis_to_float(0x3317);

    /* -276.9856 * 32 */
    mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

    /* 135.6352  * 32 */
    mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

    /* -222.9952 * 32 */
    mlib_d64 doff2 = vis_to_double_dup(0xe420e420);

    /* four 0x80 bytes: multiplier used to spread one chroma term over 4 lanes */
    mlib_f32 fscale = vis_to_float(0x80808080);

    /* loop variable */
    mlib_s32 i, j;

    /* scratch line: on-stack BUFF, or heap when width * 3 exceeds 16 * 1024 */
    mlib_d64 *buf, BUFF[16 * 1024];
    mlib_d64 *ddp, dd01, dd11, dd21, dd02, dd12, dd22;
    mlib_u8 *tmp;

    if (width * 3 > 16 * 1024) {
        /* 7 extra bytes leave room to round the pointer up to 8 bytes */
        tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

        /* fail cleanly instead of writing through an aligned NULL */
        if (tmp == NULL)
            return (MLIB_FAILURE);
        buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
    } else {
        buf = (mlib_d64 *)BUFF;
    }

    /*
     * initialize GSR scale factor
     */
    vis_write_gsr(3 << 3);

    sp1 = sl1 = (mlib_u8 *)y;
    sp2 = sl2 = (mlib_u8 *)u;
    sp3 = sl3 = (mlib_u8 *)v;

    dp = (mlib_u8 *)buf;
    dl = rgb;
    ddp = (mlib_d64 *)dp;

    /*
     * row loop
     */
    for (j = 0; j < height; j++) {
        /*
         * pipeline prologue: fetch 4 u, 4 v and 16 y samples and compute
         * the first 16 pixels of the row
         */
        spy = (mlib_d64 *)vis_alignaddr(sp1, 0);

        dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
        fu0 = (*dfu++);
        fu1 = vis_ld_d64_nf(dfu);
        dfu++;
        fu = vis_read_hi(vis_faligndata(fu0, fu1));
        sp2 += 4;

        dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
        fv0 = (*dfv++);
        fv1 = vis_ld_d64_nf(dfv);
        dfv++;
        fv = vis_read_hi(vis_faligndata(fv0, fv1));
        sp3 += 4;

        dy0 = (*spy++);
        dy3 = vis_ld_d64_nf(spy);
        spy++;
        /* restore the y alignment offset clobbered by the u/v setup */
        vis_alignaddr(sp1, 0);
        dy1 = vis_faligndata(dy0, dy3);
        dy0 = vis_ld_d64_nf(spy);
        spy++;
        dy2 = vis_faligndata(dy3, dy0);

        /* chroma-only terms, one 16-bit lane per chroma sample */
        du0 = vis_fmul8x16al(fu, f1);
        db = vis_fpadd16(du0, doff0);

        du1 = vis_fmul8x16al(fu, f4);
        dv1 = vis_fmul8x16al(fv, f5);
        dtmp = vis_fpadd16(du1, dv1);
        dg = vis_fpadd16(dtmp, doff1);

        dv2 = vis_fmul8x16al(fv, f8);
        dr = vis_fpadd16(dv2, doff2);

        /* luma terms for the 16 pixels, 4 per register */
        ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
        ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

        ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
        ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

        /*
         * spread each of the 4 chroma lanes across 4 pixels (au/al select
         * the lane) and add the luma term
         */
        db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
        db1 = vis_fpadd16(ddy1, db1);

        db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
        db2 = vis_fpadd16(ddy2, db2);

        db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
        db3 = vis_fpadd16(ddy3, db3);

        db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
        db4 = vis_fpadd16(ddy4, db4);

        dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
        dg1 = vis_fpadd16(ddy1, dg1);

        dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
        dg2 = vis_fpadd16(ddy2, dg2);

        dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
        dg3 = vis_fpadd16(ddy3, dg3);

        dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
        dg4 = vis_fpadd16(ddy4, dg4);

        dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
        dr1 = vis_fpadd16(ddy1, dr1);

        dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
        dr2 = vis_fpadd16(ddy2, dr2);

        dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
        dr3 = vis_fpadd16(ddy3, dr3);

        dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
        dr4 = vis_fpadd16(ddy4, dr4);

        /* pack to bytes: db/dr/dg = pixels 0..7, db1/dr1/dg1 = pixels 8..15 */
        db = vis_fpack16_pair(db1, db2);
        db1 = vis_fpack16_pair(db3, db4);

        dr = vis_fpack16_pair(dr1, dr2);
        dr1 = vis_fpack16_pair(dr3, dr4);

        dg = vis_fpack16_pair(dg1, dg2);
        dg1 = vis_fpack16_pair(dg3, dg4);

        /* prefetch the next 4 u and 4 v samples */
        dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
        fu0 = vis_ld_d64_nf(dfu);
        dfu++;
        fu1 = vis_ld_d64_nf(dfu);
        dfu++;
        fu = vis_read_hi(vis_faligndata(fu0, fu1));
        sp2 += 4;

        dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
        fv0 = vis_ld_d64_nf(dfv);
        dfv++;
        fv1 = vis_ld_d64_nf(dfv);
        dfv++;
        fv = vis_read_hi(vis_faligndata(fv0, fv1));
        sp3 += 4;

        /*
         * 16-pixel column loop: store the 16 pixels already computed, then
         * compute the following 16
         */
#pragma pipeloop(0)
        for (i = 0; i <= width - 16; i += 16) {

            /*
             * interleave to R G B R G B ...: the first three masks place
             * red and green bytes, the last three drop blue into the gaps
             */
            vis_write_bmask(0x0801902A, 0);
            dd01 = vis_bshuffle(dr, dg);
            dd02 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0x03B04C05, 0);
            dd11 = vis_bshuffle(dr, dg);
            dd12 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0xD06E07F0, 0);
            dd21 = vis_bshuffle(dr, dg);
            dd22 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0x01834967, 0);
            ddp[0] = vis_bshuffle(dd01, db);
            ddp[3] = vis_bshuffle(dd02, db1);
            vis_write_bmask(0xA12B45C7, 0);
            ddp[1] = vis_bshuffle(dd11, db);
            ddp[4] = vis_bshuffle(dd12, db1);
            vis_write_bmask(0x0D23E56F, 0);
            ddp[2] = vis_bshuffle(dd21, db);
            ddp[5] = vis_bshuffle(dd22, db1);

            dy3 = vis_ld_d64_nf(spy);
            spy++;
            vis_alignaddr(sp1, 0);
            dy1 = vis_faligndata(dy0, dy3);
            dy0 = vis_ld_d64_nf(spy);
            spy++;
            dy2 = vis_faligndata(dy3, dy0);

            du0 = vis_fmul8x16al(fu, f1);
            db = vis_fpadd16(du0, doff0);

            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);

            dv2 = vis_fmul8x16al(fv, f8);
            dr = vis_fpadd16(dv2, doff2);

            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

            ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
            ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);

            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);

            db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
            db3 = vis_fpadd16(ddy3, db3);

            db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
            db4 = vis_fpadd16(ddy4, db4);

            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);

            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);

            dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
            dg3 = vis_fpadd16(ddy3, dg3);

            dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
            dg4 = vis_fpadd16(ddy4, dg4);

            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);

            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);

            dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
            dr3 = vis_fpadd16(ddy3, dr3);

            dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
            dr4 = vis_fpadd16(ddy4, dr4);

            db = vis_fpack16_pair(db1, db2);
            db1 = vis_fpack16_pair(db3, db4);

            dr = vis_fpack16_pair(dr1, dr2);
            dr1 = vis_fpack16_pair(dr3, dr4);

            dg = vis_fpack16_pair(dg1, dg2);
            dg1 = vis_fpack16_pair(dg3, dg4);

            dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
            fu0 = vis_ld_d64_nf(dfu);
            dfu++;
            fu1 = vis_ld_d64_nf(dfu);
            dfu++;
            fu = vis_read_hi(vis_faligndata(fu0, fu1));
            sp2 += 4;

            dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
            fv0 = vis_ld_d64_nf(dfv);
            dfv++;
            fv1 = vis_ld_d64_nf(dfv);
            dfv++;
            fv = vis_read_hi(vis_faligndata(fv0, fv1));
            sp3 += 4;

            ddp += 6;
        }

        /*
         * 8..15 pixels left: store the first 8 of the precomputed 16 and
         * move the second 8 into db/dr/dg for the per-pixel tail
         */
        if (i <= width - 8) {
            vis_write_bmask(0x0801902A, 0);
            dd01 = vis_bshuffle(dr, dg);
            vis_write_bmask(0x03B04C05, 0);
            dd11 = vis_bshuffle(dr, dg);
            vis_write_bmask(0xD06E07F0, 0);
            dd21 = vis_bshuffle(dr, dg);
            vis_write_bmask(0x01834967, 0);
            ddp[0] = vis_bshuffle(dd01, db);
            vis_write_bmask(0xA12B45C7, 0);
            ddp[1] = vis_bshuffle(dd11, db);
            vis_write_bmask(0x0D23E56F, 0);
            ddp[2] = vis_bshuffle(dd21, db);
            db = db1;
            dr = dr1;
            dg = dg1;
            ddp += 3;
            i += 8;
        }

        dp = (mlib_u8 *)ddp;

        /*
         * tail: fewer than 8 pixels remain in db/dg/dr; rotate each by
         * (width - i) bytes, point dp at the last remaining pixel and
         * store the pixels back to front
         */
        vis_alignaddr((void *)(width - i), 0);
        db = vis_faligndata(db, db);
        dg = vis_faligndata(dg, dg);
        dr = vis_faligndata(dr, dr);
        dp += ((width - i - 1) * 3);

        vis_alignaddr((void *)7, 0);
        for (; i < width; i++) {
            STORE_PIXEL(0, 1, 2);
            dp -= 3;
        }

        /* advance the source rows */
        sp1 = sl1 = sl1 + y_stride;
        sp2 = sl2 = sl2 + uv_stride;
        sp3 = sl3 = sl3 + uv_stride;

        /* copy the finished scratch line to the destination row */
        __mlib_VectorCopy_U8(dl, (mlib_u8 *)buf, width * 3);

        /* next destination row; rewind the scratch-line pointers */
        dl += rgb_stride;
        dp = (mlib_u8 *)buf;
        ddp = (mlib_d64 *)dp;
    }

    /* same condition as the allocation above, so tmp is valid here */
    if (width * 3 > 16 * 1024)
        __mlib_free(tmp);

    return (MLIB_SUCCESS);
}
/*
 * Example no. 7
 * 0
 */
static mlib_status
mlib_v_VideoColorYUV2RGB420_nonalign(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 rgb_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to src address */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* pointers to src address */
	mlib_u8 *sp11, *sp12, *sl11, *sl12;

/* pointers to dst address */
	mlib_u8 *dp1, *dl1;

/* pointers to dst address */
	mlib_u8 *dp2, *dl2;

/* all. pointer to y */
	mlib_d64 *spy1, *spy2;

/* all. pointers to u, v */
	mlib_f32 *dfu, *dfv;

/* y data */
	mlib_d64 dy0, dy1, dy2, dy3, dy4, dy5;

/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;
	mlib_d64 du, dv, du0, du1, dv0, dv1;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 z_11644_hi, z_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
/* loop variables */
	mlib_s32 i, j;
	mlib_s32 y_stride2 = 2 * y_stride;
	mlib_s32 rgb_stride2 = 2 * rgb_stride;
	mlib_s32 off2, off3;
	mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21;
	mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22;
	mlib_d64 *buf1, BUFF1[16 * 1024];
	mlib_d64 *buf2, BUFF2[16 * 1024];
	mlib_u8 *tmp1, *tmp2;

	if (width * 3 > 16 * 1024) {
		tmp1 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);
		tmp2 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);
		buf1 = (mlib_d64 *)((mlib_addr)(tmp1 + 7) & ~7);
		buf2 = (mlib_d64 *)((mlib_addr)(tmp2 + 7) & ~7);
	} else {
		buf1 = (mlib_d64 *)BUFF1;
		buf2 = (mlib_d64 *)BUFF2;
	}

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp11 = sl11 = (mlib_u8 *)y;
	sp12 = sl12 = (mlib_u8 *)y + y_stride;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dp1 = (mlib_u8 *)buf1;
	dp2 = (mlib_u8 *)buf2;
	dl1 = (mlib_u8 *)rgb;
	dl2 = (mlib_u8 *)(rgb + rgb_stride);
	ddp1 = (mlib_d64 *)dp1;
	ddp2 = (mlib_d64 *)dp2;

/*
 * row loop
 */
	for (j = 0; j < height / 2; j++) {
		spy1 = (mlib_d64 *)vis_alignaddr(sp11, 0);
		spy2 = (mlib_d64 *)vis_alignaddr(sp12, 0);

		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;

		vis_alignaddr((void *)off2, 0);
		fu0 = (*dfu++);
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

		vis_alignaddr((void *)off3, 0);
		fv0 = (*dfv++);
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		dy0 = (*spy1++);
		dy4 = (*spy2++);

		dy3 = vis_ld_d64_nf(spy1); spy1++;
		vis_alignaddr(sp11, 0);
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

		dy5 = vis_ld_d64_nf(spy2); spy2++;
		vis_alignaddr(sp12, 0);
		dy2 = vis_faligndata(dy4, dy5);
		dy4 = dy5;

/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* Z*1.1644 */
		z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* Z*1.1644 */
		z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

		temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
		temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

		green1 = vis_fpack16_to_hi(green1, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

		blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
		vis_alignaddr((void *)off2, 0);
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

		red1 = vis_fpack16_to_hi(red1, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
		vis_alignaddr((void *)off3, 0);
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

		green1 = vis_fpack16_to_lo(green1, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

		blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
		red1 = vis_fpack16_to_lo(red1, temp_r_lo);

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

		green2 = vis_fpack16_to_hi(green2, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

		blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

		red2 = vis_fpack16_to_hi(red2, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

		green2 = vis_fpack16_to_lo(green2, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

		blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
		red2 = vis_fpack16_to_lo(red2, temp_r_lo);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/*
 * 16-pixel column loop
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

			vis_write_bmask(0x0801902A, 0);
			dd01 = vis_bshuffle(red1, green1);
			dd02 = vis_bshuffle(red2, green2);
			vis_write_bmask(0x03B04C05, 0);
			dd11 = vis_bshuffle(red1, green1);
			dd12 = vis_bshuffle(red2, green2);
			vis_write_bmask(0xD06E07F0, 0);
			dd21 = vis_bshuffle(red1, green1);
			dd22 = vis_bshuffle(red2, green2);
			vis_write_bmask(0x01834967, 0);
			ddp1[0] = vis_bshuffle(dd01, blue1);
			ddp2[0] = vis_bshuffle(dd02, blue2);
			vis_write_bmask(0xA12B45C7, 0);
			ddp1[1] = vis_bshuffle(dd11, blue1);
			ddp2[1] = vis_bshuffle(dd12, blue2);
			vis_write_bmask(0x0D23E56F, 0);
			ddp1[2] = vis_bshuffle(dd21, blue1);
			ddp2[2] = vis_bshuffle(dd22, blue2);

			dy3 = vis_ld_d64_nf(spy1); spy1++;
			vis_alignaddr(sp11, 0);
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;

			dy5 = vis_ld_d64_nf(spy2); spy2++;
			vis_alignaddr(sp12, 0);
			dy2 = vis_faligndata(dy4, dy5);
			dy4 = dy5;

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* Z*1.1644 */
			z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* Z*1.1644 */
			z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green1 = vis_fpack16_to_hi(green1, temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu); dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			red1 = vis_fpack16_to_hi(red1, temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
			vis_alignaddr((void *)off3, 0);
			fv1 = vis_ld_f32_nf(dfv); dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

			green1 = vis_fpack16_to_lo(green1, temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
			red1 = vis_fpack16_to_lo(red1, temp_r_lo);

/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

			green2 = vis_fpack16_to_hi(green2, temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

			blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

			red2 = vis_fpack16_to_hi(red2, temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

			green2 = vis_fpack16_to_lo(green2, temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

			blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
			red2 = vis_fpack16_to_lo(red2, temp_r_lo);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

			ddp1 += 3;
			ddp2 += 3;
		}

		dp1 = (mlib_u8 *)ddp1;
		dp2 = (mlib_u8 *)ddp2;

		vis_alignaddr((void *)(width - i), 0);
		blue1 = vis_faligndata(blue1, blue1);
		green1 = vis_faligndata(green1, green1);
		red1 = vis_faligndata(red1, red1);
		dp1 += ((width - i - 1) * 3);

		blue2 = vis_faligndata(blue2, blue2);
		green2 = vis_faligndata(green2, green2);
		red2 = vis_faligndata(red2, red2);
		dp2 += ((width - i - 1) * 3);

		vis_alignaddr((void *)7, 0);
		for (; i < width; i++) {
			STORE_PIXEL1(0, 1, 2);
			STORE_PIXEL2(0, 1, 2);
			dp1 -= 3;
			dp2 -= 3;
		}

		sp11 = sl11 = sl11 + y_stride2;
		sp12 = sl12 = sl12 + y_stride2;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;
		__mlib_VectorCopy_U8(dl1, (mlib_u8 *)buf1, width * 3);
		__mlib_VectorCopy_U8(dl2, (mlib_u8 *)buf2, width * 3);

		dl1 = dp1 = dl1 + rgb_stride2;
		dl2 = dp2 = dl2 + rgb_stride2;
		dp1 = (mlib_u8 *)buf1;
		dp2 = (mlib_u8 *)buf2;
		ddp1 = (mlib_d64 *)dp1;
		ddp2 = (mlib_d64 *)dp2;
	}

	if (width * 3 > 16 * 1024) {
		__mlib_free(tmp1);
		__mlib_free(tmp2);
	}
	return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoColorJFIFYCC2RGB420_Nearest
 *
 * Converts two rows of JFIF YCbCr 4:2:0 samples into two rows of
 * interleaved 3-byte pixels in R, G, B byte order.
 *
 *   rgb0, rgb1  destination rows (3 * n bytes are produced for each)
 *   y0, y1      the two luma rows
 *   cb, cr      one chroma row shared by both luma rows; every chroma
 *               byte is duplicated with vis_fpmerge(x, x) so that it
 *               covers two neighbouring pixels ("nearest" upsampling)
 *   n           number of pixels per row
 *
 * Returns MLIB_FAILURE when n <= 0 and MLIB_SUCCESS otherwise.
 *
 * The pointers are cast directly to mlib_d64 * / mlib_f32 * and no
 * realignment is done here, so this presumably relies on the caller
 * passing suitably aligned buffers -- TODO confirm against the callers.
 *
 * The code is software-pipelined: the prologue converts the first 8
 * columns, each main-loop pass stores the previous 8 columns and converts
 * the next 8, and a tail loop stores the remaining (n % 8) columns.
 * Source data is always read one group ahead through the vis_ld_*_nf
 * loads (presumably non-faulting, so the read-ahead is harmless).
 *
 * NOTE: the local names u_3920_*, u_20184_*, v_15966_*, v_8132_*,
 * y_11644_* and z_11644_* are inherited from the YUV variant of this
 * kernel; the factors actually applied here are the JFIF ones held in
 * k12, k34 and k5.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB420_Nearest(
	mlib_u8 *rgb0,
	mlib_u8 *rgb1,
	const mlib_u8 *y0,
	const mlib_u8 *y1,
	const mlib_u8 *cb,
	const mlib_u8 *cr,
	mlib_s32 n)
{
/* byte pointers to the destination rows (used by the tail loop) */
	mlib_u8 *dp1, *dp2;

/* 8-byte pointers walking the two luma rows (y0, y1) */
	mlib_d64 *spy1, *spy2;

/* 4-byte pointers walking the cb and cr rows */
	mlib_f32 *dfu, *dfv;

/* four cb / four cr bytes as loaded */
	mlib_f32 fu, fv;

/* eight luma bytes of row 0 / row 1 */
	mlib_d64 dy1, dy2;
/* cb / cr with every byte duplicated (8 bytes each) */
	mlib_d64 du, dv;

/* (1.00000, 1.40200)*8192 */
	mlib_f32 k12 = vis_to_float(0x20002cdd);

/* (-.34414, -.71414)*8192 */
	mlib_f32 k34 = vis_to_float(0xf4fde926);

/* 1.77200*8192 */
	mlib_f32 k5 = vis_to_float(0x10038b4);

/* (179.45600 - 0.5)*32 */
	mlib_d64 k_179_456 = vis_to_double(0x165f165f, 0x165f165f);

/* (135.45984 + 0.5)*32 */
	mlib_d64 k_135_45984 = vis_to_double(0x10ff10ff, 0x10ff10ff);

/* (226.81600 - 0.5)*32 */
	mlib_d64 k_226_816 = vis_to_double(0x1c4a1c4a, 0x1c4a1c4a);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 z_11644_hi, z_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
/* loop variable */
	mlib_s32 i;
/*
 * red/green/blue: 8 packed bytes of one colour for row 1 / row 2;
 * ddp: 8-byte store pointers; dd0x..dd2x: red+green interleave temporaries
 */
	mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21;
	mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22;

	if (n <= 0)
		return (MLIB_FAILURE);

/*
 * initialize GSR: scale factor 2 (used by the vis_fpack16_* calls),
 * align offset 7
 */
	vis_write_gsr((2 << 3) + 7);

	dp1 = (mlib_u8 *)rgb0;
	dp2 = (mlib_u8 *)rgb1;
	ddp1 = (mlib_d64 *)dp1;
	ddp2 = (mlib_d64 *)dp2;

	spy1 = (mlib_d64 *)y0;
	spy2 = (mlib_d64 *)y1;
	dfu = (mlib_f32 *)cb;
	dfv = (mlib_f32 *)cr;

/* prologue: load the first four cb / cr bytes */
	fu = vis_ld_f32_nf(dfu);
	dfu++;
	fv = vis_ld_f32_nf(dfv);
	dfv++;

/* duplicate every chroma byte so one sample covers two columns */
	du = vis_fpmerge(fu, fu);
	dv = vis_fpmerge(fv, fv);

/* Cb*(-0.34414), columns 0..3 */
	u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* Cr*(-0.71414), columns 0..3 */
	v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* Cb*(-0.34414), columns 4..7 */
	u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* Cr*(-0.71414), columns 4..7 */
	v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

	dy1 = vis_ld_d64_nf(spy1);
	spy1++;
	dy2 = vis_ld_d64_nf(spy2);
	spy2++;

/* Cb*1.77200 (blue chroma term); green chroma term = Cb term + Cr term */
	u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
	g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

	u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
	g_hi = vis_fpadd16(g_hi, k_135_45984);

/* Cr*1.40200 (red chroma term) */
	v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
	g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

	v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
	g_lo = vis_fpadd16(g_lo, k_135_45984);

/* row-0 Y*1.00000, columns 0..3; blue offset applied */
	y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
	b_hi = vis_fpsub16(u_20184_hi, k_226_816);

/* row-0 Y*1.00000, columns 4..7 */
	y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
	b_lo = vis_fpsub16(u_20184_lo, k_226_816);

/* row-1 Y*1.00000, columns 0..3; red offset applied */
	z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
	r_hi = vis_fpsub16(v_15966_hi, k_179_456);

/* row-1 Y*1.00000, columns 4..7 */
	z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
	r_lo = vis_fpsub16(v_15966_lo, k_179_456);

/* row 0: add luma to the per-colour chroma terms and pack to bytes */
	temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
	temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

/*
 * NOTE(review): green1/blue1/red1 (and the row-1 counterparts) are passed
 * in before they have been assigned; presumably vis_fpack16_to_hi only
 * replaces the upper half and the matching _to_lo call below supplies the
 * lower half, so no stale data survives -- confirm against the intrinsics.
 */
	green1 = vis_fpack16_to_hi(green1, temp_g_hi);
	temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

	blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
	temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
/* read ahead: chroma for the next group of 8 columns */
	fu = vis_ld_f32_nf(dfu);
	dfu++;

	red1 = vis_fpack16_to_hi(red1, temp_r_hi);
	temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
	fv = vis_ld_f32_nf(dfv);
	dfv++;

	green1 = vis_fpack16_to_lo(green1, temp_g_lo);
	temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

	blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
	du = vis_fpmerge(fu, fu);

	red1 = vis_fpack16_to_lo(red1, temp_r_lo);
	dv = vis_fpmerge(fv, fv);

/* next group: Cb*(-0.34414); meanwhile row 1 of the current group is summed */
	u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
	temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* next group: Cr*(-0.71414) */
	v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
	temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

	green2 = vis_fpack16_to_hi(green2, temp_g_hi);
	temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

	blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
	temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

	red2 = vis_fpack16_to_hi(red2, temp_r_hi);
	temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

	green2 = vis_fpack16_to_lo(green2, temp_g_lo);
	temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

	blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
	red2 = vis_fpack16_to_lo(red2, temp_r_lo);

	u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
	v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* read ahead: luma for the next group of 8 columns */
	dy1 = vis_ld_d64_nf(spy1);
	spy1++;
	dy2 = vis_ld_d64_nf(spy2);
	spy2++;

/*
 * Main loop: every pass stores the 8 columns already held in
 * red/green/blue (8 pixels in each of the two rows, 16 pixels in all)
 * and converts the following 8 columns.
 */
#pragma pipeloop(0)
	for (i = 0; i <= n - 8; i += 8) {

/*
 * Interleave the packed colour bytes into 24 bytes per row: the first
 * three shuffles merge red with green (leaving a placeholder byte per
 * pixel), the last three drop blue into the placeholders, giving
 * R,G,B,R,G,B,... across ddp[0..2].
 */
		vis_write_bmask(0x0801902A, 0);
		dd01 = vis_bshuffle(red1, green1);
		dd02 = vis_bshuffle(red2, green2);
		vis_write_bmask(0x03B04C05, 0);
		dd11 = vis_bshuffle(red1, green1);
		dd12 = vis_bshuffle(red2, green2);
		vis_write_bmask(0xD06E07F0, 0);
		dd21 = vis_bshuffle(red1, green1);
		dd22 = vis_bshuffle(red2, green2);
		vis_write_bmask(0x01834967, 0);
		ddp1[0] = vis_bshuffle(dd01, blue1);
		ddp2[0] = vis_bshuffle(dd02, blue2);
		vis_write_bmask(0xA12B45C7, 0);
		ddp1[1] = vis_bshuffle(dd11, blue1);
		ddp2[1] = vis_bshuffle(dd12, blue2);
		vis_write_bmask(0x0D23E56F, 0);
		ddp1[2] = vis_bshuffle(dd21, blue1);
		ddp2[2] = vis_bshuffle(dd22, blue2);

/* Cb*1.77200 (blue chroma term); green chroma term = Cb term + Cr term */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_45984);

/* Cr*1.40200 (red chroma term) */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_45984);

/* row-0 Y*1.00000, columns 0..3 of this group */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_226_816);

/* row-0 Y*1.00000, columns 4..7 of this group */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_226_816);

/* row-1 Y*1.00000, columns 0..3 of this group */
		z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
		r_hi = vis_fpsub16(v_15966_hi, k_179_456);

/* row-1 Y*1.00000, columns 4..7 of this group */
		z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
		r_lo = vis_fpsub16(v_15966_lo, k_179_456);

/* row 0: add luma and pack to bytes */
		temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
		temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

		green1 = vis_fpack16_to_hi(green1, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

		blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
/* read ahead: chroma for the following group */
		fu = vis_ld_f32_nf(dfu);
		dfu++;

		red1 = vis_fpack16_to_hi(red1, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
		fv = vis_ld_f32_nf(dfv);
		dfv++;

		green1 = vis_fpack16_to_lo(green1, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

		blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
		du = vis_fpmerge(fu, fu);

		red1 = vis_fpack16_to_lo(red1, temp_r_lo);
		dv = vis_fpmerge(fv, fv);

/* following group: Cb*(-0.34414); row 1 of the current group is summed */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* following group: Cr*(-0.71414) */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

		green2 = vis_fpack16_to_hi(green2, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

		blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

		red2 = vis_fpack16_to_hi(red2, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

		green2 = vis_fpack16_to_lo(green2, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

		blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
		red2 = vis_fpack16_to_lo(red2, temp_r_lo);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* read ahead: luma for the following group */
		dy1 = vis_ld_d64_nf(spy1);
		spy1++;
		dy2 = vis_ld_d64_nf(spy2);
		spy2++;

/* 3 * 8 bytes = 8 RGB pixels were stored per row */
		ddp1 += 3;
		ddp2 += 3;
	}

/*
 * Tail: red/green/blue still hold the converted bytes of the last,
 * partial group (n - i < 8 columns).
 */
	dp1 = (mlib_u8 *)ddp1;
	dp2 = (mlib_u8 *)ddp2;

/*
 * Rotate each colour vector by (n - i) bytes and point dp1/dp2 at the
 * last remaining pixel; the loop below then walks backwards, one pixel
 * (3 bytes) at a time.
 */
	vis_alignaddr((void *)(n - i), 0);
	blue1 = vis_faligndata(blue1, blue1);
	green1 = vis_faligndata(green1, green1);
	red1 = vis_faligndata(red1, red1);
	dp1 += ((n - i - 1) * 3);

	blue2 = vis_faligndata(blue2, blue2);
	green2 = vis_faligndata(green2, green2);
	red2 = vis_faligndata(red2, red2);
	dp2 += ((n - i - 1) * 3);

/*
 * STORE_PIXEL1 / STORE_PIXEL2 are defined outside this function;
 * presumably each stores one byte of red/green/blue at the given offsets
 * from dp1/dp2 and rotates the vectors using the align offset 7 set
 * here -- TODO confirm against the macro definitions.
 */
	vis_alignaddr((void *)7, 0);
	for (; i < n; i++) {
		STORE_PIXEL1(0, 1, 2);
		STORE_PIXEL2(0, 1, 2);
		dp1 -= 3;
		dp2 -= 3;
	}

	return (MLIB_SUCCESS);
}
/*
 * Example no. 9
 * 0
 */
/*
 * mlib_v_VideoYUV2ABGR_aarray_411
 *
 * Converts one run of YUV 4:1:1 samples plus a per-pixel alpha array into
 * 4-byte pixels laid out as A, B, G, R (byte order follows from the
 * vis_fpmerge sequence below).
 *
 *   abgr     destination; may or may not be 8-byte aligned (both cases
 *            are handled)
 *   y        luma, read 16 bytes (two mlib_d64) per group
 *   u, v     chroma, read 4 bytes (one mlib_f32) each per group, i.e.
 *            one chroma sample for every four luma samples
 *   a_array  alpha bytes, one per pixel; may be unaligned (realigned
 *            with vis_alignaddr / vis_faligndata)
 *   count    number of complete 16-pixel groups to convert
 *   left     number of extra pixels to emit after the complete groups;
 *            they are taken from one more converted group held in a
 *            16-pixel scratch buffer, so this assumes left <= 16
 *            -- TODO confirm with the caller
 *   isrgb    when non-zero the blue and red coefficient/offset sets are
 *            exchanged.  NOTE(review): the u and v inputs themselves are
 *            not exchanged here, so presumably the caller also swaps the
 *            u and v pointers in that case -- confirm.
 *
 * The GSR scale used by vis_fpack16_pair is not written in this function;
 * presumably it is set up by the caller -- TODO confirm.
 *
 * The loops are software-pipelined: inputs for group i + 1 are loaded
 * (with the vis_ld_*_nf loads) while group i is being stored.
 */
static void
mlib_v_VideoYUV2ABGR_aarray_411(
	mlib_u32 *abgr,
	const mlib_d64 *y,
	const mlib_f32 *u,
	const mlib_f32 *v,
	const mlib_d64 *a_array,
	mlib_s32 count,
	mlib_s32 left,
	mlib_s32 isrgb)
{
/* 8-byte view of the destination */
	mlib_d64 *dpp = (mlib_d64 *)abgr;

/* u, v data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy1, dy2;
/* scaled luma, four pixels per vector */
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
/* scaled chroma products: du0 = u*f1, du1 = u*f4, dv1 = v*f5, dv2 = v*f8 */
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
/* alpha: aligned read pointer, realigned data (da0, da1), raw words (da2..da4) */
	mlib_d64 *dpa, da0, da1, da2, da3, da4;
	mlib_d64 dtmp;

/* 1.1644  * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);

/* 2.0184  * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);

/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);

/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);

/* 1.5966  * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);

/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

/* 135.6352  * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
/* four bytes of 0x80: used to replicate one chroma term across 4 pixels */
	mlib_f32 fscale = vis_to_float(0x80808080);

/* loop variables */
	mlib_s32 i;

/*
 * Exchange the coefficient pairs (f1 <-> f8, f4 <-> f5) and the offsets
 * (doff0 <-> doff2) relative to the defaults above; f0 and doff1 keep
 * their values.
 */
	if (isrgb) {
		f0 = vis_to_float(0x12a1);
		f1 = vis_to_float(0x3317);
		f4 = vis_to_float(0xe5fa);
		f5 = vis_to_float(0xf375);
		f8 = vis_to_float(0x4097);
		doff0 = vis_to_double_dup(0xe420e420);
		doff1 = vis_to_double_dup(0x10f410f4);
		doff2 = vis_to_double_dup(0xdd60dd60);
	}

/* set the GSR align offset from a_array and get its aligned base */
	dpa = vis_alignaddr((void *)a_array, 0);

/* prologue: load the inputs of the first group */
	dy1 = (*y++);
	dy2 = vis_ld_d64_nf((mlib_d64 *)y); y++;
	fu = (*u++);
	fv = (*v++);
	da2 = (*dpa++);
	da3 = vis_ld_d64_nf(dpa); dpa++;
	da4 = vis_ld_d64_nf(dpa); dpa++;

	du0 = vis_fmul8x16al(fu, f1);
	du1 = vis_fmul8x16al(fu, f4);
	dv1 = vis_fmul8x16al(fv, f5);
	dv2 = vis_fmul8x16al(fv, f8);

/* destination 8-byte aligned: store whole mlib_d64 words */
	if (!((mlib_addr)abgr & 7)) {
#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
/* 16 realigned alpha bytes for this group */
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

/* luma * f0, four pixels per vector */
			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

/* per-chroma-sample colour terms (four 16-bit values each) */
			db = vis_fpadd16(du0, doff0);

			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dr = vis_fpadd16(dv2, doff2);

/*
 * Spread each of the four chroma terms over its four pixels (fscale
 * times one 16-bit element selected by au/al from the hi/lo half) and
 * add the matching luma vector.
 */
			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

/* pack to bytes: dr/dg/db = pixels 0..7, dr1/dg1/db1 = pixels 8..15 */
			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

/* interleave A with G and B with R; merged again at store time */
			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

/* read ahead: inputs of group i + 1 (y, u, v, dpa were advanced above) */
			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

/* pixels 0..3 */
			dpp[8 * i] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 1] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* pixels 4..7 */
			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dpp[8 * i + 2] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 3] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* pixels 8..11 */
			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dpp[8 * i + 4] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 5] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* pixels 12..15 */
			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dpp[8 * i + 6] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 7] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* chroma products for group i + 1 */
			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	} else {
/*
 * destination not 8-byte aligned: same computation as the loop above,
 * but every result is written as two 4-byte (mlib_f32) stores
 */
		mlib_d64 dd;

#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

			db = vis_fpadd16(du0, doff0);

			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dr = vis_fpadd16(dv2, doff2);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

/* read ahead: inputs of group i + 1 */
			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

/* pixels 0..3, one mlib_f32 store per pixel */
			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 1] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 2] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 3] = vis_read_lo(dd);

/* pixels 4..7 */
			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 4] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 5] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 6] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 7] = vis_read_lo(dd);

/* pixels 8..11 */
			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 8] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 9] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 10] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 11] = vis_read_lo(dd);

/* pixels 12..15 */
			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 12] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 13] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 14] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 15] = vis_read_lo(dd);

/* chroma products for group i + 1 */
			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	}

/*
 * Tail: convert one more group from the inputs already loaded, into a
 * scratch buffer, then copy only the first `left` pixels to the output.
 */
	if (left) {
/* 8 * 8 bytes = 16 four-byte pixels */
		mlib_d64 res_buf[8];

		da0 = vis_faligndata(da2, da3);
		da1 = vis_faligndata(da3, da4);

		ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
		ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

		ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
		ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

		db = vis_fpadd16(du0, doff0);

		dtmp = vis_fpadd16(du1, dv1);
		dg = vis_fpadd16(dtmp, doff1);

		dr = vis_fpadd16(dv2, doff2);

		db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
		db1 = vis_fpadd16(ddy1, db1);

		db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
		db2 = vis_fpadd16(ddy2, db2);

		db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
		db3 = vis_fpadd16(ddy3, db3);

		db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
		db4 = vis_fpadd16(ddy4, db4);

		dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
		dg1 = vis_fpadd16(ddy1, dg1);

		dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
		dg2 = vis_fpadd16(ddy2, dg2);

		dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
		dg3 = vis_fpadd16(ddy3, dg3);

		dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
		dg4 = vis_fpadd16(ddy4, dg4);

		dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
		dr1 = vis_fpadd16(ddy1, dr1);

		dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
		dr2 = vis_fpadd16(ddy2, dr2);

		dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
		dr3 = vis_fpadd16(ddy3, dr3);

		dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
		dr4 = vis_fpadd16(ddy4, dr4);

		dr = vis_fpack16_pair(dr1, dr2);
		dr1 = vis_fpack16_pair(dr3, dr4);

		dg = vis_fpack16_pair(dg1, dg2);
		dg1 = vis_fpack16_pair(dg3, dg4);

		db = vis_fpack16_pair(db1, db2);
		db1 = vis_fpack16_pair(db3, db4);

		dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
		dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

		res_buf[0] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[1] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
		dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

		res_buf[2] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[3] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
		dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

		res_buf[4] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[5] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
		dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

		res_buf[6] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[7] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* copy `left` 4-byte pixels, placed right after the complete groups */
		for (i = 0; i < left; i++)
			((mlib_f32 *)dpp)[16 * count + i] =
				((mlib_f32 *)res_buf)[i];
	}
}
/*
 * Example no. 10
 * 0
 */
mlib_status mlib_convMxN_8nw_mask(mlib_image       *dst,
                                  const mlib_image *src,
                                  mlib_s32         m,
                                  mlib_s32         n,
                                  mlib_s32         dm,
                                  mlib_s32         dn,
                                  const mlib_s32   *kern,
                                  mlib_s32         scale,
                                  mlib_s32         cmask)
{
  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
  mlib_d64 dd, d0, d1;
  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
  mlib_u8 *sl, *sp, *dl;
  mlib_s32 hgt = mlib_ImageGetHeight(src);
  mlib_s32 wid = mlib_ImageGetWidth(src);
  mlib_s32 sll = mlib_ImageGetStride(src);
  mlib_s32 dll = mlib_ImageGetStride(dst);
  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
  mlib_s32 ssize, xsize, dsize, esize, buff_ind;
  mlib_d64 *pbuff, *dp;
  mlib_f32 *karr = (mlib_f32 *) kern;
  mlib_s32 gsr_scale = (31 - scale) << 3;
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
  mlib_s32 i, j, l, chan, testchan;
  mlib_s32 nchan = mlib_ImageGetChannels(dst);
  void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
  void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);

  if (n > MAX_N) {
    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));

    if (buffs == NULL)
      return MLIB_FAILURE;
  }

  buff = buffs + 2 * (n + 1);

  adr_dst += dn * dll + dm * nchan;

  ssize = wid;
  dsize = (ssize + 7) / 8;
  esize = dsize + 4;
  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));

  if (pbuff == NULL) {
    if (buffs != buffs_local)
      mlib_free(buffs);
    return MLIB_FAILURE;
  }

  for (i = 0; i < (n + 1); i++)
    buffs[i] = pbuff + i * esize;
  for (i = 0; i < (n + 1); i++)
    buffs[(n + 1) + i] = buffs[i];
  buffd = buffs[n] + esize;
  buffe = buffd + 2 * esize;

  hgt -= (n - 1);
  xsize = ssize - (m - 1);

  vis_write_gsr(gsr_scale + 7);

  if (nchan == 2) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1;
  }
  else if (nchan == 3) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1;
  }
  else {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1;
  }

  testchan = 1;
  for (chan = 0; chan < nchan; chan++) {
    buff_ind = 0;
    sl = adr_src;
    dl = adr_dst;

    if ((cmask & testchan) == 0) {
      testchan <<= 1;
      continue;
    }

    for (l = 0; l < n; l++) {
      mlib_d64 *buffn = buffs[l];
      sp = sl + l * sll;

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);
    }

    /* init buffer */
#pragma pipeloop(0)
    for (i = 0; i < (xsize + 7) / 8; i++) {
      buffd[2 * i] = drnd;
      buffd[2 * i + 1] = drnd;
    }

    for (j = 0; j < hgt; j++) {
      mlib_d64 **buffc = buffs + buff_ind;
      mlib_f32 *pk = karr, k0, k1, k2, k3;
      sp = sl + n * sll;

      for (l = 0; l < n; l++) {
        buff[l] = buffc[l];
      }

      buffn = buffc[n];

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);

      ik_last = (m - 1);

      for (jk = 0; jk < n; jk += jk_size) {
        jk_size = n - jk;

        if (jk_size >= 6)
          jk_size = 4;

        if (jk_size == 5)
          jk_size = 3;

        coff = 0;

        if (jk_size == 1) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];

            doff = coff / 8;
            buff0 = buff[jk] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s01 = buff0[i + 1];
              s0 = vis_faligndata(s00, s01);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d1 = vis_fpadd16(d01, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += m;
        }
        else if (jk_size == 2) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
            s11 = buff1[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s10 = s11;
              s01 = buff0[i + 1];
              s11 = buff1[i + 1];
              s0 = vis_faligndata(s00, s01);
              s1 = vis_faligndata(s10, s11);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d0 = vis_fpadd16(d10, d0);
              d1 = vis_fpadd16(d01, d1);
              d1 = vis_fpadd16(d11, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += 2 * m;
        }
        else if (jk_size == 3) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 3 * m;
        }
        else {                              /* jk_size == 4 */

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];
            k3 = pk[ik + 3 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;
            buff3 = buff[jk + 3] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {

#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];
                s3 = buff3[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
                d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
                d31 = vis_fmul8x16au(vis_read_hi(s31), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);
                s3 = vis_faligndata(s30, s31);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 4 * m;
        }
      }

      /*****************************************
       *****************************************
       **          Final iteration            **
       *****************************************
       *****************************************/

      jk_size = n;

      if (jk_size >= 6)
        jk_size = 4;

      if (jk_size == 5)
        jk_size = 3;

      k0 = karr[ik_last];
      k1 = karr[ik_last + m];
      k2 = karr[ik_last + 2 * m];
      k3 = karr[ik_last + 3 * m];

      off = ik_last;
      doff = off / 8;
      off &= 7;
      buff0 = buff[0] + doff;
      buff1 = buff[1] + doff;
      buff2 = buff[2] + doff;
      buff3 = buff[3] + doff;
      vis_write_gsr(gsr_scale + off);

      if (jk_size == 1) {
        dp = buffe;

        s01 = buff0[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s01 = buff0[i + 1];
          s0 = vis_faligndata(s00, s01);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d1 = vis_fpadd16(d1, d01);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 2) {
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 3) {

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else {                                /* if (jk_size == 4) */

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
        s31 = buff3[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s30 = s31;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s31 = buff3[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);
          s3 = vis_faligndata(s30, s31);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
          d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
          d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d0 = vis_fpadd16(d0, d30);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);
          d1 = vis_fpadd16(d1, d31);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }

      (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan);

      sl += sll;
      dl += dll;

      buff_ind++;

      if (buff_ind >= (n + 1))
        buff_ind = 0;
    }

    testchan <<= 1;
  }

  mlib_free(pbuff);

  if (buffs != buffs_local)
    mlib_free(buffs);

  return MLIB_SUCCESS;
}
Ejemplo n.º 11
0
/*
 * Convert planar Y/U/V rows to interleaved 4-byte ABGR pixels using VIS
 * partitioned arithmetic, for source and destination pointers that are
 * not necessarily 8-byte aligned.
 *
 *   abgr         destination; 4 bytes per pixel, rows abgr_stride bytes apart
 *   y, u, v      source planes; rows y_stride / uv_stride bytes apart
 *   width        pixels per row; each 8 Y bytes are paired with 4 U and
 *                4 V bytes (every chroma byte is duplicated, i.e. one
 *                chroma sample per two pixels)
 *   height       number of rows
 *
 * All stores go through vis_pst_8 with a mask derived from 0x7777, so the
 * alpha byte of each pixel is never written.
 * The tail code after the main loop advances in steps of two pixels;
 * NOTE(review): this presumably relies on width being even - confirm
 * against the public entry point.
 *
 * Always returns MLIB_SUCCESS.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR422_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* current / row-start pointers into the U (sp2, sl2) and V (sp3, sl3) planes */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* current / row-start pointers into the Y plane */
	mlib_u8 *sp1, *sl1;

/* destination: current pointer, row start, last byte of the current row */
	mlib_u8 *dp, *dl, *dend;

/* aligned pointer to y */
	mlib_d64 *spy;

/* aligned pointer to dst */
	mlib_d64 *dpp;

/* u, v data (4 chroma bytes per load) */
	mlib_f32 fu0, fu1, fv0, fv1;

/* y data: dy0 = previous aligned word, dy3 = newest, dy1 = realigned 8 Y bytes */
	mlib_d64 dy0, dy1, dy3;
/* realigned, byte-duplicated chroma for the current 8 pixels */
	mlib_d64 du, dv;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* upper half 0x0100 (used to widen packed green to 16 bits), lower half 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
/* colour offsets scaled by 32: 222.9952, 135.6352 and 276.9856 */
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
	mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo;
	mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo;
/* dd0 / dd1 alternate as "previous" and "current" 2-pixel output words */
	mlib_d64 dd, dd0, dd1;

/* loop variable */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
/* mask for the first store of a row, and later for the last one */
	mlib_s32 emask1;
/* 8 - (destination misalignment); fed to vis_alignaddr before output shifts */
	mlib_s32 off;
	mlib_f32 *dfu, *dfv;
	mlib_d64 du0, du1, dv0, dv1;
/* U / V misalignment in bytes, doubled because chroma bytes are duplicated */
	mlib_s32 off2, off3;
/* how far dpp advances after the first store of the row (0 or 1) */
	mlib_s32 inc;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
/* round the Y and destination pointers down to 8 bytes, U/V down to 4 */
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;

/* last destination byte of this row and the edge mask of its first word */
		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);
/* i = destination misalignment; shift the alpha-skip mask to match it */
		i = dp - (mlib_u8 *)dpp;
		emask >>= i;
/*
 * inc is 0 only when the first edge mask is full (0xff).
 * NOTE(review): in that case the first store looks like a placeholder
 * that the second store rewrites at the same address - confirm.
 */
		inc = (emask1 != 0xff);
		emask1 &= emask;
		off = 8 - i;

/* prime the U pipeline: two loads, duplicate each byte, realign by off2 */
		vis_alignaddr((void *)off2, 0);
		fu0 = vis_ld_f32_nf(dfu); dfu++;
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* prime the V pipeline the same way, realigning by off3 */
		vis_alignaddr((void *)off3, 0);
		fv0 = vis_ld_f32_nf(dfv); dfv++;
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* prime the Y pipeline: two aligned loads realigned to sp1's offset */
		vis_alignaddr(sp1, 0);
		dy0 = vis_ld_d64_nf(spy); spy++;
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/*
 * 8-pixel column loop: each pass converts the 8 Y bytes in dy1 and
 * issues four 8-byte (2-pixel) masked stores, while loading and
 * pre-multiplying the chroma and Y data for the next pass.
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

/* add luma to each chroma term, then pack the 16-bit sums down to bytes */
			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

/* fetch the next 4 U bytes (GSR align must be off2 for this faligndata) */
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu); dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);
/* widen green back to 16 bits per sample (multiplier is k5's upper half) */
			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
/* interleave blue and red bytes */
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

/* fetch the next 4 V bytes (GSR align must be off3 for this faligndata) */
			vis_alignaddr((void *)off3, 0);

			fv1 = vis_ld_f32_nf(dfv); dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* switch GSR align to the destination shift for the four stores below */
			vis_alignaddr((void *)off, 0);
/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/*
 * Merge widened green with blue/red into one 2-pixel word and store it
 * shifted against the previous word.
 * NOTE(review): on the first pass of a row dd0 has not been assigned
 * yet; the bytes taken from it appear to be masked out by emask1 or
 * rewritten by the next store - confirm.
 */
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_hi),
				vis_read_lo(blue_red_hi));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

/* U*(-0.3920); */
			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_lo),
				vis_read_hi(blue_red_lo));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);

/* V*(-0.8132); */
			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_lo),
				vis_read_lo(blue_red_lo));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

/* fetch the next 8 Y bytes (GSR align back to sp1's offset) */
			vis_alignaddr(sp1, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;
/* only the first store of a row needs the edge mask */
			emask1 = emask;
		}

/*
 * Row tail (fewer than 8 pixels left): same arithmetic as the loop body
 * without the look-ahead loads, followed by up to three 2-pixel stores.
 */
		if (i < width) {

			vis_alignaddr((void *)off, 0);
/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);

			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

/* pixels 0-1 of the tail; emask1 / inc still apply if the loop never ran */
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dd0 = dd1;
			dpp += inc;

			i += 2;

			if (i < width) {

/* pixels 2-3 of the tail */
				dd1 = vis_fpmerge(vis_read_lo(x_green_hi),
					vis_read_lo(blue_red_hi));
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 2;

				if (i < width) {
/* pixels 4-5 of the tail */
					dd1 = vis_fpmerge(vis_read_hi
						(x_green_lo),
						vis_read_hi(blue_red_lo));
					dd = vis_faligndata(dd0, dd1);
					vis_pst_8(dd, dpp++, emask);
					dd0 = dd1;
				}
			}
		}

/*
 * Flush the bytes still held in dd0: the store is limited to the end of
 * the row by vis_edge8(dpp, dend) and still skips the alpha bytes.
 */
		vis_alignaddr((void *)off, 0);
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

/* advance every plane to its next row */
		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
/* undo the per-row shift applied to the alpha-skip mask */
		emask = 0x7777;
	}

	return (MLIB_SUCCESS);
}
Ejemplo n.º 12
0
/*
 * Upsample n source samples to two destination lines of 2 * n bytes each.
 *
 *   dst0, dst1   output lines; dst0 is built from src0 and src1,
 *                dst1 from src2 and src1
 *   src0..src2   three input lines of n samples; src1 carries weight 3
 *                in the vertical sum, src0 / src2 carry weight 1
 *   n            number of source samples per line
 *
 * The work is done in three steps:
 *   1. a vector pass that forms the vertical sums and blends each column
 *      with its right-hand neighbour using weights 3:1 and 1:3, writing
 *      the results starting at byte 0 of the destination;
 *   2. an in-place pass that shifts both destination lines by one byte
 *      (vis_faligndata with GSR align = 7);
 *   3. scalar code that writes the first and last byte of each line.
 *
 * All five pointers are cast to mlib_d64 * and dereferenced directly;
 * NOTE(review): this presumably requires 8-byte-aligned buffers, and the
 * same edge mask (computed from dst0) is applied to dst1 - confirm the
 * caller guarantees both.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoUpSample420(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src0,
	const mlib_u8 *src1,
	const mlib_u8 *src2,
	mlib_s32 n)
{
	/* last byte of the dst0 line; used to build edge masks for partial stores */
	mlib_u8 *dend0 = dst0 + 2 * n - 1;
	mlib_d64 *dp0 = (mlib_d64 *)dst0;
	mlib_d64 *dp1 = (mlib_d64 *)dst1;
	mlib_d64 *sp0 = (mlib_d64 *)src0;
	mlib_d64 *sp1 = (mlib_d64 *)src1;
	mlib_d64 *sp2 = (mlib_d64 *)src2;
	mlib_d64 d00, d01, d10, d11, d20, d21;
	/* vertical sums: "last" = current 8 columns, "this" = next 8, "shift" = current moved one column */
	mlib_d64 thiscolsum0_hi, thiscolsum0_lo, lastcolsum0_hi, lastcolsum0_lo;
	mlib_d64 shiftcolsum0_hi, shiftcolsum0_lo;
	mlib_d64 thiscolsum1_hi, thiscolsum1_lo, lastcolsum1_hi, lastcolsum1_lo;
	mlib_d64 shiftcolsum1_hi, shiftcolsum1_lo;
	mlib_d64 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
	mlib_d64 ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7;
	mlib_d64 data0, data1, data2, data3, tmp0, tmp1;
	/* 16-bit multipliers 0x0400 / 0x0C00 for vis_fmul8x16au (vertical weights 1 and 3) */
	mlib_f32 fone = vis_to_float(0x4000000);
	mlib_f32 fthree = vis_to_float(0xC000000);
	/* 8-bit multipliers 0x40 / 0xC0 for vis_fmul8x16 (horizontal weights 1 and 3) */
	mlib_f32 fone1 = vis_to_float(0x40404040);
	mlib_f32 fthree1 = vis_to_float(0xC0C0C0C0);
	/* rounding terms 7 and 8, matching the "+ 7" / "+ 8" of the scalar code at the end */
	mlib_d64 dseven = vis_to_double_dup(0x70007);
	mlib_d64 deight = vis_to_double_dup(0x80008);
	mlib_s32 i, emask;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* GSR scale = 3 for vis_fpack16_pair, align = 2 so vis_faligndata shifts by one 16-bit column */
	vis_write_gsr((3 << 3) + 2);

	/* prime the pipeline: vertical sums for the first 8 columns */
	d00 = vis_ld_d64_nf(sp0);
	d10 = vis_ld_d64_nf(sp1);
	d20 = vis_ld_d64_nf(sp2);
	sp0++;
	sp1++;
	sp2++;
	lastcolsum0_hi = vis_fmul8x16au(vis_read_hi(d00), fone);
	lastcolsum0_lo = vis_fmul8x16au(vis_read_lo(d00), fone);
	lastcolsum1_hi = vis_fmul8x16au(vis_read_hi(d20), fone);
	lastcolsum1_lo = vis_fmul8x16au(vis_read_lo(d20), fone);
	tmp0 = vis_fmul8x16au(vis_read_hi(d10), fthree);
	tmp1 = vis_fmul8x16au(vis_read_lo(d10), fthree);
	lastcolsum0_hi = vis_fpadd16(lastcolsum0_hi, tmp0);
	lastcolsum0_lo = vis_fpadd16(lastcolsum0_lo, tmp1);
	lastcolsum1_hi = vis_fpadd16(lastcolsum1_hi, tmp0);
	lastcolsum1_lo = vis_fpadd16(lastcolsum1_lo, tmp1);

	/* main pass: 8 source columns in, 16 destination bytes out per line */
#pragma pipeloop(0)
	for (i = 0; i < n - 8; i += 8) {
		d01 = *sp0;
		d11 = *sp1;
		d21 = *sp2;
		sp0++;
		sp1++;
		sp2++;

		/* vertical sums for the next 8 columns */
		thiscolsum0_hi = vis_fmul8x16au(vis_read_hi(d01), fone);
		thiscolsum0_lo = vis_fmul8x16au(vis_read_lo(d01), fone);
		thiscolsum1_hi = vis_fmul8x16au(vis_read_hi(d21), fone);
		thiscolsum1_lo = vis_fmul8x16au(vis_read_lo(d21), fone);

		tmp0 = vis_fmul8x16au(vis_read_hi(d11), fthree);
		tmp1 = vis_fmul8x16au(vis_read_lo(d11), fthree);

		thiscolsum0_hi = vis_fpadd16(thiscolsum0_hi, tmp0);
		thiscolsum0_lo = vis_fpadd16(thiscolsum0_lo, tmp1);
		thiscolsum1_hi = vis_fpadd16(thiscolsum1_hi, tmp0);
		thiscolsum1_lo = vis_fpadd16(thiscolsum1_lo, tmp1);

		/* current columns weighted 1 (acc0/1/4/5) and 3 (acc2/3/6/7) */
		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

		/* right-hand neighbours: current sums moved one column, pulling in the next group */
		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, thiscolsum0_hi);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, thiscolsum1_hi);

		/* rounding: 8 on the 1:3 blends, 7 on the 3:1 blends */
		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

		/* neighbour columns with the complementary weight */
		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

		/* pack the 16-bit sums to bytes using the GSR scale factor */
		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

		/* interleave the 3:1 results (data1/data3) with the 1:3 results (data0/data2) */
		dp0[0] = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		dp0[1] = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0));
		dp1[0] = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));
		dp1[1] = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2));

		dp0 += 2;
		dp1 += 2;
		lastcolsum0_hi = thiscolsum0_hi;
		lastcolsum0_lo = thiscolsum0_lo;
		lastcolsum1_hi = thiscolsum1_hi;
		lastcolsum1_lo = thiscolsum1_lo;
	}

	/*
	 * Last group of up to 8 columns: no further source data is loaded, so
	 * the neighbour of the final column is taken from lastcolsum*_lo itself,
	 * and the stores are edge-masked against dend0.
	 */
	if (i < n) {

		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, lastcolsum0_lo);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, lastcolsum1_lo);

		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

		/* first 8 output bytes of the group (source columns i .. i + 3) */
		acc0 = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		acc1 = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));

		emask = vis_edge8(dp0, dend0);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
		i += 4;
		dp0++;
		dp1++;

		/* second 8 output bytes, only if more than 4 columns remained */
		if (i < n) {
			acc0 = vis_fpmerge(vis_read_lo(data1),
				vis_read_lo(data0));
			acc1 = vis_fpmerge(vis_read_lo(data3),
				vis_read_lo(data2));

			emask = vis_edge8(dp0, dend0);
			vis_pst_8(acc0, dp0, emask);
			vis_pst_8(acc1, dp1, emask);
		}
	}

	/*
	 * Second pass, in place: with GSR align = 7, vis_faligndata(prev, cur)
	 * yields the last byte of prev followed by the first 7 bytes of cur,
	 * i.e. both destination lines move one byte towards higher addresses.
	 */
	vis_write_gsr(7);

	dp0 = (mlib_d64 *)dst0;
	dp1 = (mlib_d64 *)dst1;

	/* the first word acts as its own predecessor; its byte 0 is rewritten by the scalar code below */
	ac0 = *dp0;
	ac2 = *dp1;

#pragma pipeloop(0)
	for (i = 0; i < 2 * n - 8; i += 8) {
		ac1 = *dp0;
		ac3 = *dp1;
		*dp0 = vis_faligndata(ac0, ac1);
		*dp1 = vis_faligndata(ac2, ac3);
		dp0++;
		dp1++;
		ac0 = ac1;
		ac2 = ac3;
	}

	/* final, possibly partial, word of each line: edge-masked store */
	if (i < 2 * n) {
		ac1 = vis_ld_d64_nf(dp0);
		ac3 = vis_ld_d64_nf(dp1);
		emask = vis_edge8(dp0, dend0);
		acc0 = vis_faligndata(ac0, ac1);
		acc1 = vis_faligndata(ac2, ac3);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
	}

	/* boundary bytes have no horizontal neighbour: use the vertical sum alone */
	dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4;
	dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4;
	dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4;
	dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4;

	return (MLIB_SUCCESS);
}
Ejemplo n.º 13
0
static mlib_status
mlib_v_VideoColorYUV2ABGR411_dst_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to src address */
	mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;

/* pointers to dst address */
	mlib_u8 *dp, *dl, *dend;

/* all. pointer to y */
	mlib_d64 *spy;

/* all. pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy0, dy1, dy2;
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
	mlib_d64 dd, dd0, dd1, dtmp;

/* used to load u, v into mlib_f32 */
	mlib_f32 ffu[1], ffv[1];

/* used to load u, v into mlib_f32 */
	mlib_u8 *ufu, *vfu;

/* 1.1644  * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);

/* 2.0184  * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);

/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);

/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);

/* 1.5966  * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);

/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

/* 135.6352  * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
	mlib_f32 fscale = vis_to_float(0x80808080);

/* loop variables */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_d64 *buf;
	mlib_s32 inc;

	ufu = (mlib_u8 *)ffu;
	vfu = (mlib_u8 *)ffv;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(3 << 3);

	buf = (mlib_d64 *)__mlib_malloc((width / 8 + 1) * sizeof (mlib_d64));

	if (buf == NULL)
		return (MLIB_FAILURE);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = buf;
		dy0 = vis_ld_d64_nf(spy); spy++;

#pragma pipeloop(0)
		for (i = 0; i < width; i += 8) {
			dy1 = vis_ld_d64_nf(spy); spy++;
			(*dpp++) = vis_faligndata(dy0, dy1);
			dy0 = dy1;
		}

		spy = buf;

		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);

		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
		i = dp - (mlib_u8 *)dpp;
		emask >>= i;
		vis_alignaddr((void *)(8 - i), 0);
		inc = (emask1 != 0xff);
		emask1 &= emask;

		ufu[0] = vis_ld_u8_nf(sp2);
		ufu[1] = vis_ld_u8_nf(sp2 + 1);
		ufu[2] = vis_ld_u8_nf(sp2 + 2);
		ufu[3] = vis_ld_u8_nf(sp2 + 3);
		vfu[0] = vis_ld_u8_nf(sp3);
		vfu[1] = vis_ld_u8_nf(sp3 + 1);
		vfu[2] = vis_ld_u8_nf(sp3 + 2);
		vfu[3] = vis_ld_u8_nf(sp3 + 3);
		sp2 += 4;
		sp3 += 4;

		fu = ffu[0];
		fv = ffv[0];

/*
 * 16-pixel column loop
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 16; i += 16) {

			dy1 = (*spy++);
			dy2 = (*spy++);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ufu[0] = vis_ld_u8_nf(sp2);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);
			ufu[1] = vis_ld_u8_nf(sp2 + 1);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			ufu[2] = vis_ld_u8_nf(sp2 + 2);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);
			ufu[3] = vis_ld_u8_nf(sp2 + 3);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);
			vfu[0] = vis_ld_u8_nf(sp3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);
			vfu[1] = vis_ld_u8_nf(sp3 + 1);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);
			vfu[2] = vis_ld_u8_nf(sp3 + 2);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);
			vfu[3] = vis_ld_u8_nf(sp3 + 3);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
			fu = ffu[0];
			fv = ffv[0];
			sp2 += 4;
			sp3 += 4;
			emask1 = emask;
		}

		if (i <= width - 8) {

			dy1 = (*spy++);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ufu[0] = ufu[2];

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			vfu[0] = vfu[2];

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr = vis_fpack16_pair(dr1, dr2);
			dg = vis_fpack16_pair(dg1, dg2);
			db = vis_fpack16_pair(db1, db2);

			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			fu = ffu[0];
			fv = ffv[0];

			i += 8;
			emask1 = emask;
		}

		if (i < width) {

			dy1 = vis_ld_d64_nf(spy);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			fu = vis_fpack16(db1);

			dg2 = vis_fpmerge(fu, vis_fpack16(dg1));
			dg3 = vis_fpmerge(fu, vis_fpack16(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
		}

		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
		emask = 0x7777;
	}
	__mlib_free(buf);
	return (MLIB_SUCCESS);
}
/*
 * Convert an interleaved YUV 4:4:4 image (3 bytes per pixel, in the order
 * Y, U, V) into packed UYVY 4:2:2 (one 32-bit word per pair of pixels).
 *
 * For every horizontal pair of source pixels the output word is
 *	(U << 24) | (Y0 << 16) | (V << 8) | Y1
 * where U and V are the sum of the two source chroma samples shifted
 * right by one.
 *
 *	uyvy	destination, one mlib_u32 per pixel pair
 *	yuv	source, interleaved Y/U/V bytes
 *	w	width in pixels; halved internally, so an odd trailing
 *		pixel is not converted
 *	h	number of rows
 *	dlb	destination stride in bytes (converted to words internally)
 *	slb	source stride in bytes
 *
 * Nothing is written when the halved width or the height is not positive.
 */
void
__mlib_VideoColorYUV444int_to_UYVY422int(
	mlib_u32 *uyvy,
	const mlib_u8 *yuv,
	mlib_s32 w,
	mlib_s32 h,
	mlib_s32 dlb,
	mlib_s32 slb)
{
	mlib_s32 i, val_y0, val_y1, val_u0, val_v0, count, left;

/* destination stride in 32-bit words, width in pixel pairs */
	dlb >>= 2;
	w >>= 1;

/*
 * Reject negative sizes as well as zero: a negative pair count would
 * give a positive "left" below and cause stray stores.
 */
	if (w <= 0 || h <= 0)
		return;

/* count: groups of 4 output words; left: 0..3 remaining output words */
	count = w >> 2;
	left = w - (count << 2);

/*
 * GSR scale factor 6: the chroma sums (two 8-bit samples) packed by
 * vis_fpack16 below then come out halved (presumably (x << 6) >> 7;
 * confirm against the VIS manual).
 */
	vis_write_gsr(6 << 3);

	for (i = 0; i < h; i++, yuv += slb, uyvy += dlb) {
		if ((((mlib_addr)yuv | (mlib_addr)uyvy) & 7) == 0) {
/* both row pointers are 8-byte aligned: direct 64-bit loads and stores */
			mlib_d64 w_y, w_u, w_v, w_uv, w_tmp0, w_tmp1, w_acc0,
				w_acc1;
			mlib_d64 w_ld0, w_ld1, w_ld2;
			mlib_f32 v_one = vis_to_float(0x1000000);
			mlib_f32 v_u, v_v;
			mlib_s32 j;

#pragma pipeloop(0)
			for (j = 0; j < count; j++) {
/* 24 source bytes (8 pixels) -> 16 destination bytes (4 words) */
				w_ld0 = ((mlib_d64 *)yuv)[3 * j];
				w_ld1 = ((mlib_d64 *)yuv)[3 * j + 1];
				w_ld2 = ((mlib_d64 *)yuv)[3 * j + 2];

/* presumably de-interleaves the three loads into Y, U and V vectors */
				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1,
					w_ld2);

/* two merges regroup U so that paired samples share a 16-bit lane index */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

/* same regroup / add / pack sequence for V */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				((mlib_d64 *)uyvy)[2 * j] =
					VIS_FPMERGE_HI(w_uv, w_y);
				((mlib_d64 *)uyvy)[2 * j + 1] =
					VIS_FPMERGE_LO(w_uv, w_y);
			}

			if (left) {
/*
 * Tail: convert one more full group into a scratch buffer and copy out
 * only the "left" valid words.  The *_nf loads are used because the
 * group may extend past the end of the source row.
 */
				mlib_d64 res_buf[2];

				w_ld0 = vis_ld_d64_nf((mlib_d64 *)yuv + 3 *
					count);
				w_ld1 = vis_ld_d64_nf((mlib_d64 *)yuv + 3 *
					count + 1);
				w_ld2 = vis_ld_d64_nf((mlib_d64 *)yuv + 3 *
					count + 2);

				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1,
					w_ld2);

				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				res_buf[0] = VIS_FPMERGE_HI(w_uv, w_y);
				res_buf[1] = VIS_FPMERGE_LO(w_uv, w_y);

				for (j = 0; j < left; j++) {
					((mlib_f32 *)uyvy)[4 * count + j] =
						((mlib_f32 *)res_buf)[j];
				}
			}
		} else {
/* at least one of the row pointers is not 8-byte aligned */
			mlib_d64 w_y, w_u, w_v, w_uv, w_tmp0, w_tmp1, w_acc0,
				w_acc1;
			mlib_d64 w_ld0, w_ld1, w_ld2;
			mlib_f32 v_one = vis_to_float(0x1000000);
			mlib_f32 v_u, v_v;
			mlib_s32 j;
			mlib_d64 *al_addr;
			mlib_d64 l0, l1, l2, l3;
			const mlib_u8 *pyuv = yuv;
			mlib_u32 *puyvy = uyvy;

			if ((mlib_addr)puyvy & 7) {
/*
 * Emit the first output word with scalar code so that the remaining
 * destination is 8-byte aligned.  The word is assembled in unsigned
 * arithmetic: shifting a signed value >= 128 left by 24 is undefined.
 */
				val_y0 = yuv[0];
				val_y1 = yuv[3];
				val_u0 = (yuv[1] + yuv[4]) >> 1;
				val_v0 = (yuv[2] + yuv[5]) >> 1;
				puyvy[0] =
					((mlib_u32)val_u0 << 24) |
					((mlib_u32)val_y0 << 16) |
					((mlib_u32)val_v0 << 8) |
					(mlib_u32)val_y1;
				pyuv += 6;
				puyvy++;
				count = (w - 1) >> 2;
				left = (w - 1) - (count << 2);
			} else {
				count = w >> 2;
				left = w - (count << 2);
			}

/* source is read through aligned loads combined by vis_faligndata */
			al_addr = vis_alignaddr((void *)pyuv, 0);
			l0 = vis_ld_d64_nf(al_addr); al_addr++;
#pragma pipeloop(0)
			for (j = 0; j < count; j++) {
				l1 = (*al_addr++);
				l2 = (*al_addr++);
				l3 = vis_ld_d64_nf(al_addr); al_addr++;
				w_ld0 = vis_faligndata(l0, l1);
				w_ld1 = vis_faligndata(l1, l2);
				w_ld2 = vis_faligndata(l2, l3);
				l0 = l3;
				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1,
					w_ld2);

				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				((mlib_d64 *)puyvy)[2 * j] =
					VIS_FPMERGE_HI(w_uv, w_y);
				((mlib_d64 *)puyvy)[2 * j + 1] =
					VIS_FPMERGE_LO(w_uv, w_y);
			}

			if (left) {
/* tail group: same scheme as the aligned path, via a scratch buffer */
				mlib_d64 res_buf[2];

				l1 = vis_ld_d64_nf(al_addr); al_addr++;
				l2 = vis_ld_d64_nf(al_addr); al_addr++;
				l3 = vis_ld_d64_nf(al_addr);
				w_ld0 = vis_faligndata(l0, l1);
				w_ld1 = vis_faligndata(l1, l2);
				w_ld2 = vis_faligndata(l2, l3);

				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1,
					w_ld2);

				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				res_buf[0] = VIS_FPMERGE_HI(w_uv, w_y);
				res_buf[1] = VIS_FPMERGE_LO(w_uv, w_y);

				for (j = 0; j < left; j++) {
					((mlib_f32 *)puyvy)[4 * count + j] =
						((mlib_f32 *)res_buf)[j];
				}
			}

/* restore the full-row counts for rows that take the aligned path */
			count = w >> 2;
			left = w - (count << 2);
		}
	}
}
/*
 * Convert planar YUV 4:4:4 (separate Y, U and V planes) into packed
 * UYVY 4:2:2 (one 32-bit word per pair of pixels).
 *
 * For every horizontal pair of source pixels the output word is
 *	(U << 24) | (Y0 << 16) | (V << 8) | Y1
 * where U and V are the sum of the two source chroma samples shifted
 * right by one.
 *
 *	uyvy	destination, one mlib_u32 per pixel pair
 *	y, u, v	source planes; all three advance by slb per row
 *	w	width in pixels; halved internally, so an odd trailing
 *		pixel is not converted
 *	h	number of rows
 *	dlb	destination stride in bytes (converted to words internally)
 *	slb	source stride in bytes, shared by the three planes
 *
 * Nothing is written when the halved width or the height is not positive.
 */
void
__mlib_VideoColorYUV444seq_to_UYVY422int(
	mlib_u32 *uyvy,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 w,
	mlib_s32 h,
	mlib_s32 dlb,
	mlib_s32 slb)
{
	mlib_s32 i, j2, val_y0, val_y1, val_u0, val_v0, count, left;

/* destination stride in 32-bit words, width in pixel pairs */
	dlb >>= 2;
	w >>= 1;

/*
 * Reject negative sizes as well as zero: a negative pair count would
 * give a positive "left" below and cause stray stores.
 */
	if (w <= 0 || h <= 0)
		return;

/* count: groups of 4 output words; left: 0..3 remaining output words */
	count = w >> 2;
	left = w - (count << 2);

/*
 * GSR scale factor 6: the chroma sums (two 8-bit samples) packed by
 * vis_fpack16 below then come out halved (presumably (x << 6) >> 7;
 * confirm against the VIS manual).
 */
	vis_write_gsr(6 << 3);

	for (i = 0; i < h; i++, y += slb, u += slb, v += slb, uyvy += dlb) {
		if ((((mlib_addr)u | (mlib_addr)v | (mlib_addr)y | (mlib_addr)
			uyvy) & 7) == 0) {
/* all four row pointers are 8-byte aligned: 64-bit loads and stores */
			mlib_d64 w_y, w_u, w_v, w_uv, w_tmp0, w_tmp1, w_acc0,
				w_acc1;
			mlib_f32 v_one = vis_to_float(0x1000000);
			mlib_f32 v_u, v_v;
			mlib_s32 j;

#pragma pipeloop(0)
			for (j = 0; j < count; j++) {
/* 8 pixels from each plane -> 16 destination bytes (4 words) */
				w_y = ((mlib_d64 *)y)[j];
				w_u = ((mlib_d64 *)u)[j];
				w_v = ((mlib_d64 *)v)[j];

/* two merges regroup U so that paired samples share a 16-bit lane index */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

/* same regroup / add / pack sequence for V */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				((mlib_d64 *)uyvy)[2 * j] =
					VIS_FPMERGE_HI(w_uv, w_y);
				((mlib_d64 *)uyvy)[2 * j + 1] =
					VIS_FPMERGE_LO(w_uv, w_y);
			}

			if (left) {
/*
 * Tail: convert one more full group into a scratch buffer and copy out
 * only the "left" valid words.
 * NOTE(review): these are plain aligned 8-byte loads that can read a few
 * bytes past the end of each plane row - confirm that is acceptable.
 */
				mlib_d64 res_buf[2];

				w_y = ((mlib_d64 *)y)[count];
				w_u = ((mlib_d64 *)u)[count];
				w_v = ((mlib_d64 *)v)[count];

				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				res_buf[0] = VIS_FPMERGE_HI(w_uv, w_y);
				res_buf[1] = VIS_FPMERGE_LO(w_uv, w_y);

				for (j = 0; j < left; j++) {
					((mlib_f32 *)uyvy)[4 * count + j] =
						((mlib_f32 *)res_buf)[j];
				}
			}
		} else {
/*
 * Scalar fallback for rows where any pointer is not 8-byte aligned.
 * The word is assembled in unsigned arithmetic: shifting a signed
 * value >= 128 left by 24 is undefined.
 */

#pragma pipeloop(0)
			for (j2 = 0; j2 < w; j2++) {
				mlib_s32 j = 2 * j2;
				mlib_s32 j1 = j + 1;

				val_y0 = y[j];
				val_y1 = y[j1];
				val_u0 = (u[j] + u[j1]) >> 1;
				val_v0 = (v[j] + v[j1]) >> 1;
				uyvy[j2] =
					((mlib_u32)val_u0 << 24) |
					((mlib_u32)val_y0 << 16) |
					((mlib_u32)val_v0 << 8) |
					(mlib_u32)val_y1;
			}
		}
	}
}