/*
 * __mlib_VideoInterpAveX_U8_U8_8x16
 *
 * Fixed-size entry point (8x16 going by the name and the macro argument 16).
 * The actual work is done by two macros that are defined outside this view,
 * MLIB_V_VIDEOCOPY8() and MLIB_V_VIDEOINTERPAVG8().  They are invoked without
 * operands, so they presumably pick up the locals declared here (y, dd, ss0,
 * sp1, sp2, s1hi, s1lo, s2hi, s2lo, s2, mthree, fzero, fexpd2) and the stride
 * parameters by name -- confirm against the macro definitions before renaming
 * or removing anything.
 *
 *   curr_block    destination block; accessed through an mlib_d64 pointer
 *   ref_block     reference block; may have any byte alignment
 *   frame_stride  not referenced directly in this body
 *   field_stride  not referenced directly in this body
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoInterpAveX_U8_U8_8x16(
    mlib_u8 *curr_block,
    const mlib_u8 *ref_block,
    mlib_s32 frame_stride,
    mlib_s32 field_stride)
{
    mlib_s32 y;
    mlib_d64 *dd, ss0[16], *sp1, *sp2, s1hi, s1lo, s2hi, s2lo, s2;
    mlib_d64 mthree = vis_fone();
    mlib_f32 fzero = vis_fzeros();
    mlib_f32 fexpd2 = vis_to_float(0x1000200);

    /*
     * mthree = 3 * vis_fone() in every 16-bit lane.
     * NOTE(review): presumably vis_fone() is all-ones (-1 per lane), which
     * would make this -3 ("minus three") -- confirm.
     */
    mthree = vis_fpadd16(mthree, vis_fpadd16(mthree, mthree));

    dd = (mlib_d64 *)curr_block;

    /* 8-byte aligned base of ref_block; also sets the GSR align offset */
    sp1 = (mlib_d64 *)vis_alignaddr((void *)ref_block, 0);

#pragma pipeloop(0)
    MLIB_V_VIDEOCOPY8(16);

    /*
     * Re-program the GSR: scale factor 5 plus the byte offset of
     * ref_block + 1 inside its 8-byte word, and point sp2 at that word.
     */
    vis_write_gsr((5 << 3) + ((mlib_s32)(ref_block + 1) & 7));
    sp2 = (mlib_d64 *)((mlib_addr)(ref_block + 1) & ~7);

#pragma pipeloop(0)
    MLIB_V_VIDEOINTERPAVG8(16);

    return (MLIB_SUCCESS);
}
static void FUNC(
    m4) (
    FUNC_M_ARG)
{
	mlib_s32 i;
	mlib_d64 k0 = pkern[0];
	mlib_d64 k1 = pkern[1];
	mlib_d64 k2 = pkern[2];
	mlib_d64 k3 = pkern[3];
	mlib_d64 a0, a1, a2, a3, sum;
	mlib_d64 *perror = vis_alignaddr(perror1, 0);

	a0 = (*perror++);
	a1 = (*perror++);
	a2 = (*perror++);

	for (i = 0; i < sw; i++) {
		a3 = (*perror++);
		sum = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0));
		sum = vis_fpadd16(sum, FMUL_16x16(k1, a1));
		sum = vis_fpadd16(sum, FMUL_16x16(k2, a2));
		buffd[i] = vis_fpadd16(sum, FMUL_16x16(k3, a3));
		a0 = a1;
		a1 = a2;
		a2 = a3;
	}
}
/*
 * mlib_v_ImageAffineTableLine_8nw_3_2_1
 *
 * Produces one line of 16-bit intermediate results into buff.  The main loop
 * handles two output pixels per iteration through macros; the tail loop
 * handles a remaining single pixel with explicit VIS code that combines
 * three source rows (row00, row10, row20) vertically and then applies the
 * horizontal filter.
 *
 * Names such as i, size, buff1, res1, res, sPtr, fx0, xFilter, yFilter,
 * row00, row10, v0..v3, sum and dstPixelPtr are not declared in the visible
 * text; presumably they come from DECLAREVAR / DECLAREVAR2 (and are
 * initialised there from ws, filterX, filterY, lineAddr) -- confirm.
 *
 *   buff      output buffer for this line
 *   filterX   horizontal filter table (used by the macros, presumably)
 *   filterY   vertical filter table (used by the macros, presumably)
 *   lineAddr  source line pointers (used by the macros, presumably)
 *   ws        affine workspace (used by the macros, presumably)
 */
void
mlib_v_ImageAffineTableLine_8nw_3_2_1(
    mlib_d64 *buff,
    const mlib_d64 *filterX,
    const mlib_d64 *filterY,
    const mlib_u8 **lineAddr,
    mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;
	mlib_d64 yFilter2;
	mlib_d64 yFilter3;
	mlib_d64 row20, row30;
	mlib_d64 *dpSrc;
	mlib_d64 data0, data1, zero;

/*
 * GSR: upper 32 bits 0x0145ABEF (byte-shuffle mask field), low word 4.
 * NOTE(review): the consumers of this setting are inside the macros.
 */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;

/* not referenced directly below; presumably used by the macros */
	zero = vis_to_double_dup(0);

/* main loop: two output pixels per iteration, one mlib_d64 store each */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_3x2;
		FILTER_MERGE_4x2;
		MAKE_4x2;
		*buff1 = res1;
		buff1++;
	}

/* continue right after the last full store of the main loop */
	dstPixelPtr = (mlib_s16 *)buff1;

/* tail: at most one remaining pixel, stored as a single 16-bit value */
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, yFilter);
		xFilter = vis_write_hi(xFilter, fx0);
		LOAD_PIXEL_3;

/*
 * vertical pass: each of the three rows is scaled by one 16-bit
 * coefficient taken from yFilter (upper/lower half of the high word,
 * upper half of the low word) and the products are summed
 */
		v0 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter));
		v1 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter));
		sum = vis_fpadd16(v0, v1);
		v0 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter));
		sum = vis_fpadd16(v0, sum);

/*
 * horizontal pass: 16x16 lane multiply assembled from the sux/ulx
 * partial products, then scaled by vis_scale and the two 32-bit
 * halves added together
 */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_hi(v3));
		res =
		    vis_write_lo(res, vis_fpadd32s(vis_read_hi(v2),
		    vis_read_lo(v2)));

		vis_st_u16(res, dstPixelPtr++);
	}
}
Пример #4
0
/*
 * __mlib_VideoDCT4x4_S16_S16
 *
 * 4x4 transform of a block of signed 16-bit values.  Both blk and coeff are
 * accessed as four mlib_d64 words (four 16-bit lanes each), so every VIS
 * operation below works on four lanes at once.
 * NOTE(review): the mlib_d64 casts assume blk and coeff are 8-byte aligned.
 *
 * Structure: 1-D butterfly pass, transpose, second identical butterfly pass,
 * scaling by the third table coefficient, transpose into coeff.
 *
 *   coeff  output, 16 values
 *   blk    input, 16 values
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoDCT4x4_S16_S16(
	mlib_s16 *coeff,
	const mlib_s16 *blk)
{
	mlib_d64 a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3;

/* first three 32-bit entries of mlib_dct4vtab (defined elsewhere) */
	mlib_f32 a = ((mlib_f32 *)mlib_dct4vtab)[0];
	mlib_f32 b = ((mlib_f32 *)mlib_dct4vtab)[1];

	mlib_f32 c = ((mlib_f32 *)mlib_dct4vtab)[2];

	mlib_d64 *src = (mlib_d64 *)blk;
	mlib_d64 *dst = (mlib_d64 *)coeff;

/* column 1D DCT */
/* byte-shuffle mask; presumably consumed by TRANSPOSE_VIS2 -- confirm */
	vis_write_bmask(0x018923ab, 0x0);

/* sums and differences of the outer (0,3) and inner (1,2) words */
	a3 = vis_fpsub16(src[0], src[3]);
	a0 = vis_fpadd16(src[0], src[3]);
	a2 = vis_fpsub16(src[1], src[2]);
	a1 = vis_fpadd16(src[1], src[2]);

/* odd outputs: b1 = a*(a3+a2) + b*(a3-a2), b3 = a*(a3-a2) - b*(a3+a2) */
	c1 = vis_fpadd16(a3, a2);
	c3 = vis_fpsub16(a3, a2);
	b1 = vis_fpadd16(vis_fmul8x16(a, c1), vis_fmul8x16(b, c3));
	b3 = vis_fpsub16(vis_fmul8x16(a, c3), vis_fmul8x16(b, c1));
/* even outputs */
	b0 = vis_fpadd16(a0, a1);
	b2 = vis_fpsub16(a0, a1);

	TRANSPOSE_VIS2(b0, b1, b2, b3, c0, c1, c2, c3);

/* second pass: the same butterfly applied to the transposed data */
	a3 = vis_fpsub16(c0, c3);
	a0 = vis_fpadd16(c0, c3);
	a2 = vis_fpsub16(c1, c2);
	a1 = vis_fpadd16(c1, c2);

	c1 = vis_fpadd16(a3, a2);
	c3 = vis_fpsub16(a3, a2);
	b1 = vis_fpadd16(vis_fmul8x16(a, c1), vis_fmul8x16(b, c3));
	b3 = vis_fpsub16(vis_fmul8x16(a, c3), vis_fmul8x16(b, c1));
	b0 = vis_fpadd16(a0, a1);
	b2 = vis_fpsub16(a0, a1);

/* final scaling of all four result words by coefficient c */
	b0 = vis_fmul8x16(c, b0);
	b2 = vis_fmul8x16(c, b2);
	b1 = vis_fmul8x16(c, b1);
	b3 = vis_fmul8x16(c, b3);

/* transpose back, writing directly into the output block */
	TRANSPOSE_VIS2(b0, b1, b2, b3, dst[0], dst[1], dst[2], dst[3]);

	return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageAffineTableLine_8nw_2_2_1
 *
 * Produces one line of 16-bit intermediate results into buff.  The main loop
 * handles two output pixels per iteration through macros; the tail loop
 * handles a remaining single pixel, built from a 2x2 neighbourhood
 * (sPtr[0], sPtr[1], sPtr[srcStride], sPtr[srcStride + 1]).
 *
 * Names such as i, size, buff1, res1, res, sPtr, srcStride, fx0, fy0,
 * xFilter, row00, row10, v0..v3, sum and dstPixelPtr are not declared in the
 * visible text; presumably they come from DECLAREVAR / DECLAREVAR2 (and are
 * initialised there from ws, filterX, filterY, lineAddr) -- confirm.
 */
void
mlib_v_ImageAffineTableLine_8nw_2_2_1(
    mlib_d64 *buff,
    const mlib_d64 *filterX,
    const mlib_d64 *filterY,
    const mlib_u8 **lineAddr,
    mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;

/*
 * GSR: upper 32 bits 0x0145ABEF (byte-shuffle mask field), low word 4.
 * NOTE(review): the consumers of this setting are inside the macros.
 */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;

/* main loop: two output pixels per iteration, one mlib_d64 store each */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_2x2(row00, row10);
		FILTER_MERGE;
		MAKE_2x2;
		*buff1 = res1;
		buff1++;
	}

/* continue right after the last full store of the main loop */
	dstPixelPtr = (mlib_s16 *)buff1;

/* tail: at most one remaining pixel, stored as a single 16-bit value */
#pragma pipeloop(0)
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, fy0);
		xFilter = vis_write_lo(xFilter, fx0);

/* gather the two horizontally adjacent bytes of each of the two rows */
		row00 = vis_fpmerge(LD_U8(sPtr, 0), LD_U8(sPtr, 1));
		row10 =
		    vis_fpmerge(LD_U8(sPtr, srcStride), LD_U8(sPtr,
		    srcStride + 1));

/*
 * vertical pass: upper row scaled by the upper 16 bits of fy0, lower row
 * by the lower 16 bits, then summed
 */
		v0 = vis_fmul8x16au(vis_read_lo(row00), fy0);
		v1 = vis_fmul8x16al(vis_read_lo(row10), fy0);
		sum = vis_fpadd16(v0, v1);
/*
 * horizontal pass: 16x16 lane multiply assembled from the sux/ulx
 * partial products, then scaled by vis_scale and the two 32-bit
 * halves added together
 */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_lo(v3));
		res =
		    vis_write_lo(res, vis_fpadd32s(vis_read_hi(v2),
		    vis_read_lo(v2)));

		vis_st_u16(res, dstPixelPtr++);
	}
}
Пример #6
0
/*
 * __mlib_VideoDownSample422
 *
 * Horizontal 2:1 down-sampling: every pair of adjacent source bytes is
 * replaced by its average, so n source bytes yield n / 2 destination bytes.
 * Rounding alternates between "round down" and "round up" from one output
 * to the next (bias 0, 1, 0, 1, ...), both in the VIS loop (see `round`)
 * and in the scalar tail (see `bias`).
 *
 *   dst  output, n / 2 bytes; written 4 bytes at a time through an
 *        mlib_f32 pointer in the VIS loop
 *   src  input, n bytes; read 8 bytes at a time through an mlib_d64
 *        pointer in the VIS loop
 *   n    number of source bytes
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * For odd n the trailing unpaired source byte is ignored.  The scalar tail
 * previously ran while i < n, which for odd n read src[n] and stored an
 * extra byte at dst[n / 2], both outside the arrays.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *sp0 = (mlib_d64 *)src;
	mlib_f32 *pd = (mlib_f32 *)dst;
	mlib_d64 d0;
	mlib_d64 tmp0, tmp1, data;
	mlib_d64 acc0_hi, acc0_lo;
/* 16-bit lanes 0, 1, 0, 1: alternating rounding term */
	mlib_d64 round = vis_to_double_dup(0x1);
/* upper 16 bits 0x0100: multiplying by it widens a byte unchanged */
	mlib_f32 fone = vis_to_float(0x1000000);
	mlib_s32 i, bias = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

/* pack scale 6: vis_fpack16 then yields (sum >> 1), clamped to 0..255 */
	vis_write_gsr(6 << 3);

/* VIS loop: 8 source bytes in, 4 averaged bytes out per iteration */
#pragma pipeloop(0)
	for (i = 0; i <= n - 8; i += 8) {
		d0 = (*sp0++);
/* two merges de-interleave: even bytes -> high half, odd -> low half */
		tmp0 = vis_fpmerge(vis_read_hi(d0), vis_read_lo(d0));
		tmp1 = vis_fpmerge(vis_read_hi(tmp0), vis_read_lo(tmp0));

		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp1), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp1), fone);

		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data = vis_fpadd16(acc0_hi, round);

		(*pd++) = vis_fpack16(data);
	}

	dst = (mlib_u8 *)pd;

/*
 * Scalar tail: only complete pairs.  n >= 1 here, so n - 1 cannot
 * overflow.  i is a multiple of 8 on entry, so bias continues the
 * 0, 1, 0, 1 pattern of the VIS loop.
 */
	for (; i < n - 1; i += 2) {
		(*dst++) = (src[i] + src[i + 1] + bias) >> 1;
/* 0=>1, 1=>0 */
		bias ^= 1;
	}

	return (MLIB_SUCCESS);
}
static void FUNC(
    m2) (
    FUNC_M_ARG)
{
	mlib_s32 i;
	mlib_d64 k0 = pkern[0];
	mlib_d64 k1 = pkern[1];
	mlib_d64 a0, a1, aa, sum;
	mlib_d64 *perror = vis_alignaddr(perror1, 0);

	a0 = (*perror++);

	for (i = 0; i < (sw + 3) / 4; i++) {
		aa = (*perror++);
		a1 = vis_faligndata(a0, aa);
		sum = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0));
		buffd[i] = vis_fpadd16(sum, FMUL_16x16(k1, a1));
		a0 = aa;
	}
}
Пример #8
0
/*
 * __mlib_VideoAddBlock_U8_S16
 *
 * Adds an 8x8 block of signed 16-bit values to an 8x8 block of unsigned
 * bytes in place.  Each of the 8 rows is one mlib_d64 of pixels in
 * curr_block (rows are `stride` bytes apart) and two consecutive mlib_d64
 * words of mc_block.  The sums are packed back to bytes with
 * vis_fpack16_pair under a GSR scale of 7.
 *
 *   curr_block  pixels, updated in place; accessed as mlib_d64
 *   mc_block    64 signed 16-bit values; accessed as mlib_d64
 *   stride      byte distance between consecutive rows of curr_block
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoAddBlock_U8_S16(
	mlib_u8 *curr_block,
	const mlib_s16 *mc_block,
	mlib_s32 stride)
{
	mlib_s32 row;
	mlib_d64 *resid;
	mlib_u8 *line;
	mlib_d64 pix, wide_hi, wide_lo, sum_hi, sum_lo;
	mlib_f32 fzero = vis_fzeros();

/* lower 16 bits 0x0100: fmul8x16al by it widens a byte unchanged */
	mlib_f32 unit = vis_to_float(0x100);

	vis_write_gsr(7 << 3);

	line = curr_block;
	resid = (mlib_d64 *)mc_block;

#pragma pipeloop(0)
	for (row = 0; row < 8; row++) {

		pix = *(mlib_d64 *)line;
		sum_hi = resid[2 * row];
		sum_lo = resid[2 * row + 1];

/* widen the 8 pixels to 16-bit lanes: upper 4 by merge, lower 4 by mul */
		wide_hi = vis_fpmerge(fzero, vis_read_hi(pix));
		wide_lo = vis_fmul8x16al(vis_read_lo(pix), unit);

		sum_hi = vis_fpadd16(sum_hi, wide_hi);
		sum_lo = vis_fpadd16(sum_lo, wide_lo);

		*(mlib_d64 *)line = vis_fpack16_pair(sum_hi, sum_lo);
		line += stride;
	}

	return (MLIB_SUCCESS);
}
static void FUNC(
    m1) (
    FUNC_M_ARG)
{
	mlib_s32 i;
	mlib_d64 k0 = pkern[0];
	mlib_d64 a0, e0, e1;
	mlib_d64 *perror = vis_alignaddr(perror1, 0);

	e0 = (*perror++);

	for (i = 0; i < (sw + 3) / 4; i++) {
		e1 = (*perror++);
		a0 = vis_faligndata(e0, e1);
		buffd[i] = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0));
		e0 = e1;
	}
}
/*
 * __mlib_VideoInterpAveX_U8_U8_16x16
 *
 * Fixed-size 16x16 kernel.  For every row it builds two 16-byte views of the
 * reference data -- one starting at ref_block and one starting at
 * ref_block + 1 -- and hands them, together with the current destination
 * contents, to the MLIB_V_VIDEOINTERPAVG* macros, which update the
 * destination words.  The macros are defined outside this view; dzero,
 * fzero, fm2 and rounder are not referenced directly below and are
 * presumably consumed by them -- confirm.
 *
 * The loop runs 8 times and handles two rows per iteration.  Both the
 * reference and the destination are advanced by field_stride.
 *
 *   curr_block    destination, read and written as mlib_d64 words
 *   ref_block     reference data; any byte alignment
 *   frame_stride  not referenced directly in this body
 *   field_stride  byte distance between consecutive rows
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoInterpAveX_U8_U8_16x16(
    mlib_u8 *curr_block,
    const mlib_u8 *ref_block,
    mlib_s32 frame_stride,
    mlib_s32 field_stride)
{
    mlib_d64 s0, s1, s2, s3, s4, s5, s6;
    mlib_d64 sd0, sd1, sd2, sd3, d0, d1, d2, d3;
    mlib_d64 *sd, *dd;
    mlib_d64 dzero = vis_fzero();
    const mlib_f32 fm2 = vis_to_float(0x1000200);
    mlib_f32 fzero = vis_read_hi(dzero);
    mlib_d64 rounder = vis_fpsub16(dzero, vis_fone());
    mlib_s32 y;

    /* rounder = 3 * (0 - vis_fone()) in every 16-bit lane */
    rounder = vis_fpadd16(vis_fpadd16(rounder, rounder), rounder);
    /* GSR: scale factor 5, align offset = byte offset of ref_block */
    vis_write_gsr((5 << 3) + ((mlib_u32)ref_block & 7));
    dd = (mlib_d64 *)curr_block;
    /* 8-byte aligned word that contains ref_block */
    sd = (mlib_d64 *)((mlib_addr)ref_block & ~7);

    y = 8;

    if (((mlib_s32)(ref_block + 1) & 7)) {
        /*
         * ref_block + 1 is not 8-byte aligned: the shifted view needs its
         * own vis_faligndata pass, so the GSR align offset is switched to
         * ref_block + 1 in the middle of the iteration and back to
         * ref_block at the end.
         */
        do {
            /* first row, view at ref_block */
            s0 = sd[0];
            s1 = sd[1];
            s2 = sd[2];
            sd0 = vis_faligndata(s0, s1);
            sd1 = vis_faligndata(s1, s2);
            sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
            /* second row, view at ref_block */
            s4 = sd[0];
            s5 = sd[1];
            s6 = sd[2];
            sd2 = vis_faligndata(s4, s5);
            sd3 = vis_faligndata(s5, s6);
            vis_alignaddr((void *)(ref_block + 1), 0);
            sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
            d0 = dd[0];
            d1 = dd[1];
            d2 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[0];
            d3 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[1];
            /* both rows again, now as the view at ref_block + 1 */
            s0 = vis_faligndata(s0, s1);
            s1 = vis_faligndata(s1, s2);
            s2 = vis_faligndata(s4, s5);
            s3 = vis_faligndata(s5, s6);

            MLIB_V_VIDEOINTERPAVG(d0, sd0, s0);
            MLIB_V_VIDEOINTERPAVG(d1, sd1, s1);
            MLIB_V_VIDEOINTERPAVG(d2, sd2, s2);
            MLIB_V_VIDEOINTERPAVG(d3, sd3, s3);

            dd[0] = d0;
            dd[1] = d1;
            dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
            dd[0] = d2;
            dd[1] = d3;
            dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
            /* restore the ref_block offset for the next iteration */
            vis_alignaddr((void *)ref_block, 0);
        } while (--y);
    } else {
        /*
         * ref_block + 1 is 8-byte aligned: the shifted view is simply the
         * next aligned words (s1, s2 and s5, s6), no second alignment pass.
         */
        do {
            s0 = sd[0];
            s1 = sd[1];
            s2 = sd[2];
            sd0 = vis_faligndata(s0, s1);
            sd1 = vis_faligndata(s1, s2);
            sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
            s4 = sd[0];
            s5 = sd[1];
            s6 = sd[2];
            sd2 = vis_faligndata(s4, s5);
            sd3 = vis_faligndata(s5, s6);
            sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
            d0 = dd[0];
            d1 = dd[1];
            d2 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[0];
            d3 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[1];

            /*
             * NOTE(review): only this first call uses the ...AVG0 variant;
             * the other three (and all four in the branch above) use
             * MLIB_V_VIDEOINTERPAVG.  Confirm this is intentional.
             */
            MLIB_V_VIDEOINTERPAVG0(d0, sd0, s1);
            MLIB_V_VIDEOINTERPAVG(d1, sd1, s2);
            MLIB_V_VIDEOINTERPAVG(d2, sd2, s5);
            MLIB_V_VIDEOINTERPAVG(d3, sd3, s6);

            dd[0] = d0;
            dd[1] = d1;
            dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
            dd[0] = d2;
            dd[1] = d3;
            dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
        } while (--y);
    }
    return (MLIB_SUCCESS);
}
/*
 * mlib_v_VideoColorYUV2ABGR422_nonalign
 *
 * YUV 4:2:2 to 4-byte-per-pixel conversion for arbitrarily aligned buffers.
 * Each U and V byte is duplicated (vis_fpmerge of a word with itself) so
 * that one chroma sample serves two horizontally adjacent pixels.
 * All stores are partial stores (vis_pst_8) under the mask 0x7777 shifted by
 * the destination misalignment, so one byte of every 4-byte pixel -- the
 * alpha channel -- is never written.
 *
 * The column loop is software-pipelined: the chroma products and the luma
 * word for the next group of 8 pixels are prepared at the end of the
 * previous iteration.  The GSR align offset is switched repeatedly between
 * the U offset (off2), the V offset (off3), the destination offset (off)
 * and the Y offset (sp1); the order of these calls relative to the
 * vis_faligndata calls matters.
 *
 *   abgr         destination image
 *   y, u, v      source planes
 *   width        pixels per row
 *   height       number of rows
 *   abgr_stride  byte distance between destination rows
 *   y_stride     byte distance between Y rows
 *   uv_stride    byte distance between U rows and between V rows
 *
 * Always returns MLIB_SUCCESS.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR422_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to the u (sp2/sl2) and v (sp3/sl3) source rows */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* pointers to the y source row */
	mlib_u8 *sp1, *sl1;

/* pointers to dst address: current row, row start, last byte of the row */
	mlib_u8 *dp, *dl, *dend;

/* aligned pointer to y */
	mlib_d64 *spy;

/* aligned pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;

/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 in the lower half; upper half 0x0100 widens a byte unchanged */
	mlib_f32 k5 = vis_to_float(0x1004097);
/* constant offsets subtracted from / added to the chroma terms */
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
	mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo;
	mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo;
	mlib_d64 dd, dd0, dd1;

/* loop variable */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_s32 off;
	mlib_f32 *dfu, *dfv;
	mlib_d64 du0, du1, dv0, dv1;
	mlib_s32 off2, off3;
	mlib_s32 inc;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/*
 * u and v are read as 4-byte words; after duplication every source byte
 * occupies two bytes, hence the doubled misalignment offsets
 */
		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;

/*
 * destination edge handling: emask1 is the mask of the first partial
 * store; inc is 0 when that first word is completely inside the row
 * (so it is stored again, fully, by the next store), 1 otherwise
 */
		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);
		i = dp - (mlib_u8 *)dpp;
		emask >>= i;
		inc = (emask1 != 0xff);
		emask1 &= emask;
		off = 8 - i;

/* prime the u pipeline */
		vis_alignaddr((void *)off2, 0);
		fu0 = vis_ld_f32_nf(dfu); dfu++;
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* prime the v pipeline */
		vis_alignaddr((void *)off3, 0);
		fv0 = vis_ld_f32_nf(dfv); dfv++;
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* prime the y pipeline */
		vis_alignaddr(sp1, 0);
		dy0 = vis_ld_d64_nf(spy); spy++;
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/*
 * column loop: 8 pixels (8 y, 4 u, 4 v bytes in, 4 partial 8-byte
 * stores out) per iteration
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

/* fetch the u bytes of the next 8 pixels */
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu); dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);
			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

/* fetch the v bytes of the next 8 pixels */
			vis_alignaddr((void *)off3, 0);

			fv1 = vis_ld_f32_nf(dfv); dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* from here on the GSR holds the destination offset for the stores */
			vis_alignaddr((void *)off, 0);
/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
/*
 * NOTE(review): on the first pass of the first row dd0 has not been
 * assigned yet; confirm its bytes are always either masked out by
 * emask1 or overwritten by the next store to the same word (inc == 0).
 */
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_hi),
				vis_read_lo(blue_red_hi));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_lo),
				vis_read_hi(blue_red_lo));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_lo),
				vis_read_lo(blue_red_lo));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

/* fetch the y bytes of the next 8 pixels */
			vis_alignaddr(sp1, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;
			emask1 = emask;
		}

/*
 * row tail (fewer than 8 pixels left): convert one more group of 8 and
 * store only as many 2-pixel words as are still inside the row
 */
		if (i < width) {

			vis_alignaddr((void *)off, 0);
/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);

			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dd0 = dd1;
			dpp += inc;

			i += 2;

			if (i < width) {

				dd1 = vis_fpmerge(vis_read_lo(x_green_hi),
					vis_read_lo(blue_red_hi));
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 2;

				if (i < width) {
					dd1 = vis_fpmerge(vis_read_hi
						(x_green_lo),
						vis_read_hi(blue_red_lo));
					dd = vis_faligndata(dd0, dd1);
					vis_pst_8(dd, dpp++, emask);
					dd0 = dd1;
				}
			}
		}

/*
 * flush the bytes still pending in dd0, limited to the end of the row
 */
		vis_alignaddr((void *)off, 0);
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

/* advance all planes to the next row and reset the store mask */
		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
		emask = 0x7777;
	}

	return (MLIB_SUCCESS);
}
Пример #12
0
/*
 * __mlib_VectorConvert_S8_U8_Sat
 *
 * Converts n unsigned bytes to signed bytes with saturation: values above
 * MLIB_S8_MAX become MLIB_S8_MAX, all others are copied.
 *
 * Short vectors (fewer than 16 elements) are handed to the PACK_U_S macro,
 * which is defined outside this view (presumably it converts the whole
 * vector and returns -- confirm).  Otherwise the destination is first
 * brought to an 8-byte boundary with scalar code, then 8 bytes are converted
 * per VIS step, and the remaining length & 7 bytes are finished with scalar
 * code again.
 *
 * VIS scheme: widen each byte to a 16-bit lane, add 0x80 (dsp), pack back
 * to bytes with vis_fpack16_pair under GSR scale 7, and flip bit 7 of every
 * byte (xor with rst).
 *
 *   z  destination vector
 *   x  source vector
 *   n  number of elements
 *
 * Returns MLIB_SUCCESS on the paths visible here.
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *src = (void *)x;
	mlib_s8 *dst = z;
/*
 * NOTE(review): declared mlib_d64 but initialised from vis_fzeros() and
 * passed as the first operand of vis_fpmerge below -- confirm the
 * intended type.
 */
	mlib_d64 fzero = vis_fzeros();
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_u8 c;
/* 0x0080 in every 16-bit lane */
	mlib_d64 dsp = vis_to_double_dup(0x800080);
/* 0x80 in every byte */
	mlib_d64 rst = vis_to_double_dup(0x80808080);
/* lower 16 bits 0x0100: fmul8x16al by it widens a byte unchanged */
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
	}

/*
 * First, try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
		length--;
	}

/*
 * rest_64: bytes left for the scalar tail; len_64: number of 8-byte VIS
 * steps; even_length: number of bytes covered by the VIS steps.
 */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.  The assignment in the condition is
 * intentional: i becomes 1 when len_64 is odd, 0 otherwise, so the
 * step-2 loop below always has an even number of words left.
 */

		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2. Unroll for 2 iterations.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d4 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			d2 = vis_fpmerge(fzero, vis_read_hi(d4));
			d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
			(*ddst++) = vis_fxor(d4, rst);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration (same odd/even trick as in the aligned branch).
 * d2 always holds the most recently loaded aligned word.
 */

		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d1 = vis_faligndata(d1, d2);
			d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
			d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpadd16(dsp, d4);
			d1 = vis_fpack16_pair(d3, d4);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d6 = vis_faligndata(d1, d2);
			d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d3 = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d6 = vis_fpack16_pair(d4, d5);
			(*ddst++) = vis_fxor(d3, rst);
			(*ddst++) = vis_fxor(d6, rst);
		}
	}

/*
 * Scalar tail.  src and dst still point at the first VIS-processed byte,
 * hence the even_length offset.
 */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c =
			src[even_length + i]) > MLIB_S8_MAX ? MLIB_S8_MAX : c;

	return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageAffineTableLine_8nw_2_2_4
 *
 * Produces one line of intermediate results into buff.  Almost all of the
 * work is done by macros defined outside this view (CALC_SRC_PTR,
 * LOAD_1PIXEL_2x2, MAKE_2x2, FADD_4x2, RESULT_1PIXEL_2x2); the names i,
 * size, res0, res1, d00 and d10 are not declared in the visible text and
 * presumably come from DECLAREVAR -- confirm.
 *
 * Visible structure:
 *   - a software-pipelined section, entered only when at least 6 items
 *     remain, whose prologue, loop and epilogue together store 2 mlib_d64
 *     words per FADD_4x2 step and account for i in steps of 2, plus a
 *     final i += 6 for the prologue/epilogue;
 *   - a tail loop that handles the remaining items one at a time, storing
 *     one mlib_d64 word each.
 *
 * NOTE(review): the relative order of the macro invocations is part of the
 * pipelining and must not be changed without reading the macro bodies.
 */
void
mlib_v_ImageAffineTableLine_8nw_2_2_4(
    mlib_d64 *buff,
    const mlib_d64 *filterX,
    const mlib_d64 *filterY,
    const mlib_u8 **lineAddr,
    mlib_affine_workspace *ws)
{
	DECLAREVAR;
	mlib_f32 yFilter;

	i = 0;

	if (i <= size - 6) {

/* pipeline prologue */
		CALC_SRC_PTR;
		LOAD_1PIXEL_2x2;

		CALC_SRC_PTR;

		MAKE_2x2(0);
		MAKE_2x2(1);

		FADD_4x2;

		MAKE_2x2(0);
		MAKE_2x2(1);

/* steady state: store the previous results, then compute the next pair */
#pragma pipeloop(0)
		for (; i <= size - 8; i += 2) {
			*buff = res0;
			buff++;
			*buff = res1;
			buff++;
			FADD_4x2;
			MAKE_2x2(0);
			MAKE_2x2(1);
		}

/* pipeline epilogue: drain the results that are still in flight */
		*buff = res0;
		buff++;
		*buff = res1;
		buff++;
		FADD_4x2;
		*buff = res0;
		buff++;
		*buff = res1;
		buff++;

		RESULT_1PIXEL_2x2(0);
		LOAD_1PIXEL_2x2;
		RESULT_1PIXEL_2x2(1);
		FADD_4x2;
		*buff = res0;
		buff++;
		*buff = res1;
		buff++;

/* the prologue and epilogue together produced 6 items */
		i += 6;
	}
/* tail: one item per iteration */
#pragma pipeloop(0)
	for (; i < size; i++) {
		CALC_SRC_PTR;
		LOAD_1PIXEL_2x2;
		RESULT_1PIXEL_2x2(0);
		res0 = vis_fpadd16(d00, d10);
		*buff = res0;
		buff++;
	}
}
Пример #14
0
/*
 * __mlib_VideoDownSample422 (byte-shuffle variant)
 *
 * Horizontal 2:1 down-sampling: every pair of adjacent source bytes is
 * replaced by its average, so n source bytes yield n / 2 destination bytes.
 * Rounding alternates between "round down" and "round up" from one output
 * to the next (see `round`).
 *
 *   dst  output, n / 2 bytes; written 8 bytes at a time through an
 *        mlib_d64 pointer
 *   src  input, n bytes; read 8 bytes at a time through an mlib_d64
 *        pointer
 *   n    number of source bytes
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * For odd n the trailing unpaired source byte is ignored.  The tail used
 * to run whenever i < n; with exactly one byte left, dst + n / 2 - 1 lies
 * in the 8-byte word before pd, vis_edge8() then returns only the left
 * mask (all ones for an aligned pd), and a full 8-byte store went past the
 * end of dst.  The tail now requires at least one complete pair.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *sp0 = (mlib_d64 *)src;
	mlib_d64 *pd = (mlib_d64 *)dst;
	mlib_d64 d0;
	mlib_d64 tmp, data0, data1;
	mlib_d64 acc0_hi, acc0_lo;
/* 16-bit lanes 0, 1, 0, 1: alternating rounding term */
	mlib_d64 round = vis_to_double_dup(0x1);
/* upper 16 bits 0x0100: multiplying by it widens a byte unchanged */
	mlib_f32 fone = vis_to_float(0x1000000);
	mlib_s32 i, edge;

	if (n <= 0)
		return (MLIB_FAILURE);

/* pack scale 6: the pack step then yields (sum >> 1), clamped to 0..255 */
	vis_write_gsr(6 << 3);
/* shuffle bytes 0,2,4,6 into the high half and 1,3,5,7 into the low half */
	vis_write_bmask(0x02461357, 0);

/* main loop: 16 source bytes in, 8 averaged bytes out per iteration */
#pragma pipeloop(0)
	for (i = 0; i <= n - 16; i += 16) {
		d0 = (*sp0++);
		tmp = vis_bshuffle(d0, d0);

		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);

		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data0 = vis_fpadd16(acc0_hi, round);

		d0 = (*sp0++);
		tmp = vis_bshuffle(d0, d0);
		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);

		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data1 = vis_fpadd16(acc0_hi, round);

		(*pd++) = vis_fpack16_pair(data0, data1);
	}

/*
 * Tail: 2..15 source bytes left.  A full group is computed (the second
 * load is non-faulting) and only the bytes up to dst[n / 2 - 1] are
 * stored through the edge mask.  n >= 1 here, so n - 1 cannot overflow.
 */
	if (i < n - 1) {
		d0 = (*sp0++);
		tmp = vis_bshuffle(d0, d0);

		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);

		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data0 = vis_fpadd16(acc0_hi, round);

		d0 = vis_ld_d64_nf(sp0);
		tmp = vis_bshuffle(d0, d0);
		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);

		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data1 = vis_fpadd16(acc0_hi, round);

		edge = vis_edge8(pd, (dst + (n / 2) - 1));
		vis_pst_8(vis_fpack16_pair(data0, data1), pd, edge);
	}
	return (MLIB_SUCCESS);
}
Пример #15
0
/*
 * mlib_v_VideoColorYUV2RGB444_nonalign
 *
 * YUV 4:4:4 to 3-byte-per-pixel conversion for arbitrarily aligned buffers.
 * Every row is converted into an 8-byte aligned scratch buffer and then
 * copied to the destination with __mlib_VectorCopy_U8(), so the destination
 * alignment never matters to the VIS code.
 *
 * The scratch buffer is the on-stack array BUFF unless width * 3 exceeds
 * 16 * 1024, in which case it is allocated with __mlib_malloc() and
 * released before returning.
 *
 * The column loop is software-pipelined: the red/green/blue words for the
 * first 8 pixels are computed before the loop; each iteration stores the
 * previous group (three 8-byte words built with vis_bshuffle) and computes
 * the next one.  The GSR align offset is switched between the u, v and y
 * addresses; the order of those calls relative to vis_faligndata matters.
 *
 *   rgb         destination image
 *   y, u, v     source planes
 *   width       pixels per row
 *   height      number of rows
 *   rgb_stride  byte distance between destination rows
 *   yuv_stride  byte distance between rows of each source plane
 *
 * Returns MLIB_FAILURE if the scratch allocation fails, MLIB_SUCCESS
 * otherwise.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB444_nonalign(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 rgb_stride,
	mlib_s32 yuv_stride)
{
/* aligned pointers to y, u, v */
	mlib_d64 *spy, *dfu, *dfv;

/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv, du0, du1, dv0, dv1;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
/* constant offsets subtracted from / added to the chroma terms */
	mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2;

/* loop variable */
	mlib_s32 i, j;
/*
 * NOTE(review): BUFF holds 16 * 1024 mlib_d64 elements, while the size
 * check below compares a byte count (width * 3) against 16 * 1024 --
 * the stack array looks larger than the check requires; confirm.
 */
	mlib_d64 *buf, BUFF[16 * 1024];
	mlib_u8 *tmp, *dp;

	if (width * 3 > 16 * 1024) {
/* 7 spare bytes so the pointer can be rounded up to an 8-byte boundary */
		tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

		if (tmp == NULL)
			return (MLIB_FAILURE);
		buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
	} else {
		buf = (mlib_d64 *)BUFF;
	}

	dp = (mlib_u8 *)buf;
	ddp = (mlib_d64 *)dp;

/* row loop */
	for (j = 0; j < height; j++) {

/* prime the u pipeline */
		dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0);
		du0 = (*dfu++);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* prime the v pipeline */
		dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0);
		dv0 = (*dfv++);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* prime the y pipeline */
		spy = (mlib_d64 *)vis_alignaddr((void *)y, 0);
		dy0 = (*spy++);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/*
 * pipeline prologue: convert the first 8 pixels of the row and pre-load
 * the u, v and y words of the second group
 */
/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);

		vis_alignaddr((void *)u, 0);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* Y*1.1644 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

		vis_alignaddr((void *)v, 0);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* Y*1.1644 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		g_hi = vis_fpadd16(g_hi, y_11644_hi);

		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
		g_lo = vis_fpadd16(g_lo, y_11644_lo);

		green = vis_fpack16_pair(g_hi, g_lo);
		b_hi = vis_fpadd16(b_hi, y_11644_hi);
		b_lo = vis_fpadd16(b_lo, y_11644_lo);

		blue = vis_fpack16_pair(b_hi, b_lo);
		r_hi = vis_fpadd16(r_hi, y_11644_hi);
		r_lo = vis_fpadd16(r_lo, y_11644_lo);

		red = vis_fpack16_pair(r_hi, r_lo);

		vis_alignaddr((void *)y, 0);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/* column loop: 8 pixels (24 output bytes) per iteration */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/*
 * interleave the previous group's red, green and blue words into three
 * 8-byte output words: red/green first, then blue is shuffled in
 */
			vis_write_bmask(0x0801902A, 0);
			dd0 = vis_bshuffle(red, green);
			vis_write_bmask(0x03B04C05, 0);
			dd1 = vis_bshuffle(red, green);
			vis_write_bmask(0xD06E07F0, 0);
			dd2 = vis_bshuffle(red, green);
			vis_write_bmask(0x01834967, 0);
			ddp[0] = vis_bshuffle(dd0, blue);
			vis_write_bmask(0xA12B45C7, 0);
			ddp[1] = vis_bshuffle(dd1, blue);
			vis_write_bmask(0x0D23E56F, 0);
			ddp[2] = vis_bshuffle(dd2, blue);

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);
			vis_alignaddr((void *)u, 0);
			du1 = vis_ld_d64_nf(dfu); dfu++;
			du = vis_faligndata(du0, du1);
			du0 = du1;

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
			vis_alignaddr((void *)v, 0);
			dv1 = vis_ld_d64_nf(dfv); dfv++;
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			g_hi = vis_fpadd16(g_hi, y_11644_hi);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			g_lo = vis_fpadd16(g_lo, y_11644_lo);

			green = vis_fpack16_pair(g_hi, g_lo);
			b_hi = vis_fpadd16(b_hi, y_11644_hi);
			b_lo = vis_fpadd16(b_lo, y_11644_lo);

			blue = vis_fpack16_pair(b_hi, b_lo);
			r_hi = vis_fpadd16(r_hi, y_11644_hi);
			r_lo = vis_fpadd16(r_lo, y_11644_lo);

			red = vis_fpack16_pair(r_hi, r_lo);

			vis_alignaddr((void *)y, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;

			ddp += 3;
		}

		dp = (mlib_u8 *)ddp;

/*
 * row tail (fewer than 8 pixels left): rotate the colour words by the
 * number of remaining pixels and store them one pixel at a time, walking
 * backwards from the last pixel of the row.
 * NOTE(review): STORE_PIXEL is defined outside this view and presumably
 * uses dp, red, green, blue and the GSR offset set just below -- confirm.
 */
		vis_alignaddr((void *)(width - i), 0);
		blue = vis_faligndata(blue, blue);
		green = vis_faligndata(green, green);
		red = vis_faligndata(red, red);
		dp += ((width - i - 1) * 3);

		vis_alignaddr((void *)spy, 7);
		for (; i < width; i++) {
			STORE_PIXEL(0, 1, 2);
			dp -= 3;
		}

/* copy the finished row from the scratch buffer to the destination */
		__mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3);

/* advance to the next row; the scratch buffer is reused from its start */
		rgb += rgb_stride;
		dp = (mlib_u8 *)buf;
		ddp = (mlib_d64 *)dp;
		y += yuv_stride;
		u += yuv_stride;
		v += yuv_stride;
	}

/* same condition as the allocation above, so tmp is valid here */
	if (width * 3 > 16 * 1024)
		__mlib_free(tmp);
	return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoInterpAveX_U8_U8
 *
 * Combines the destination block with a reference block and with the same
 * reference block shifted right by one byte (ref_block + 1), using the
 * MLIB_V_VIDEOINTERPAVG* macros, and writes the result back to curr_block.
 *
 *   curr_block    destination block; read and written through mlib_d64
 *                 pointers (assumes 8-byte alignment -- TODO confirm)
 *   ref_block     reference block; may be unaligned, handled with
 *                 vis_faligndata
 *   width         block width; only the width == 8 path is visible here
 *   height        block height; four rows are processed per iteration
 *   frame_stride  not referenced in the visible part of the function
 *   field_stride  byte distance between consecutive rows, applied to both
 *                 the reference and the destination pointers
 *
 * NOTE(review): this listing stops at the "else" of "if (width == 8)";
 * the remainder of the function is not available here.
 * NOTE(review): the loops are "do { } while (--y)" with y = height >> 2,
 * so the visible code expects height >= 4.
 */
mlib_status
__mlib_VideoInterpAveX_U8_U8(
    mlib_u8 *curr_block,
    const mlib_u8 *ref_block,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 frame_stride,
    mlib_s32 field_stride)
{
    /* raw 8-byte words of four reference rows (two words per row) */
    mlib_d64 s0, s1, s2, s3, s4, s5, s6, s7;
    /* sd*: reference rows aligned to ref_block; d*: destination rows */
    mlib_d64 sd0, sd1, sd2, sd3, d0, d1, d2, d3;
    mlib_d64 *sd, *dd;
    mlib_d64 dzero = vis_fzero();
    /* fm2, fzero and rounder are not used directly below; presumably */
    /* they are picked up by name inside MLIB_V_VIDEOINTERPAVG* */
    const mlib_f32 fm2 = vis_to_float(0x1000200);
    mlib_f32 fzero = vis_read_hi(dzero);
    mlib_d64 rounder = vis_fpsub16(dzero, vis_fone());
    mlib_s32 y;

    /* rounder = 3 * (0 - vis_fone()) per 16-bit lane */
    rounder = vis_fpadd16(vis_fpadd16(rounder, rounder), rounder);
    /* GSR: value 5 in the bits above the low three, ref_block & 7 as */
    /* the byte offset used by vis_faligndata */
    vis_write_gsr((5 << 3) + ((mlib_u32)ref_block & 7));
    dd = (mlib_d64 *)curr_block;
    /* 8-byte aligned address at or below ref_block */
    sd = (mlib_d64 *)((mlib_addr)ref_block & ~7);

    if (width == 8) {
        /* number of 4-row groups */
        y = height >> 2;

        if (((mlib_s32)(ref_block + 1) & 7)) {
            /* ref_block + 1 is not 8-byte aligned: the shifted row has */
            /* to be extracted with a second vis_faligndata pass */
            do {
                /* rows aligned to ref_block (GSR offset = ref_block & 7) */
                s0 = sd[0];
                s1 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd0 = vis_faligndata(s0, s1);
                s2 = sd[0];
                s3 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd1 = vis_faligndata(s2, s3);
                s4 = sd[0];
                s5 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd2 = vis_faligndata(s4, s5);
                s6 = sd[0];
                s7 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd3 = vis_faligndata(s6, s7);
                /* switch the GSR offset to ref_block + 1 */
                vis_alignaddr((void *)(ref_block + 1), 0);
                /* current destination rows */
                d0 = *dd;
                d1 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   field_stride);
                d2 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   2 * field_stride);
                d3 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   3 * field_stride);
                /* same rows, now aligned to ref_block + 1 */
                s0 = vis_faligndata(s0, s1);
                s1 = vis_faligndata(s2, s3);
                s2 = vis_faligndata(s4, s5);
                s3 = vis_faligndata(s6, s7);

                MLIB_V_VIDEOINTERPAVG(d0, sd0, s0);
                MLIB_V_VIDEOINTERPAVG(d1, sd1, s1);
                MLIB_V_VIDEOINTERPAVG(d2, sd2, s2);
                MLIB_V_VIDEOINTERPAVG(d3, sd3, s3);

                *dd = d0;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d1;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d2;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d3;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);

                /* restore the GSR offset to ref_block for the next group */
                vis_alignaddr((void *)ref_block, 0);
            } while (--y);
        } else {
            /* ref_block + 1 is 8-byte aligned, so the shifted row is */
            /* exactly the second loaded word of each row (s1/s3/s5/s7) */
            do {
                s0 = sd[0];
                s1 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd0 = vis_faligndata(s0, s1);
                s2 = sd[0];
                s3 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd1 = vis_faligndata(s2, s3);
                s4 = sd[0];
                s5 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd2 = vis_faligndata(s4, s5);
                s6 = sd[0];
                s7 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd3 = vis_faligndata(s6, s7);
                d0 = *dd;
                d1 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   field_stride);
                d2 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   2 * field_stride);
                d3 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   3 * field_stride);

                /* NOTE(review): only the first row uses the ...AVG0 */
                /* variant; confirm against the macro definitions that */
                /* this is intentional */
                MLIB_V_VIDEOINTERPAVG0(d0, sd0, s1);
                MLIB_V_VIDEOINTERPAVG(d1, sd1, s3);
                MLIB_V_VIDEOINTERPAVG(d2, sd2, s5);
                MLIB_V_VIDEOINTERPAVG(d3, sd3, s7);

                *dd = d0;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d1;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d2;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d3;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);

            } while (--y);
        }
    } else {
Пример #17
0
/*
 * mlib_v_VideoColorYUV2RGB420_nonalign
 *
 * Converts a YUV 4:2:0 image to interleaved 3-byte RGB pixels for inputs
 * and outputs that need not be 8-byte aligned.  Two luma rows that share
 * one chroma row are converted together.  Each pair of output rows is first
 * assembled in two 8-byte aligned scratch buffers and then copied to the
 * destination with __mlib_VectorCopy_U8.
 *
 *   rgb         destination image, 3 bytes per pixel
 *   y, u, v     source planes
 *   width       pixels per row
 *   height      number of rows; height / 2 row pairs are processed
 *   rgb_stride  byte distance between destination rows
 *   y_stride    byte distance between luma rows
 *   uv_stride   byte distance between chroma rows
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when width * 3 exceeds the on-stack
 * scratch size and a heap scratch buffer cannot be allocated.
 *
 * The local variable names must stay as they are: STORE_PIXEL1 and
 * STORE_PIXEL2 take no variable arguments and are expected to refer to
 * dp1/dp2 and the red/green/blue registers by name.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB420_nonalign(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 rgb_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* current / row-start pointers into the u and v planes */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* current / row-start pointers into the two luma rows */
	mlib_u8 *sp11, *sp12, *sl11, *sl12;

/* scratch write pointer and destination row pointer, first row */
	mlib_u8 *dp1, *dl1;

/* scratch write pointer and destination row pointer, second row */
	mlib_u8 *dp2, *dl2;

/* 8-byte aligned read pointers for the two luma rows */
	mlib_d64 *spy1, *spy2;

/* 4-byte aligned read pointers for u and v */
	mlib_f32 *dfu, *dfv;

/* y data */
	mlib_d64 dy0, dy1, dy2, dy3, dy4, dy5;

/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;
	mlib_d64 du, dv, du0, du1, dv0, dv1;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);

/* 222.9952*32, 135.6352*32 and 276.9856*32 in every 16-bit lane */
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 z_11644_hi, z_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
/* loop variables */
	mlib_s32 i, j;
	mlib_s32 y_stride2 = 2 * y_stride;
	mlib_s32 rgb_stride2 = 2 * rgb_stride;
	mlib_s32 off2, off3;
	mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21;
	mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22;
	mlib_d64 *buf1, BUFF1[16 * 1024];
	mlib_d64 *buf2, BUFF2[16 * 1024];
	mlib_u8 *tmp1, *tmp2;

	if (width * 3 > 16 * 1024) {
		tmp1 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);
		tmp2 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

/*
 * Bail out on allocation failure instead of rounding a null pointer
 * up and writing through it; release whichever buffer was obtained.
 */
		if (!tmp1 || !tmp2) {
			if (tmp1)
				__mlib_free(tmp1);
			if (tmp2)
				__mlib_free(tmp2);
			return (MLIB_FAILURE);
		}

/* round the heap blocks up to the next 8-byte boundary */
		buf1 = (mlib_d64 *)((mlib_addr)(tmp1 + 7) & ~7);
		buf2 = (mlib_d64 *)((mlib_addr)(tmp2 + 7) & ~7);
	} else {
		buf1 = (mlib_d64 *)BUFF1;
		buf2 = (mlib_d64 *)BUFF2;
	}

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp11 = sl11 = (mlib_u8 *)y;
	sp12 = sl12 = (mlib_u8 *)y + y_stride;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dp1 = (mlib_u8 *)buf1;
	dp2 = (mlib_u8 *)buf2;
	dl1 = (mlib_u8 *)rgb;
	dl2 = (mlib_u8 *)(rgb + rgb_stride);
	ddp1 = (mlib_d64 *)dp1;
	ddp2 = (mlib_d64 *)dp2;

/*
 * row loop: one iteration converts two luma rows and one chroma row
 */
	for (j = 0; j < height / 2; j++) {
		spy1 = (mlib_d64 *)vis_alignaddr(sp11, 0);
		spy2 = (mlib_d64 *)vis_alignaddr(sp12, 0);

/*
 * Chroma is read as aligned 4-byte words.  Every chroma byte is duplicated
 * by vis_fpmerge(x, x) below, so the byte misalignment is doubled before
 * it is used as the vis_faligndata offset.
 */
		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;

		vis_alignaddr((void *)off2, 0);
		fu0 = (*dfu++);
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

		vis_alignaddr((void *)off3, 0);
		fv0 = (*dfv++);
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		dy0 = (*spy1++);
		dy4 = (*spy2++);

		dy3 = vis_ld_d64_nf(spy1); spy1++;
		vis_alignaddr(sp11, 0);
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

		dy5 = vis_ld_d64_nf(spy2); spy2++;
		vis_alignaddr(sp12, 0);
		dy2 = vis_faligndata(dy4, dy5);
		dy4 = dy5;

/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 (first row) */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 (first row) */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* Z*1.1644 (second row) */
		z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* Z*1.1644 (second row) */
		z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

		temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
		temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

		green1 = vis_fpack16_to_hi(green1, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

		blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
		vis_alignaddr((void *)off2, 0);
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

		red1 = vis_fpack16_to_hi(red1, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
		vis_alignaddr((void *)off3, 0);
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

		green1 = vis_fpack16_to_lo(green1, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

		blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
		red1 = vis_fpack16_to_lo(red1, temp_r_lo);

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

		green2 = vis_fpack16_to_hi(green2, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

		blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

		red2 = vis_fpack16_to_hi(red2, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

		green2 = vis_fpack16_to_lo(green2, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

		blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
		red2 = vis_fpack16_to_lo(red2, temp_r_lo);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/*
 * column loop: 8 pixels of each of the two rows per iteration.
 * The loop is software-pipelined: it first stores the pixels computed
 * by the previous step (or by the preamble above) and then computes
 * the next group.
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/* interleave red and green, then merge in blue: 3 x 8 bytes per row */
			vis_write_bmask(0x0801902A, 0);
			dd01 = vis_bshuffle(red1, green1);
			dd02 = vis_bshuffle(red2, green2);
			vis_write_bmask(0x03B04C05, 0);
			dd11 = vis_bshuffle(red1, green1);
			dd12 = vis_bshuffle(red2, green2);
			vis_write_bmask(0xD06E07F0, 0);
			dd21 = vis_bshuffle(red1, green1);
			dd22 = vis_bshuffle(red2, green2);
			vis_write_bmask(0x01834967, 0);
			ddp1[0] = vis_bshuffle(dd01, blue1);
			ddp2[0] = vis_bshuffle(dd02, blue2);
			vis_write_bmask(0xA12B45C7, 0);
			ddp1[1] = vis_bshuffle(dd11, blue1);
			ddp2[1] = vis_bshuffle(dd12, blue2);
			vis_write_bmask(0x0D23E56F, 0);
			ddp1[2] = vis_bshuffle(dd21, blue1);
			ddp2[2] = vis_bshuffle(dd22, blue2);

			dy3 = vis_ld_d64_nf(spy1); spy1++;
			vis_alignaddr(sp11, 0);
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;

			dy5 = vis_ld_d64_nf(spy2); spy2++;
			vis_alignaddr(sp12, 0);
			dy2 = vis_faligndata(dy4, dy5);
			dy4 = dy5;

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 (first row) */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 (first row) */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* Z*1.1644 (second row) */
			z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* Z*1.1644 (second row) */
			z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green1 = vis_fpack16_to_hi(green1, temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu); dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			red1 = vis_fpack16_to_hi(red1, temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
			vis_alignaddr((void *)off3, 0);
			fv1 = vis_ld_f32_nf(dfv); dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

			green1 = vis_fpack16_to_lo(green1, temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
			red1 = vis_fpack16_to_lo(red1, temp_r_lo);

/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

			green2 = vis_fpack16_to_hi(green2, temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

			blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

			red2 = vis_fpack16_to_hi(red2, temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

			green2 = vis_fpack16_to_lo(green2, temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

			blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
			red2 = vis_fpack16_to_lo(red2, temp_r_lo);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

			ddp1 += 3;
			ddp2 += 3;
		}

/*
 * tail: the remaining width - i (< 8) pixels are written one at a time,
 * last pixel first, by the STORE_PIXEL macros
 */
		dp1 = (mlib_u8 *)ddp1;
		dp2 = (mlib_u8 *)ddp2;

		vis_alignaddr((void *)(width - i), 0);
		blue1 = vis_faligndata(blue1, blue1);
		green1 = vis_faligndata(green1, green1);
		red1 = vis_faligndata(red1, red1);
		dp1 += ((width - i - 1) * 3);

		blue2 = vis_faligndata(blue2, blue2);
		green2 = vis_faligndata(green2, green2);
		red2 = vis_faligndata(red2, red2);
		dp2 += ((width - i - 1) * 3);

		vis_alignaddr((void *)7, 0);
		for (; i < width; i++) {
			STORE_PIXEL1(0, 1, 2);
			STORE_PIXEL2(0, 1, 2);
			dp1 -= 3;
			dp2 -= 3;
		}

		sp11 = sl11 = sl11 + y_stride2;
		sp12 = sl12 = sl12 + y_stride2;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

/* copy the two finished rows from scratch to the destination */
		__mlib_VectorCopy_U8(dl1, (mlib_u8 *)buf1, width * 3);
		__mlib_VectorCopy_U8(dl2, (mlib_u8 *)buf2, width * 3);

/* advance the destination rows and rewind the scratch pointers */
		dl1 += rgb_stride2;
		dl2 += rgb_stride2;
		dp1 = (mlib_u8 *)buf1;
		dp2 = (mlib_u8 *)buf2;
		ddp1 = (mlib_d64 *)dp1;
		ddp2 = (mlib_d64 *)dp2;
	}

	if (width * 3 > 16 * 1024) {
		__mlib_free(tmp1);
		__mlib_free(tmp2);
	}
	return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoColorJFIFYCC2RGB420_Nearest
 *
 * Converts two rows of JFIF YCbCr 4:2:0 data to two rows of interleaved
 * 3-byte RGB pixels.  Each chroma sample is duplicated horizontally
 * (vis_fpmerge(x, x)) and the same chroma values are used for both rows.
 *
 *   rgb0, rgb1  destination rows
 *   y0, y1      luma rows
 *   cb, cr      chroma rows
 *   n           number of pixels per row
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): rgb0/rgb1 and y0/y1 are accessed through mlib_d64 pointers
 * and cb/cr through mlib_f32 pointers without any alignment fix-up, so
 * callers presumably must pass suitably aligned pointers -- confirm against
 * the API documentation.
 *
 * The u_3920 / v_8132 / u_20184 / v_15966 / y_11644 variable names do not
 * match the factors used here; the factors actually applied are the ones
 * held in k12, k34 and k5.  The names are left alone because the
 * STORE_PIXEL macros and the rest of the file may rely on this naming
 * pattern.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB420_Nearest(
	mlib_u8 *rgb0,
	mlib_u8 *rgb1,
	const mlib_u8 *y0,
	const mlib_u8 *y1,
	const mlib_u8 *cb,
	const mlib_u8 *cr,
	mlib_s32 n)
{
/* pointers to dst address */
	mlib_u8 *dp1, *dp2;

/* 8-byte read pointers for y0 and y1 */
	mlib_d64 *spy1, *spy2;

/* 4-byte read pointers for cb and cr */
	mlib_f32 *dfu, *dfv;

/* cb, cr data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy1, dy2;
	mlib_d64 du, dv;

/* (1.00000, 1.40200)*8192 */
	mlib_f32 k12 = vis_to_float(0x20002cdd);

/* (-.34414, -.71414)*8192 */
	mlib_f32 k34 = vis_to_float(0xf4fde926);

/* 1.77200*8192 */
	mlib_f32 k5 = vis_to_float(0x10038b4);

/* (179.45600 - 0.5)*32 */
	mlib_d64 k_179_456 = vis_to_double(0x165f165f, 0x165f165f);

/* (135.45984 + 0.5)*32 */
	mlib_d64 k_135_45984 = vis_to_double(0x10ff10ff, 0x10ff10ff);

/* (226.81600 - 0.5)*32 */
	mlib_d64 k_226_816 = vis_to_double(0x1c4a1c4a, 0x1c4a1c4a);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 z_11644_hi, z_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
/* loop variable */
	mlib_s32 i;
	mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21;
	mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22;

	if (n <= 0)
		return (MLIB_FAILURE);

/*
 * initialize GSR scale factor
 */
	vis_write_gsr((2 << 3) + 7);

	dp1 = (mlib_u8 *)rgb0;
	dp2 = (mlib_u8 *)rgb1;
	ddp1 = (mlib_d64 *)dp1;
	ddp2 = (mlib_d64 *)dp2;

	spy1 = (mlib_d64 *)y0;
	spy2 = (mlib_d64 *)y1;
	dfu = (mlib_f32 *)cb;
	dfv = (mlib_f32 *)cr;

/*
 * preamble: compute the first group of 8 pixels per row so that the
 * main loop can start by storing already finished data
 */
	fu = vis_ld_f32_nf(dfu);
	dfu++;
	fv = vis_ld_f32_nf(dfv);
	dfv++;

/* duplicate every chroma byte */
	du = vis_fpmerge(fu, fu);
	dv = vis_fpmerge(fv, fv);

/* Cb*(-0.34414) */
	u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* Cr*(-0.71414) */
	v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* Cb*(-0.34414) */
	u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* Cr*(-0.71414) */
	v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

	dy1 = vis_ld_d64_nf(spy1);
	spy1++;
	dy2 = vis_ld_d64_nf(spy2);
	spy2++;

/* Cb*1.77200 */
	u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
	g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

	u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
	g_hi = vis_fpadd16(g_hi, k_135_45984);

/* Cr*1.40200 */
	v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
	g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

	v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
	g_lo = vis_fpadd16(g_lo, k_135_45984);

/* Y*1.00000 (row y0) */
	y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
	b_hi = vis_fpsub16(u_20184_hi, k_226_816);

/* Y*1.00000 (row y0) */
	y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
	b_lo = vis_fpsub16(u_20184_lo, k_226_816);

/* Y*1.00000 (row y1) */
	z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
	r_hi = vis_fpsub16(v_15966_hi, k_179_456);

/* Y*1.00000 (row y1) */
	z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
	r_lo = vis_fpsub16(v_15966_lo, k_179_456);

	temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
	temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

	green1 = vis_fpack16_to_hi(green1, temp_g_hi);
	temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

	blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
	temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
	fu = vis_ld_f32_nf(dfu);
	dfu++;

	red1 = vis_fpack16_to_hi(red1, temp_r_hi);
	temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
	fv = vis_ld_f32_nf(dfv);
	dfv++;

	green1 = vis_fpack16_to_lo(green1, temp_g_lo);
	temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

	blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
	du = vis_fpmerge(fu, fu);

	red1 = vis_fpack16_to_lo(red1, temp_r_lo);
	dv = vis_fpmerge(fv, fv);

/* Cb*(-0.34414) */
	u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
	temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* Cr*(-0.71414) */
	v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
	temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

	green2 = vis_fpack16_to_hi(green2, temp_g_hi);
	temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

	blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
	temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

	red2 = vis_fpack16_to_hi(red2, temp_r_hi);
	temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

	green2 = vis_fpack16_to_lo(green2, temp_g_lo);
	temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

	blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
	red2 = vis_fpack16_to_lo(red2, temp_r_lo);

	u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
	v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

	dy1 = vis_ld_d64_nf(spy1);
	spy1++;
	dy2 = vis_ld_d64_nf(spy2);
	spy2++;

/*
 * column loop: 8 pixels of each of the two rows per iteration; stores
 * the group computed by the previous step, then computes the next one
 */
#pragma pipeloop(0)
	for (i = 0; i <= n - 8; i += 8) {

/* interleave red and green, then merge in blue: 3 x 8 bytes per row */
		vis_write_bmask(0x0801902A, 0);
		dd01 = vis_bshuffle(red1, green1);
		dd02 = vis_bshuffle(red2, green2);
		vis_write_bmask(0x03B04C05, 0);
		dd11 = vis_bshuffle(red1, green1);
		dd12 = vis_bshuffle(red2, green2);
		vis_write_bmask(0xD06E07F0, 0);
		dd21 = vis_bshuffle(red1, green1);
		dd22 = vis_bshuffle(red2, green2);
		vis_write_bmask(0x01834967, 0);
		ddp1[0] = vis_bshuffle(dd01, blue1);
		ddp2[0] = vis_bshuffle(dd02, blue2);
		vis_write_bmask(0xA12B45C7, 0);
		ddp1[1] = vis_bshuffle(dd11, blue1);
		ddp2[1] = vis_bshuffle(dd12, blue2);
		vis_write_bmask(0x0D23E56F, 0);
		ddp1[2] = vis_bshuffle(dd21, blue1);
		ddp2[2] = vis_bshuffle(dd22, blue2);

/* Cb*1.77200 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_45984);

/* Cr*1.40200 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_45984);

/* Y*1.00000 (row y0) */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_226_816);

/* Y*1.00000 (row y0) */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_226_816);

/* Y*1.00000 (row y1) */
		z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
		r_hi = vis_fpsub16(v_15966_hi, k_179_456);

/* Y*1.00000 (row y1) */
		z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
		r_lo = vis_fpsub16(v_15966_lo, k_179_456);

		temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
		temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

		green1 = vis_fpack16_to_hi(green1, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

		blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
		fu = vis_ld_f32_nf(dfu);
		dfu++;

		red1 = vis_fpack16_to_hi(red1, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
		fv = vis_ld_f32_nf(dfv);
		dfv++;

		green1 = vis_fpack16_to_lo(green1, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

		blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
		du = vis_fpmerge(fu, fu);

		red1 = vis_fpack16_to_lo(red1, temp_r_lo);
		dv = vis_fpmerge(fv, fv);

/* Cb*(-0.34414) */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* Cr*(-0.71414) */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

		green2 = vis_fpack16_to_hi(green2, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

		blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

		red2 = vis_fpack16_to_hi(red2, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

		green2 = vis_fpack16_to_lo(green2, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

		blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
		red2 = vis_fpack16_to_lo(red2, temp_r_lo);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		dy1 = vis_ld_d64_nf(spy1);
		spy1++;
		dy2 = vis_ld_d64_nf(spy2);
		spy2++;

		ddp1 += 3;
		ddp2 += 3;
	}

/*
 * tail: the remaining n - i (< 8) pixels are written one at a time,
 * last pixel first, by the STORE_PIXEL macros
 */
	dp1 = (mlib_u8 *)ddp1;
	dp2 = (mlib_u8 *)ddp2;

	vis_alignaddr((void *)(n - i), 0);
	blue1 = vis_faligndata(blue1, blue1);
	green1 = vis_faligndata(green1, green1);
	red1 = vis_faligndata(red1, red1);
	dp1 += ((n - i - 1) * 3);

	blue2 = vis_faligndata(blue2, blue2);
	green2 = vis_faligndata(green2, green2);
	red2 = vis_faligndata(red2, red2);
	dp2 += ((n - i - 1) * 3);

	vis_alignaddr((void *)7, 0);
	for (; i < n; i++) {
		STORE_PIXEL1(0, 1, 2);
		STORE_PIXEL2(0, 1, 2);
		dp1 -= 3;
		dp2 -= 3;
	}

	return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoH263OverlappedMC_S16_U8
 *
 * Builds mc_block as a weighted combination of five regions of ref_frame,
 * each located by its own motion vector:
 *
 *   (mch, mcv)  central vector -- initialises all 16 accumulators (ACCSET)
 *   (mlh, mlv)  left vector    -- added to d0, d2, ..., d14 (ACCADD)
 *   (mrh, mrv)  right vector   -- added to d1, d3, ..., d15 (ACCADD);
 *                                 the source address is offset by +8 bytes
 *   (mah, mav)  above vector   -- combined into d0..d7 and stored to
 *                                 dp[0..7] (ACCPUT)
 *   (mbh, mbv)  below vector   -- combined into d8..d15 and stored to
 *                                 dp[8..15] (ACCPUT); the source address
 *                                 is offset by +8 rows
 *
 * The per-position weights are the byte constants loaded into the reg_H*
 * registers.  mc_block is written as 16 mlib_d64 values.
 *
 * Always returns MLIB_SUCCESS.
 *
 * NOTE(review): every row step advances the source by 2 * ref_stride bytes
 * and consumes 16 source bytes for two accumulators; confirm against the
 * ACCSET / ACCADD / ACCPUT definitions that this matches the intended
 * sample layout.
 * NOTE(review): dmask, denom and frnd are not referenced directly in this
 * function; presumably the ACC* macros use them by name.
 */
mlib_status
__mlib_VideoH263OverlappedMC_S16_U8(
	mlib_s16 mc_block[64],
	const mlib_u8 *ref_frame,
	mlib_s32 mch,
	mlib_s32 mcv,
	mlib_s32 mah,
	mlib_s32 mav,
	mlib_s32 mbh,
	mlib_s32 mbv,
	mlib_s32 mlh,
	mlib_s32 mlv,
	mlib_s32 mrh,
	mlib_s32 mrv,
	mlib_s32 ref_stride)
{
/* accumulators: two per output row, eight rows */
	mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9;
	mlib_d64 d10, d11, d12, d13, d14, d15;
	mlib_d64 tmp1, tmp2, tmp3;
	mlib_d64 dmask = vis_fexpand(vis_fones());
	mlib_d64 denom = vis_fandnot(dmask, vis_fpadd16(dmask, dmask));
/* weight registers: H0 = central, H1 = above/below, H2 = left/right */
	mlib_f32 reg_H0_00, reg_H0_01, reg_H0_10, reg_H0_20, reg_H0_21;
	mlib_f32 reg_H1_00, reg_H1_10, reg_H1_11, reg_H1_20, reg_H2_00;
	mlib_f32 reg_H2_01, reg_H2_10, reg_H2_11;
	mlib_f32 frnd;
	mlib_d64 *dp, *sd;
	const mlib_u8 *sp1, *sp2, *sp3, *sp4, *sp5;
	mlib_s32 ref_stride2 = ref_stride << 1, off;

/* source addresses of the five regions */
	sp1 = (ref_frame + mch + mcv * ref_stride);
	sp2 = (ref_frame + mah + mav * ref_stride);
	sp3 = (ref_frame + mlh + mlv * ref_stride);
	sp4 = (ref_frame + mrh + 8 + mrv * ref_stride);
	sp5 = (ref_frame + mbh + (mbv + 8) * ref_stride);
	dp = (mlib_d64 *)mc_block;

	reg_H0_00 = vis_to_float(0x40505050);
	reg_H0_01 = vis_to_float(0x50505040);
	reg_H0_10 = vis_to_float(0x50505050);
	reg_H0_20 = vis_to_float(0x50506060);
	reg_H0_21 = vis_to_float(0x60605050);

	frnd = vis_to_float(0x20202020);

/*
 * central: eight rows, two accumulators per row; unaligned rows are
 * extracted from three aligned words with vis_faligndata
 */
	sd = (mlib_d64 *)vis_alignaddr((void *)sp1, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d0, tmp1, reg_H0_00);
	ACCSET(d1, tmp2, reg_H0_01);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d2, tmp1, reg_H0_10);
	ACCSET(d3, tmp2, reg_H0_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d4, tmp1, reg_H0_20);
	ACCSET(d5, tmp2, reg_H0_21);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d6, tmp1, reg_H0_20);
	ACCSET(d7, tmp2, reg_H0_21);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d8, tmp1, reg_H0_20);
	ACCSET(d9, tmp2, reg_H0_21);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d10, tmp1, reg_H0_20);
	ACCSET(d11, tmp2, reg_H0_21);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d12, tmp1, reg_H0_10);
	ACCSET(d13, tmp2, reg_H0_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

/* last row: the trailing word is fetched with vis_ld_d64_nf */
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = vis_ld_d64_nf(sd + 2);
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d14, tmp1, reg_H0_00);
	ACCSET(d15, tmp2, reg_H0_01);

/*
 * left: contributes to the first accumulator of every row
 */
	reg_H2_00 = vis_to_float(0x20101010);
	reg_H2_01 = vis_to_float(0x10101020);
	reg_H2_10 = vis_to_float(0x20201010);
	reg_H2_11 = vis_to_float(0x10102020);

/* byte misalignment of sp3; each bmask nibble becomes off + position, */
/* so vis_bshuffle picks 8 consecutive bytes starting at offset off */
	off = (mlib_addr)sp3 & 7;
	sd = (mlib_d64 *)((mlib_u8 *)sp3 - off);
	vis_write_bmask(0x11111111 * off + 0x01234567, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d0, tmp1, reg_H2_00);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d2, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d4, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d6, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d8, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d10, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d12, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = vis_ld_d64_nf(sd + 1);
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d14, tmp1, reg_H2_00);

/*
 * right: contributes to the second accumulator of every row
 */
	sd = (mlib_d64 *)vis_alignaddr((void *)sp4, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d1, tmp1, reg_H2_01);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d3, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d5, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d7, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d9, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d11, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d13, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = vis_ld_d64_nf(sd + 1);
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d15, tmp1, reg_H2_01);

/*
 * above: completes the first four rows and stores them to dp[0..7]
 */
	reg_H1_10 = vis_to_float(0x10102020);
	reg_H1_11 = vis_to_float(0x20201010);
	reg_H1_20 = vis_to_float(0x10101010);

	off = (mlib_addr)sp2 & 7;
	sd = (mlib_d64 *)((mlib_u8 *)sp2 - off);
	vis_write_bmask(0x11111111 * off + 0x01234567, 0);
	reg_H1_00 = vis_to_float(0x20202020);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[0], d0, tmp1, reg_H1_00);
	ACCPUT(dp[1], d1, tmp2, reg_H1_00);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[2], d2, tmp1, reg_H1_10);
	ACCPUT(dp[3], d3, tmp2, reg_H1_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[4], d4, tmp1, reg_H1_20);
	ACCPUT(dp[5], d5, tmp2, reg_H1_20);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = vis_ld_d64_nf(sd + 2);
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[6], d6, tmp1, reg_H1_20);
	ACCPUT(dp[7], d7, tmp2, reg_H1_20);

/*
 * below: completes the last four rows and stores them to dp[8..15]
 */
	sd = (mlib_d64 *)vis_alignaddr((void *)sp5, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[8], d8, tmp1, reg_H1_20);
	ACCPUT(dp[9], d9, tmp2, reg_H1_20);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[10], d10, tmp1, reg_H1_20);
	ACCPUT(dp[11], d11, tmp2, reg_H1_20);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[12], d12, tmp1, reg_H1_10);
	ACCPUT(dp[13], d13, tmp2, reg_H1_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = vis_ld_d64_nf(sd + 2);
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[14], d14, tmp1, reg_H1_00);
	ACCPUT(dp[15], d15, tmp2, reg_H1_00);

	return (MLIB_SUCCESS);
}
Пример #20
0
/*
 * mlib_v_VideoYUV2ABGR_aarray_411
 *
 * Converts one line of planar Y/U/V data plus a per-pixel alpha array into
 * packed 32-bit pixels.  The byte order produced by the final vis_fpmerge
 * cascade is (alpha, "b", "g", "r"); when isrgb is non-zero the U-side and
 * V-side coefficient/offset sets are exchanged, which swaps the roles of the
 * computed "b" and "r" channels.
 *
 * Parameters (as used by the code below):
 *   abgr    - destination; written as 64-bit words when 8-byte aligned,
 *             otherwise as 32-bit words
 *   y       - luma, read two 64-bit words (16 bytes) per iteration
 *   u, v    - chroma, read one 32-bit word (4 bytes) each per iteration
 *   a_array - alpha bytes, realigned with vis_alignaddr/vis_faligndata and
 *             read two 64-bit words per iteration
 *   count   - number of full iterations; each one stores 64 bytes (16 pixels)
 *   left    - number of trailing 32-bit pixels (copied out of a local buffer
 *             of 16 pixels after the main loop)
 *   isrgb   - selects the swapped coefficient set (see above)
 *
 * The loads are software-pipelined: the first group is loaded before the
 * loop, and each iteration loads the data for the next one.  Those look-ahead
 * loads use the vis_ld_*_nf helpers (presumably non-faulting loads, so that
 * reading one group past the end is harmless - confirm against vis_proto.h).
 */
static void
mlib_v_VideoYUV2ABGR_aarray_411(
	mlib_u32 *abgr,
	const mlib_d64 *y,
	const mlib_f32 *u,
	const mlib_f32 *v,
	const mlib_d64 *a_array,
	mlib_s32 count,
	mlib_s32 left,
	mlib_s32 isrgb)
{
/* destination viewed as 64-bit words */
	mlib_d64 *dpp = (mlib_d64 *)abgr;

/* u, v data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy1, dy2;
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
	mlib_d64 *dpa, da0, da1, da2, da3, da4;
	mlib_d64 dtmp;

/* 1.1644  * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);

/* 2.0184  * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);

/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);

/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);

/* 1.5966  * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);

/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

/* 135.6352  * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
/* multiplier used with vis_fmul8x16au/al to spread one chroma term per call */
	mlib_f32 fscale = vis_to_float(0x80808080);

/* loop variables */
	mlib_s32 i;

/*
 * isrgb: exchange the U-side and V-side constants (f1 <-> f8, f4 <-> f5,
 * doff0 <-> doff2).  f0 and doff1 are re-assigned to the values they
 * already hold.
 */
	if (isrgb) {
		f0 = vis_to_float(0x12a1);
		f1 = vis_to_float(0x3317);
		f4 = vis_to_float(0xe5fa);
		f5 = vis_to_float(0xf375);
		f8 = vis_to_float(0x4097);
		doff0 = vis_to_double_dup(0xe420e420);
		doff1 = vis_to_double_dup(0x10f410f4);
		doff2 = vis_to_double_dup(0xdd60dd60);
	}

/* sets the GSR alignment used by every vis_faligndata() below */
	dpa = vis_alignaddr((void *)a_array, 0);

/*
 * Prologue of the software pipeline: load the first group.  After this the
 * pointers y, u, v and dpa address the second group, which is why the
 * in-loop loads index them with i rather than i + 1.
 */
	dy1 = (*y++);
	dy2 = vis_ld_d64_nf((mlib_d64 *)y); y++;
	fu = (*u++);
	fv = (*v++);
	da2 = (*dpa++);
	da3 = vis_ld_d64_nf(dpa); dpa++;
	da4 = vis_ld_d64_nf(dpa); dpa++;

/* chroma contributions for the first group */
	du0 = vis_fmul8x16al(fu, f1);
	du1 = vis_fmul8x16al(fu, f4);
	dv1 = vis_fmul8x16al(fv, f5);
	dv2 = vis_fmul8x16al(fv, f8);

	if (!((mlib_addr)abgr & 7)) {
/* destination is 8-byte aligned: store whole 64-bit words */
#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
/* 16 realigned alpha bytes for this group */
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

/* scaled luma, four 16-bit values per register */
			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

/* per-chroma-sample terms for the three colour channels */
			db = vis_fpadd16(du0, doff0);

			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dr = vis_fpadd16(dv2, doff2);

/*
 * Each of the four chroma terms is combined with one register of four luma
 * values (au/al pick the upper/lower 16-bit element of the selected half),
 * i.e. one chroma sample is shared by four consecutive pixels.
 */
			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

/* pack the 16-bit results to bytes: 8 bytes per register, 16 per channel */
			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

/* two-level byte merge: (a,g) with (b,r) yields a,b,g,r per pixel */
			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

/* look-ahead loads for the next iteration */
			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

			dpp[8 * i] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 1] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dpp[8 * i + 2] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 3] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dpp[8 * i + 4] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 5] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dpp[8 * i + 6] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 7] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* chroma contributions for the next iteration */
			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	} else {
/*
 * destination is not 8-byte aligned: same computation as above, but every
 * 64-bit result is written as two 32-bit halves
 */
		mlib_d64 dd;

#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

			db = vis_fpadd16(du0, doff0);

			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dr = vis_fpadd16(dv2, doff2);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

/* look-ahead loads for the next iteration */
			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 1] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 2] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 3] = vis_read_lo(dd);

			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 4] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 5] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 6] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 7] = vis_read_lo(dd);

			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 8] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 9] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 10] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 11] = vis_read_lo(dd);

			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 12] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 13] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 14] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 15] = vis_read_lo(dd);

			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	}

/*
 * Tail: the registers already hold the group following the last full
 * iteration.  Convert it into a local 16-pixel buffer and copy only the
 * first `left` 32-bit pixels to the destination.
 */
	if (left) {
		mlib_d64 res_buf[8];

		da0 = vis_faligndata(da2, da3);
		da1 = vis_faligndata(da3, da4);

		ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
		ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

		ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
		ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

		db = vis_fpadd16(du0, doff0);

		dtmp = vis_fpadd16(du1, dv1);
		dg = vis_fpadd16(dtmp, doff1);

		dr = vis_fpadd16(dv2, doff2);

		db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
		db1 = vis_fpadd16(ddy1, db1);

		db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
		db2 = vis_fpadd16(ddy2, db2);

		db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
		db3 = vis_fpadd16(ddy3, db3);

		db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
		db4 = vis_fpadd16(ddy4, db4);

		dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
		dg1 = vis_fpadd16(ddy1, dg1);

		dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
		dg2 = vis_fpadd16(ddy2, dg2);

		dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
		dg3 = vis_fpadd16(ddy3, dg3);

		dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
		dg4 = vis_fpadd16(ddy4, dg4);

		dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
		dr1 = vis_fpadd16(ddy1, dr1);

		dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
		dr2 = vis_fpadd16(ddy2, dr2);

		dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
		dr3 = vis_fpadd16(ddy3, dr3);

		dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
		dr4 = vis_fpadd16(ddy4, dr4);

		dr = vis_fpack16_pair(dr1, dr2);
		dr1 = vis_fpack16_pair(dr3, dr4);

		dg = vis_fpack16_pair(dg1, dg2);
		dg1 = vis_fpack16_pair(dg3, dg4);

		db = vis_fpack16_pair(db1, db2);
		db1 = vis_fpack16_pair(db3, db4);

		dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
		dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

		res_buf[0] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[1] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
		dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

		res_buf[2] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[3] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
		dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

		res_buf[4] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[5] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
		dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

		res_buf[6] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[7] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* 32-bit copies, so this works for either destination alignment */
		for (i = 0; i < left; i++)
			((mlib_f32 *)dpp)[16 * count + i] =
				((mlib_f32 *)res_buf)[i];
	}
}
Пример #21
0
/*
 * __mlib_VideoIDCT8x8_S16_S16_B12
 *
 * Two-pass 8x8 inverse DCT on 16-bit data: `coeffs` is the input, `block`
 * receives the 16 64-bit output words (8 rows of two words each).
 * Always returns MLIB_SUCCESS.
 *
 * The heavy lifting is done by the LOAD_DATA_AA1, LOAD_DATA_AA2,
 * TRANSPOSE_VIS2 and IDCT macros, which are defined outside this block.
 * NOTE(review): dPtr, t0, t1 and dx0..dx8 are never passed to a macro
 * explicitly, so the macros presumably reference them by name - do not
 * rename or remove these locals without checking the macro definitions.
 * NOTE(review): both pointers are accessed as mlib_d64, so 8-byte alignment
 * of `block` and `coeffs` is presumably required ("AA" in the macro names) -
 * confirm with the caller.
 */
mlib_status
__mlib_VideoIDCT8x8_S16_S16_B12(
	mlib_s16 *block,
	const mlib_s16 *coeffs)
{
	mlib_d64 *dPtr = (mlib_d64 *)coeffs;
	mlib_d64 *outPtr = (mlib_d64 *)block;
	mlib_d64 dx0, dx1, dx2, dx3, dx4, dx6, dx7, dx8;
	mlib_d64 p00, p10, p20, p30, p40, p50, p60, p70,
		p01, p11, p21, p31, p41, p51, p61, p71;
	mlib_d64 t0, t1;
	mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7;

/* multiplier constants, loaded from mlib_cTable below */
	mlib_f32 COS_1_16;
	mlib_f32 COS_2_16;
	mlib_f32 COS_6_16;
	mlib_f32 COS_7_16;
	mlib_f32 COS_4_16;
	mlib_f32 C_1_4;

/* First pass */

/* byte-shuffle mask, presumably consumed by TRANSPOSE_VIS2 via vis_bshuffle */
	vis_write_bmask(0x018923ab, 0x0);

/* LOAD_DATA_AA1 is a macro statement; the assignment after it is separate */
	LOAD_DATA_AA1 COS_1_16 = ((mlib_f32 *)mlib_cTable)[0];
	COS_2_16 = ((mlib_f32 *)mlib_cTable)[1];
	COS_6_16 = ((mlib_f32 *)mlib_cTable)[2];
	COS_7_16 = ((mlib_f32 *)mlib_cTable)[3];
	COS_4_16 = ((mlib_f32 *)mlib_cTable)[4];
	C_1_4 = ((mlib_f32 *)mlib_cTable)[5];

/*
 * The macro invocations below carry no trailing semicolons; the odd
 * indentation is an artefact of that, not of nesting.
 */
	TRANSPOSE_VIS2(p00, p10, p20, p30, d0, d1, d2, d3)
		TRANSPOSE_VIS2(p01, p11, p21, p31, d4, d5, d6, d7)
		LOAD_DATA_AA2 IDCT(d0, d1, d2, d3, d4, d5, d6, d7)
		TRANSPOSE_VIS2(p40, p50, p60, p70, d0, d1, d2, d3)
		p00 = vis_fpadd16(dx7, dx1);
/* final butterfly of the first IDCT: sums then mirrored differences */
	p10 = vis_fpadd16(dx3, dx2);
	p20 = vis_fpadd16(dx0, dx4);
	p30 = vis_fpadd16(dx8, dx6);
	p01 = vis_fpsub16(dx8, dx6);
	p11 = vis_fpsub16(dx0, dx4);
	p21 = vis_fpsub16(dx3, dx2);
	p31 = vis_fpsub16(dx7, dx1);

	TRANSPOSE_VIS2(p41, p51, p61, p71, d4, d5, d6, d7)
		IDCT(d0, d1, d2, d3, d4, d5, d6, d7)
		TRANSPOSE_VIS2(p00, p10, p20, p30, d0, d1, d2, d3)
		p40 = vis_fpadd16(dx7, dx1);
	p50 = vis_fpadd16(dx3, dx2);
	p60 = vis_fpadd16(dx0, dx4);
	p70 = vis_fpadd16(dx8, dx6);
	p41 = vis_fpsub16(dx8, dx6);
	p51 = vis_fpsub16(dx0, dx4);
	p61 = vis_fpsub16(dx3, dx2);
	p71 = vis_fpsub16(dx7, dx1);

/* Second pass */

/* results are scaled by C_1_4 and written to the even output words */
	TRANSPOSE_VIS2(p40, p50, p60, p70, d4, d5, d6, d7)
		IDCT(d0, d1, d2, d3, d4, d5, d6, d7)
		TRANSPOSE_VIS2(p01, p11, p21, p31, d0, d1, d2, d3)
		outPtr[0] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
	outPtr[2] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
	outPtr[4] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
	outPtr[6] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
	outPtr[8] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
	outPtr[10] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
	outPtr[12] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
	outPtr[14] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

/* second half of the second pass fills the odd output words */
	TRANSPOSE_VIS2(p41, p51, p61, p71, d4, d5, d6, d7)
		IDCT(d0, d1, d2, d3, d4, d5, d6, d7)
		outPtr[1] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
	outPtr[3] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
	outPtr[5] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
	outPtr[7] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
	outPtr[9] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
	outPtr[11] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
	outPtr[13] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
	outPtr[15] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

	return (MLIB_SUCCESS);
}
Пример #22
0
/*
 * __mlib_VideoInterpX_S16_U8
 *
 * For every row, combines the 8-bit samples starting at ref_block with the
 * samples starting at ref_block + 1 (the horizontal neighbours) and stores
 * the result as 16-bit values in mc_block.
 *
 *   mc_block     - destination, written as 64-bit words: 2 per row when
 *                  width == 8, otherwise 4 per row
 *   ref_block    - source bytes; any alignment (rows are realigned with
 *                  vis_alignaddr / vis_faligndata)
 *   width        - 8 selects the narrow path; every other value is treated
 *                  as 16
 *   height       - number of rows
 *   frame_stride - not used by this routine
 *   field_stride - byte distance between consecutive source rows
 *
 * Works in two passes because the two source positions need different GSR
 * alignment settings: pass 1 caches the realigned rows at ref_block in a
 * local buffer, pass 2 realigns the rows at ref_block + 1 and combines.
 * The local buffer holds MAXH * MAXW words; height is not checked against it.
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoInterpX_S16_U8(
	mlib_s16 *mc_block,
	const mlib_u8 *ref_block,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 frame_stride,
	mlib_s32 field_stride)
{
	mlib_d64 row_cache[MAXH * MAXW];
	mlib_d64 *out = (mlib_d64 *)mc_block;
	mlib_d64 *src;
	mlib_d64 w0, w1, w2;
	mlib_d64 left, right, right2;
	mlib_d64 sum_hi, sum_lo;
/* both multipliers come from the shared mlib_IX16const table */
	mlib_f32 mul_out = vis_read_hi(*(mlib_d64 *)mlib_IX16const);
	mlib_f32 mul_lo = vis_read_lo(*(mlib_d64 *)mlib_IX16const);
	mlib_s32 row;

/* pass 1 alignment: rows starting at ref_block */
	src = (mlib_d64 *)vis_alignaddr((void *)ref_block, 0);

	if (width == 8) {
#pragma pipeloop(0)
		for (row = 0; row < height; row++) {
			w0 = src[0];
			w1 = src[1];
			row_cache[row] = vis_faligndata(w0, w1);
			src = (mlib_d64 *)((mlib_u8 *)src + field_stride);
		}

/* pass 2 alignment: rows starting one byte further right */
		src = (mlib_d64 *)vis_alignaddr((void *)(ref_block + 1), 0);

#pragma pipeloop(0)
		for (row = 0; row < height; row++) {
			w0 = src[0];
			w1 = vis_ld_d64_nf(src + 1);
			right = vis_faligndata(w0, w1);
			left = row_cache[row];

/* widen both neighbours to 16 bit and add them */
			sum_hi = vis_fpadd16(
				vis_fexpand(vis_read_hi(left)),
				vis_fexpand(vis_read_hi(right)));
			sum_lo = vis_fpadd16(
				vis_fmul8x16al(vis_read_lo(left), mul_lo),
				vis_fmul8x16al(vis_read_lo(right), mul_lo));

			out[0] = vis_fmul8x16(mul_out, sum_hi);
			out[1] = vis_fmul8x16(mul_out, sum_lo);

			src = (mlib_d64 *)((mlib_u8 *)src + field_stride);
			out += 2;
		}

		return (MLIB_SUCCESS);
	}

/* width == 16: two cached words per row */

#pragma pipeloop(0)
	for (row = 0; row < height; row++) {
		w0 = src[0];
		w1 = src[1];
		w2 = src[2];
		row_cache[2 * row] = vis_faligndata(w0, w1);
		row_cache[2 * row + 1] = vis_faligndata(w1, w2);
		src = (mlib_d64 *)((mlib_u8 *)src + field_stride);
	}

/* pass 2 alignment: rows starting one byte further right */
	src = (mlib_d64 *)vis_alignaddr((void *)(ref_block + 1), 0);

#pragma pipeloop(0)
	for (row = 0; row < height; row++) {
		w0 = src[0];
		w1 = src[1];
		w2 = vis_ld_d64_nf(src + 2);
		right = vis_faligndata(w0, w1);
		right2 = vis_faligndata(w1, w2);

/* first 8 samples of the row */
		left = row_cache[2 * row];
		sum_hi = vis_fpadd16(
			vis_fexpand(vis_read_hi(left)),
			vis_fexpand(vis_read_hi(right)));
		sum_lo = vis_fpadd16(
			vis_fmul8x16al(vis_read_lo(left), mul_lo),
			vis_fmul8x16al(vis_read_lo(right), mul_lo));

		out[0] = vis_fmul8x16(mul_out, sum_hi);
		out[1] = vis_fmul8x16(mul_out, sum_lo);

/* last 8 samples of the row */
		left = row_cache[2 * row + 1];
		sum_hi = vis_fpadd16(
			vis_fexpand(vis_read_hi(left)),
			vis_fexpand(vis_read_hi(right2)));
		sum_lo = vis_fpadd16(
			vis_fmul8x16al(vis_read_lo(left), mul_lo),
			vis_fmul8x16al(vis_read_lo(right2), mul_lo));

		out[2] = vis_fmul8x16(mul_out, sum_hi);
		out[3] = vis_fmul8x16(mul_out, sum_lo);

		src = (mlib_d64 *)((mlib_u8 *)src + field_stride);
		out += 4;
	}

	return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoColorARGB2JFIFYCC422
 *
 * Converts packed 4-byte ARGB pixels into three separate planes: one y byte
 * per pixel and one cb / cr byte per pair of pixels.
 *
 *   y, cb, cr - destination planes; n bytes are produced for y and n >> 1
 *               bytes for each of cb and cr
 *   argb      - source, read as 64-bit words (two pixels per word)
 *   n         - number of pixels
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): the main loop stores through mlib_d64 / mlib_f32 pointers,
 * so argb and y are presumably required to be 8-byte aligned and cb / cr
 * 4-byte aligned - confirm against the public API documentation.
 * NOTE(review): sd04, sd26, sd15 and sd37 are not referenced directly here;
 * they are presumably scratch variables used by name inside
 * CHANNELSEPARATE_U8_422, so they must stay declared with these names.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
/* one-past-the-end of the y and cb outputs, used for the tail masks */
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;

/* fixed-point weights: first row scaled by 8192, the other two by 4096 */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
/* additive offsets passed to CHANNELWEIGHT_U8_2p (chroma and luma rows) */
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

/* scale factor used by the vis_fpack16() calls below */
	vis_write_gsr(2 << 3);

/* from here on n counts whole groups of 8 pixels */
	n = n >> 3;

#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
/* 4 words = 8 ARGB pixels */
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
/* luma: z0 / z1 are interleaved into 8 y bytes */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

/* cb: the two partial results are summed, giving 4 bytes for 8 pixels */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

/* cr: same scheme with the third weight row */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

		py++;
		pcb++;
		pcr++;
	}

/*
 * Tail: fewer than 8 pixels remain.  A full group is computed (the extra
 * source words are fetched with vis_ld_d64_nf) and only the valid bytes are
 * written, using edge masks and partial stores.
 */
	if ((mlib_u8 *)pcb < cbend) {
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

		sd01 = (*sp++);
		sd23 = vis_ld_d64_nf(sp); sp++;
		sd45 = vis_ld_d64_nf(sp); sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);
		cmask = vis_edge8(pcb, cbend - 1);

/*
 * The partial store works on an 8-byte word.  If the mask selects bytes in
 * the upper half, pcb starts that word; otherwise pcb is the second half of
 * the word, so the data goes in the low half and the store address is the
 * preceding 4-byte slot.
 */
		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1,
				cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1,
				cmask);
		}
	}
	return (MLIB_SUCCESS);
}
Пример #24
0
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 dst_stride,
	mlib_s32 src_stride)
{
	mlib_s32 x, y, x4 = width >> 2;
	mlib_d64 *sl1, *sl2, s1hi, s1lo, s2hi, s2lo, s1, s2;
	mlib_d64 done = vis_to_double_dup(0x1000100);
	mlib_d64 dmask;
	mlib_f32 *dp;
	mlib_f32 frnd = vis_to_float(0x40404040);
	mlib_s32 src_stride2 = 2 * src_stride;

	dmask = vis_fpadd16(done, vis_fone());
	vis_write_gsr(7 << 3);
	sl1 = (mlib_d64 *)src;
	sl2 = (mlib_d64 *)(src + src_stride);
	dp = (mlib_f32 *)dst;

	for (y = 0; y < height; y++) {
#pragma pipeloop(0)
		for (x = 0; x < x4; x++) {
			s1 = sl1[x];
			s2 = sl2[x];
			s1lo = vis_fand(s1, dmask);
			s1hi = vis_fmul8sux16(s1, done);
			s2lo = vis_fand(s2, dmask);
			s2hi = vis_fmul8sux16(s2, done);
			s1lo = vis_fpadd16(s1lo, s2lo);
/*
 * __mlib_VideoColorJFIFYCC2RGB444
 *
 * Converts `size` samples from three separate 8-bit planes (y, cb, cr) into
 * packed 3-byte pixels in `rgb`.
 *
 * The input is consumed in chunks of at most 2 * BUFF_SIZE samples.  Within
 * a chunk, every step reads 4 bytes from each plane and writes 12 output
 * bytes; the last (possibly partial) group of a chunk is written with
 * edge-masked partial stores so that nothing past the chunk's final output
 * byte is touched.
 *
 * Returns MLIB_FAILURE when size <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): the planes are read through mlib_f32 pointers and the output
 * is written through one, so 4-byte alignment of all four pointers is
 * presumably required - confirm against the public API documentation.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
    mlib_f32 zero32 = vis_fzeros();
    mlib_d64 tail_buf[2];
    /* cb_mul* scale the cb bytes, cr_mul* the cr bytes, one per output word */
    mlib_d64 cb_mul0 = vis_to_double_dup(0x0000f4fd);
    mlib_d64 cr_mul0 = vis_to_double_dup(0x2cdde926);
    mlib_d64 cb_mul1 = vis_to_double_dup(0xf4fd38b4);
    mlib_d64 cr_mul1 = vis_to_double_dup(0xe9260000);
    mlib_d64 cb_mul2 = vis_to_double_dup(0x38b40000);
    mlib_d64 cr_mul2 = vis_to_double_dup(0x00002cdd);
    /* additive constants, one per output word */
    mlib_d64 bias0 = vis_to_double_dup(0xe9a110ff);
    mlib_d64 bias1 = vis_to_double_dup(0x10ffe3b6);
    mlib_d64 bias2 = vis_to_double_dup(0xe3b6e9a1);
    /* multiplier applied to the y bytes */
    mlib_d64 y_mul = vis_to_double_dup(0x20002000);

    if (size <= 0)
        return (MLIB_FAILURE);

    /* pack scale / align offset and byte-shuffle pattern used below */
    vis_write_gsr((2 << 3) + 2);
    vis_write_bmask(0x0489AB37, 0);

    /* size is positive here, so the body runs at least once */
    while (size != 0) {
        mlib_s32 chunk = (size > 2 * BUFF_SIZE) ? 2 * BUFF_SIZE : size;
        /* full 4-sample groups; the final group always goes to the tail */
        mlib_s32 groups = (chunk - 1) >> 2;
        mlib_f32 *src_y = (mlib_f32 *)y;
        mlib_f32 *src_cb = (mlib_f32 *)cb;
        mlib_f32 *src_cr = (mlib_f32 *)cr;
        mlib_f32 *dst = (mlib_f32 *)rgb;
        /* last output byte that belongs to this chunk */
        mlib_u8 *last = rgb + 3 * chunk - 1;
        mlib_s32 g;

#pragma pipeloop(0)
#pragma unroll(4)
        for (g = 0; g < groups; g++) {
            mlib_d64 lum, acc0, acc1, acc2, cr0, cr1, cr2;
            mlib_d64 packed, shuffled;
            mlib_f32 vy, vcb, vcr;

            vy = (*src_y++);
            vcb = (*src_cb++);
            vcr = (*src_cr++);

            lum = vis_fmul8x16(vy, y_mul);

            /* y term plus cb term */
            acc0 = vis_fpadd16(lum, vis_fmul8x16(vcb, cb_mul0));
            acc1 = vis_fpadd16(lum, vis_fmul8x16(vcb, cb_mul1));
            acc2 = vis_fpadd16(lum, vis_fmul8x16(vcb, cb_mul2));

            /* cr term plus constant */
            cr0 = vis_fpadd16(vis_fmul8x16(vcr, cr_mul0), bias0);
            cr1 = vis_fpadd16(vis_fmul8x16(vcr, cr_mul1), bias1);
            cr2 = vis_fpadd16(vis_fmul8x16(vcr, cr_mul2), bias2);

            acc0 = vis_fpadd16(acc0, cr0);
            acc1 = vis_fpadd16(acc1, cr1);
            acc2 = vis_fpadd16(acc2, cr2);

            /* pack to bytes and reorder into three output words */
            packed = vis_fpack16_pair(acc0, acc1);
            acc2 = vis_freg_pair(vis_fpack16(acc2), zero32);

            shuffled = vis_bshuffle(packed, acc2);
            packed = vis_fpack32(packed, packed);
            packed = vis_fpmerge(vis_read_hi(packed),
                                 vis_read_lo(packed));

            dst[0] = vis_read_hi(shuffled);
            dst[1] = vis_read_hi(packed);
            dst[2] = vis_read_lo(shuffled);

            dst += 3;
        }

        /*
         * last group of the chunk: computed in full, stored under a mask
         */
        if ((mlib_u8 *)dst <= last) {
            mlib_d64 lum, acc0, acc1, acc2, cr0, cr1, cr2;
            mlib_d64 packed, third, shuffled;
            mlib_f32 vy, vcb, vcr;
            mlib_f32 *tail32 = (mlib_f32 *)tail_buf;
            mlib_s32 mask;

            vy = *src_y;
            vcb = *src_cb;
            vcr = *src_cr;

            lum = vis_fmul8x16(vy, y_mul);

            acc0 = vis_fpadd16(lum, vis_fmul8x16(vcb, cb_mul0));
            acc1 = vis_fpadd16(lum, vis_fmul8x16(vcb, cb_mul1));
            acc2 = vis_fpadd16(lum, vis_fmul8x16(vcb, cb_mul2));

            cr0 = vis_fpadd16(vis_fmul8x16(vcr, cr_mul0), bias0);
            cr1 = vis_fpadd16(vis_fmul8x16(vcr, cr_mul1), bias1);
            cr2 = vis_fpadd16(vis_fmul8x16(vcr, cr_mul2), bias2);

            acc0 = vis_fpadd16(acc0, cr0);
            acc1 = vis_fpadd16(acc1, cr1);
            acc2 = vis_fpadd16(acc2, cr2);

            packed = vis_fpack16_pair(acc0, acc1);
            third = vis_freg_pair(vis_fpack16(acc2), zero32);

            shuffled = vis_bshuffle(packed, third);
            packed = vis_fpack32(packed, packed);
            packed = vis_fpmerge(vis_read_hi(packed),
                                 vis_read_lo(packed));

            /* mask is taken from the real start address ... */
            mask = vis_edge8(dst, last);

            /*
             * ... then, if dst is not 8-byte aligned, step both the store
             * address and the staging slot back/forward by one 32-bit word
             * so the data lines up with the enclosing 8-byte word
             */
            if ((mlib_addr)dst & 7) {
                dst--;
                tail32++;
            }

            tail32[0] = vis_read_hi(shuffled);
            tail32[1] = vis_read_hi(packed);
            tail32[2] = vis_read_lo(shuffled);

            vis_pst_8(tail_buf[0], dst, mask);

            dst += 2;
            mask = vis_edge8(dst, last);

            if ((mlib_u8 *)dst <= last)
                vis_pst_8(tail_buf[1], dst, mask);
        }

        /* advance to the next chunk */
        y += chunk;
        cb += chunk;
        cr += chunk;
        rgb += 3 * chunk;
        size -= chunk;
    }

    return (MLIB_SUCCESS);
}
mlib_status
__mlib_VideoDCT16x16_S16_S16(
	mlib_s16 *coeffs,
	const mlib_s16 *block)
{
	mlib_s32 j;
	mlib_d64 val_m[16 * 4];
	mlib_d64 b0, b1, b2, b3, b4, b5, b6, b7, b8, b9;
	mlib_d64 b10, b11, b12, b13, b14, b15;
	mlib_d64 t0, t1, t2, t3, t4, t5, t6, t7, t9;
	mlib_d64 t10, t11, t12, t13, t14;
	mlib_d64 m02, m13, m0213, p0, p1, p2, p3;

	mlib_d64 c1, c2, c3, c4;

	mlib_f32 COS_4 = ((mlib_f32 *)mlib_dct16vtab)[0];
	mlib_f32 SIN_8 = ((mlib_f32 *)mlib_dct16vtab)[1];
	mlib_f32 COS_8 = ((mlib_f32 *)mlib_dct16vtab)[2];
	mlib_f32 SIN_16 = ((mlib_f32 *)mlib_dct16vtab)[3];
	mlib_f32 COS_16 = ((mlib_f32 *)mlib_dct16vtab)[4];
	mlib_f32 COS_3_16 = ((mlib_f32 *)mlib_dct16vtab)[5];
	mlib_f32 SIN_3_16 = ((mlib_f32 *)mlib_dct16vtab)[6];

	mlib_f32 SIN_32 = ((mlib_f32 *)mlib_dct16vtab)[7];
	mlib_f32 COS_32 = ((mlib_f32 *)mlib_dct16vtab)[8];
	mlib_f32 COS_3_32 = ((mlib_f32 *)mlib_dct16vtab)[9];
	mlib_f32 SIN_3_32 = ((mlib_f32 *)mlib_dct16vtab)[10];
	mlib_f32 COS_5_32 = ((mlib_f32 *)mlib_dct16vtab)[11];
	mlib_f32 SIN_5_32 = ((mlib_f32 *)mlib_dct16vtab)[12];
	mlib_f32 COS_7_32 = ((mlib_f32 *)mlib_dct16vtab)[13];
	mlib_f32 SIN_7_32 = ((mlib_f32 *)mlib_dct16vtab)[14];

	mlib_f32 fscale = ((mlib_f32 *)mlib_dct16vtab)[15];

	mlib_d64 *bptr = (mlib_d64 *)block;
	mlib_d64 *coeffs64 = (mlib_d64 *)coeffs;

/*
 * first column based 1-D 16x16 DCT
 */

#pragma pipeloop(0)
	for (j = 0; j < 4; j++) {

/*
 * first butter-fly
 */
		b0 = vis_fpadd16(bptr[j], bptr[j + 4 * 15]);
		b15 = vis_fpsub16(bptr[j], bptr[j + 4 * 15]);

		b1 = vis_fpadd16(bptr[j + 4 * 1], bptr[j + 4 * 14]);
		b14 = vis_fpsub16(bptr[j + 4 * 1], bptr[j + 4 * 14]);

		b2 = vis_fpadd16(bptr[j + 4 * 2], bptr[j + 4 * 13]);
		b13 = vis_fpsub16(bptr[j + 4 * 2], bptr[j + 4 * 13]);

		b3 = vis_fpadd16(bptr[j + 4 * 3], bptr[j + 4 * 12]);
		b12 = vis_fpsub16(bptr[j + 4 * 3], bptr[j + 4 * 12]);

		b4 = vis_fpadd16(bptr[j + 4 * 4], bptr[j + 4 * 11]);
		b11 = vis_fpsub16(bptr[j + 4 * 4], bptr[j + 4 * 11]);

		b5 = vis_fpadd16(bptr[j + 4 * 5], bptr[j + 4 * 10]);
		b10 = vis_fpsub16(bptr[j + 4 * 5], bptr[j + 4 * 10]);

		b6 = vis_fpadd16(bptr[j + 4 * 6], bptr[j + 4 * 9]);
		b9 = vis_fpsub16(bptr[j + 4 * 6], bptr[j + 4 * 9]);

		b7 = vis_fpadd16(bptr[j + 4 * 7], bptr[j + 4 * 8]);
		b8 = vis_fpsub16(bptr[j + 4 * 7], bptr[j + 4 * 8]);

/*
 * second butter-fly
 */
		t0 = vis_fpadd16(b0, b7);
		t1 = vis_fpadd16(b1, b6);
		t2 = vis_fpadd16(b2, b5);
		t3 = vis_fpadd16(b3, b4);
		t4 = vis_fpsub16(b3, b4);
		t5 = vis_fpsub16(b2, b5);
		t6 = vis_fpsub16(b1, b6);
		t7 = vis_fpsub16(b0, b7);
		c1 = vis_fpsub16(b13, b10);
		c2 = vis_fpsub16(b12, b11);
		c3 = vis_fpadd16(b11, b12);
		c4 = vis_fpadd16(b10, b13);
		t10 = vis_fmul8x16(FCOS_4, c1);
		t11 = vis_fmul8x16(FCOS_4, c2);
		t12 = vis_fmul8x16(FCOS_4, c3);
		t13 = vis_fmul8x16(FCOS_4, c4);

/*
 * third butter-fly
 */
		b0 = vis_fpadd16(t0, t3);
		b1 = vis_fpadd16(t1, t2);
		b2 = vis_fpsub16(t1, t2);
		b3 = vis_fpsub16(t0, t3);
		c1 = vis_fpsub16(t6, t5);
		c2 = vis_fpadd16(t6, t5);
		b5 = vis_fmul8x16(FCOS_4, c1);
		b6 = vis_fmul8x16(FCOS_4, c2);
		b11 = vis_fpsub16(b8, t11);
		b8 = vis_fpadd16(b8, t11);
		b10 = vis_fpsub16(b9, t10);
		b9 = vis_fpadd16(b9, t10);
		b12 = vis_fpsub16(b15, t12);
		b13 = vis_fpsub16(b14, t13);
		b14 = vis_fpadd16(b14, t13);
		b15 = vis_fpadd16(b15, t12);

/*
 * fourth butter-fly
 */
		c1 = vis_fpadd16(b0, b1);
		c2 = vis_fpsub16(b0, b1);
		p0 = vis_fmul8x16(COS_4, c1);
		p2 = vis_fmul8x16(COS_4, c2);
		c1 = vis_fmul8x16(SIN_8, b2);
		c2 = vis_fmul8x16(COS_8, b3);
		c3 = vis_fmul8x16(SIN_8, b3);
		c4 = vis_fmul8x16(COS_8, b2);
		p1 = vis_fpadd16(c1, c2);
		p3 = vis_fpsub16(c3, c4);
		MLIB_PTRANSPOSE16_4x4(val_m, 16 * j);
		t5 = vis_fpsub16(t4, b5);
		t4 = vis_fpadd16(t4, b5);
		t6 = vis_fpsub16(t7, b6);
		t7 = vis_fpadd16(t7, b6);
		c1 = vis_fmul8x16(FSIN_8, b14);
		c2 = vis_fmul8x16(FCOS_8, b9);
		c3 = vis_fmul8x16(FSIN_8, b10);
		c4 = vis_fmul8x16(FCOS_8, b13);
		t9 = vis_fpsub16(c1, c2);
		t10 = vis_fpadd16(c3, c4);
		c1 = vis_fmul8x16(FSIN_8, b13);
		c2 = vis_fmul8x16(FCOS_8, b10);
		c3 = vis_fmul8x16(FSIN_8, b9);
		c4 = vis_fmul8x16(FCOS_8, b14);
		t13 = vis_fpsub16(c1, c2);
		t14 = vis_fpadd16(c3, c4);

/*
 * fifth butter-fly
 */
		c1 = vis_fmul8x16(SIN_16, t4);
		c2 = vis_fmul8x16(COS_16, t7);
		c3 = vis_fmul8x16(COS_3_16, t5);
		c4 = vis_fmul8x16(SIN_3_16, t6);
		p0 = vis_fpadd16(c1, c2);
		p2 = vis_fpadd16(c3, c4);
		c1 = vis_fmul8x16(COS_3_16, t6);
		c2 = vis_fmul8x16(SIN_3_16, t5);
		c3 = vis_fmul8x16(SIN_16, t7);
		c4 = vis_fmul8x16(COS_16, t4);
		p1 = vis_fpsub16(c1, c2);
		p3 = vis_fpsub16(c3, c4);
		MLIB_PTRANSPOSE16_4x4(val_m, 16 * j + 2);
		b9 = vis_fpsub16(b8, t9);
		b8 = vis_fpadd16(b8, t9);
		b10 = vis_fpadd16(b11, t10);
		b11 = vis_fpsub16(b11, t10);
		b13 = vis_fpsub16(b12, t13);
		b12 = vis_fpadd16(b12, t13);
		b14 = vis_fpsub16(b15, t14);
		b15 = vis_fpadd16(b15, t14);

/*
 * sixth butter-fly
 */
		c1 = vis_fmul8x16(SIN_32, b8);
		c2 = vis_fmul8x16(COS_32, b15);
		c3 = vis_fmul8x16(COS_7_32, b9);
		c4 = vis_fmul8x16(SIN_7_32, b14);
		p0 = vis_fpadd16(c1, c2);
		p2 = vis_fpadd16(c3, c4);

		c1 = vis_fmul8x16(SIN_5_32, b10);
		c2 = vis_fmul8x16(COS_5_32, b13);
		c3 = vis_fmul8x16(COS_3_32, b11);
		c4 = vis_fmul8x16(SIN_3_32, b12);
		p1 = vis_fpadd16(c1, c2);
		p3 = vis_fpadd16(c3, c4);
		MLIB_PTRANSPOSE16_4x4(val_m, 16 * j + 1);

		c1 = vis_fmul8x16(COS_3_32, b12);
		c2 = vis_fmul8x16(SIN_3_32, b11);
		c3 = vis_fmul8x16(SIN_5_32, b13);
		c4 = vis_fmul8x16(COS_5_32, b10);
		p0 = vis_fpsub16(c1, c2);
		p2 = vis_fpsub16(c3, c4);

		c1 = vis_fmul8x16(COS_7_32, b14);
		c2 = vis_fmul8x16(SIN_7_32, b9);
		c3 = vis_fmul8x16(SIN_32, b15);
		c4 = vis_fmul8x16(COS_32, b8);
		p1 = vis_fpsub16(c1, c2);
		p3 = vis_fpsub16(c3, c4);
		MLIB_PTRANSPOSE16_4x4(val_m, 16 * j + 3);
	}

/*
 * then row based 1-D 16x16 DCT
 */

#pragma pipeloop(0)
	for (j = 0; j < 4; j++) {

/*
 * first butter-fly
 */
		b0 = vis_fpadd16(val_m[j], val_m[j + 4 * 15]);
		b15 = vis_fpsub16(val_m[j], val_m[j + 4 * 15]);

		b1 = vis_fpadd16(val_m[j + 4 * 1], val_m[j + 4 * 14]);
		b14 = vis_fpsub16(val_m[j + 4 * 1], val_m[j + 4 * 14]);

		b2 = vis_fpadd16(val_m[j + 4 * 2], val_m[j + 4 * 13]);
		b13 = vis_fpsub16(val_m[j + 4 * 2], val_m[j + 4 * 13]);

		b3 = vis_fpadd16(val_m[j + 4 * 3], val_m[j + 4 * 12]);
		b12 = vis_fpsub16(val_m[j + 4 * 3], val_m[j + 4 * 12]);

		b4 = vis_fpadd16(val_m[j + 4 * 4], val_m[j + 4 * 11]);
		b11 = vis_fpsub16(val_m[j + 4 * 4], val_m[j + 4 * 11]);

		b5 = vis_fpadd16(val_m[j + 4 * 5], val_m[j + 4 * 10]);
		b10 = vis_fpsub16(val_m[j + 4 * 5], val_m[j + 4 * 10]);

		b6 = vis_fpadd16(val_m[j + 4 * 6], val_m[j + 4 * 9]);
		b9 = vis_fpsub16(val_m[j + 4 * 6], val_m[j + 4 * 9]);

		b7 = vis_fpadd16(val_m[j + 4 * 7], val_m[j + 4 * 8]);
		b8 = vis_fpsub16(val_m[j + 4 * 7], val_m[j + 4 * 8]);

/*
 * second butter-fly
 */
		t0 = vis_fpadd16(b0, b7);
		t1 = vis_fpadd16(b1, b6);
		t2 = vis_fpadd16(b2, b5);
		t3 = vis_fpadd16(b3, b4);
		t4 = vis_fpsub16(b3, b4);
		t5 = vis_fpsub16(b2, b5);
		t6 = vis_fpsub16(b1, b6);
		t7 = vis_fpsub16(b0, b7);
		c1 = vis_fpsub16(b13, b10);
		c2 = vis_fpsub16(b12, b11);
		c3 = vis_fpadd16(b11, b12);
		c4 = vis_fpadd16(b10, b13);
		t10 = vis_fmul8x16(FCOS_4, c1);
		t11 = vis_fmul8x16(FCOS_4, c2);
		t12 = vis_fmul8x16(FCOS_4, c3);
		t13 = vis_fmul8x16(FCOS_4, c4);

/*
 * third butter-fly
 */
		b0 = vis_fpadd16(t0, t3);
		b1 = vis_fpadd16(t1, t2);
		b2 = vis_fpsub16(t1, t2);
		b3 = vis_fpsub16(t0, t3);
		c1 = vis_fpsub16(t6, t5);
		c2 = vis_fpadd16(t6, t5);
		b5 = vis_fmul8x16(FCOS_4, c1);
		b6 = vis_fmul8x16(FCOS_4, c2);
		b11 = vis_fpsub16(b8, t11);
		b8 = vis_fpadd16(b8, t11);
		b10 = vis_fpsub16(b9, t10);
		b9 = vis_fpadd16(b9, t10);
		b12 = vis_fpsub16(b15, t12);
		b13 = vis_fpsub16(b14, t13);
		b14 = vis_fpadd16(b14, t13);
		b15 = vis_fpadd16(b15, t12);

/*
 * fourth butter-fly
 */
		b0 = vis_fmul8x16(COS_4, b0);
		b1 = vis_fmul8x16(COS_4, b1);
		c1 = vis_fpadd16(b0, b1);
		c2 = vis_fpsub16(b0, b1);
		b0 = SCALE8(c1);
		b1 = SCALE8(c2);
		c1 = vis_fmul8x16(SIN_8, b2);
		c2 = vis_fmul8x16(COS_8, b3);
		c3 = vis_fmul8x16(SIN_8, b3);
		c4 = vis_fmul8x16(COS_8, b2);
		b2 = SCALE8(vis_fpadd16(c1, c2));
		b3 = SCALE8(vis_fpsub16(c3, c4));
		t5 = vis_fpsub16(t4, b5);
		t4 = vis_fpadd16(t4, b5);
		t6 = vis_fpsub16(t7, b6);
		t7 = vis_fpadd16(t7, b6);
		c1 = vis_fmul8x16(FSIN_8, b14);
		c2 = vis_fmul8x16(FCOS_8, b9);
		c3 = vis_fmul8x16(FSIN_8, b10);
		c4 = vis_fmul8x16(FCOS_8, b13);
		t9 = vis_fpsub16(c1, c2);
		t10 = vis_fpadd16(c3, c4);
		c1 = vis_fmul8x16(FSIN_8, b13);
		c2 = vis_fmul8x16(FCOS_8, b10);
		c3 = vis_fmul8x16(FSIN_8, b9);
		c4 = vis_fmul8x16(FCOS_8, b14);
		t13 = vis_fpsub16(c1, c2);
		t14 = vis_fpadd16(c3, c4);

/*
 * fifth butter-fly
 */
		c1 = vis_fmul8x16(COS_3_16, t6);
		c2 = vis_fmul8x16(SIN_3_16, t5);
		c3 = vis_fmul8x16(COS_3_16, t5);
		c4 = vis_fmul8x16(SIN_3_16, t6);
		b6 = SCALE8(vis_fpsub16(c1, c2));
		b5 = SCALE8(vis_fpadd16(c3, c4));
		b9 = vis_fpsub16(b8, t9);
		b8 = vis_fpadd16(b8, t9);
		b10 = vis_fpadd16(b11, t10);
		b11 = vis_fpsub16(b11, t10);
		b13 = vis_fpsub16(b12, t13);
		b12 = vis_fpadd16(b12, t13);
		b14 = vis_fpsub16(b15, t14);
		b15 = vis_fpadd16(b15, t14);

/*
 * sixth butter-fly
 */
		c1 = vis_fmul8x16(COS_7_32, b14);
		c2 = vis_fmul8x16(SIN_7_32, b9);
		c3 = vis_fmul8x16(COS_7_32, b9);
		c4 = vis_fmul8x16(SIN_7_32, b14);
		b14 = SCALE8(vis_fpsub16(c1, c2));
		b9 = SCALE8(vis_fpadd16(c3, c4));

		c1 = vis_fmul8x16(SIN_5_32, b10);
		c2 = vis_fmul8x16(COS_5_32, b13);
		c3 = vis_fmul8x16(SIN_5_32, b13);
		c4 = vis_fmul8x16(COS_5_32, b10);
		b10 = SCALE8(vis_fpadd16(c1, c2));
		b13 = SCALE8(vis_fpsub16(c3, c4));

		MLIB_XTRANSPOSE16_4x4(b1, b9, b5, b13, coeffs64, 4 * j + 2);
		MLIB_XTRANSPOSE16_4x4(b2, b10, b6, b14, coeffs64, 4 * j + 1);

		c1 = vis_fmul8x16(SIN_16, t4);
		c2 = vis_fmul8x16(COS_16, t7);
		c3 = vis_fmul8x16(SIN_16, t7);
		c4 = vis_fmul8x16(COS_16, t4);
		b4 = SCALE8(vis_fpadd16(c1, c2));
		b7 = SCALE8(vis_fpsub16(c3, c4));

		c1 = vis_fmul8x16(SIN_32, b8);
		c2 = vis_fmul8x16(COS_32, b15);
		c3 = vis_fmul8x16(SIN_32, b15);
		c4 = vis_fmul8x16(COS_32, b8);
		b8 = SCALE8(vis_fpadd16(c1, c2));
		b15 = SCALE8(vis_fpsub16(c3, c4));

		c1 = vis_fmul8x16(COS_3_32, b12);
		c2 = vis_fmul8x16(SIN_3_32, b11);
		c3 = vis_fmul8x16(COS_3_32, b11);
		c4 = vis_fmul8x16(SIN_3_32, b12);
		b12 = SCALE8(vis_fpsub16(c1, c2));
		b11 = SCALE8(vis_fpadd16(c3, c4));

		MLIB_XTRANSPOSE16_4x4(b0, b8, b4, b12, coeffs64, 4 * j);
		MLIB_XTRANSPOSE16_4x4(b3, b11, b7, b15, coeffs64, 4 * j + 3);
	}

	return (MLIB_SUCCESS);
}
/* Example #27 */
/* 0 */
/*
 * Convert `size` pixels from three 8-bit planes (y, u, v; one sample per
 * pixel in each plane) into packed 3-byte pixels written to `rgb`.
 *
 * The work is done in chunks of at most 2 * BUFF_SIZE pixels. Within a
 * chunk, four pixels are converted per iteration: one 32-bit word is read
 * from each plane and three 32-bit words (12 bytes) are written. A
 * trailing group of 1..3 pixels is converted the same way and written with
 * edge-masked partial stores, so only bytes up to the last output byte of
 * the chunk are modified.
 *
 * Notes:
 *  - y, u, v and rgb are dereferenced through mlib_f32 pointers, so they
 *    must be aligned for 32-bit access.
 *  - The tail still loads a whole 32-bit word from each plane, i.e. it may
 *    read up to 3 bytes beyond the last valid sample.
 *  - Nothing in this function writes the GSR or the bmask, yet
 *    vis_fpack16(), vis_faligndata() and vis_bshuffle() depend on them;
 *    presumably the caller sets them up - TODO confirm.
 */
void
mlib_v_VideoColorYUV2RGB444_all_align(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 size)
{
	/* address of the last output byte of the current chunk */
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;

	/* staging area for the edge-masked stores of the tail */
	mlib_d64 tmp_arr64[2];

	/*
	 * 16-bit fixed-point multipliers and additive offsets, two per
	 * 32-bit constant. NOTE(review): the values match the usual
	 * YCbCr -> RGB coefficients scaled by 8192 - confirm.
	 */
	mlib_d64 k01 = vis_to_double_dup(0x0000f375);
	mlib_d64 k02 = vis_to_double_dup(0x3317e5fa);
	mlib_d64 k11 = vis_to_double_dup(0xf3754097);
	mlib_d64 k12 = vis_to_double_dup(0xe5fa0000);
	mlib_d64 k21 = vis_to_double_dup(0x40970000);
	mlib_d64 k22 = vis_to_double_dup(0x00003317);
	mlib_d64 c_0 = vis_to_double_dup(0xe42010f4);
	mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60);
	mlib_d64 c_2 = vis_to_double_dup(0xdd60e420);
	mlib_d64 k_0 = vis_to_double_dup(0x25432543);

	do {
/* loop on buffer size */

		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

		/* number of complete 4-pixel groups in this chunk */
		m = n >> 2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)u;
		sf2 = (mlib_f32 *)v;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;

			/* four samples from each plane */
			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			/* per-plane products */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			/* y term + u term */
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			/* v term + constant offset */
			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			/* pack the 16-bit sums back to bytes */
			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

/*
 * merge buff values to 3-channel array
 */

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_23xx);
			pfd[2] = vis_read_lo(d_0145);

			pfd += 3;
		}

		/*
		 * Tail: reached only when n is not a multiple of 4. The same
		 * arithmetic as above is applied to one more input word per
		 * plane, but the result goes through tmp_arr64 and partial
		 * stores so that only bytes up to dend are written.
		 */
		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			/* mask is taken from the real (possibly odd) start */
			emask = vis_edge8(pfd, dend);

			/*
			 * Partial stores work on 8-byte units. If pfd sits
			 * in the upper half of one, step back one word and
			 * shift the staged data forward by one word so the
			 * bytes still land at the same addresses.
			 */
			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_23xx);
			tmp_arr32[2] = vis_read_lo(d_0145);

			vis_pst_8(tmp_arr64[0], pfd, emask);

			pfd += 2;
			emask = vis_edge8(pfd, dend);

			/* second 8-byte unit, only if output reaches it */
			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

		/* advance to the next chunk */
		y += n;
		u += n;
		v += n;
		rgb += 3 * n;
		size -= n;
	} while (size);
}
/* Example #28 */
/* 0 */
/*
 * YUV 4:1:1 to packed 3-byte-per-pixel conversion for source and
 * destination rows of arbitrary alignment.
 *
 * Every row is converted into an 8-byte-aligned scratch buffer and then
 * copied to the destination with __mlib_VectorCopy_U8(). One 32-bit load of
 * u and of v (four chroma samples each) is combined with 16 luma samples,
 * so each chroma sample is shared by four horizontally adjacent pixels.
 *
 *   rgb         destination; rows are rgb_stride bytes apart
 *   y           luma plane; rows are y_stride bytes apart
 *   u, v        chroma planes; rows are uv_stride bytes apart
 *   width       pixels per row
 *   height      number of rows
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the scratch row needed for a
 * wide image cannot be allocated.
 *
 * The source planes are read with vis_ld_d64_nf() ahead of the data that is
 * actually needed (software pipelining); presumably these are non-faulting
 * loads so reading slightly past a row is harmless - TODO confirm.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB411_nonalign(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 rgb_stride,
    mlib_s32 y_stride,
    mlib_s32 uv_stride)
{
    /* pointers to src address */
    mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;

    /* pointers to dst address */
    mlib_u8 *dp, *dl;

    /* all. pointer to y */
    mlib_d64 *spy;

    /* all. pointers to u, v */
    mlib_d64 *dfu, *dfv;

    /* u, v data */
    mlib_f32 fu, fv;

    /* y data */
    mlib_d64 dy0, dy1, dy2, dy3;
    mlib_d64 ddy1, ddy2, ddy3, ddy4;
    mlib_d64 du0, du1, fu0, fu1;
    mlib_d64 dv1, dv2, fv0, fv1;
    mlib_d64 dr, dr1, dr2, dr3, dr4;
    mlib_d64 dg, dg1, dg2, dg3, dg4;
    mlib_d64 db, db1, db2, db3, db4;
    mlib_d64 dtmp;

    /* 1.1644  * 4096 */
    mlib_f32 f0 = vis_to_float(0x12a1);

    /* 2.0184  * 8192 */
    mlib_f32 f1 = vis_to_float(0x4097);

    /* -0.3920 * 8192 */
    mlib_f32 f4 = vis_to_float(0xf375);

    /* -0.8132 * 8192 */
    mlib_f32 f5 = vis_to_float(0xe5fa);

    /* 1.5966  * 8192 */
    mlib_f32 f8 = vis_to_float(0x3317);

    /* -276.9856 * 32 */
    mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

    /* 135.6352  * 32 */
    mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

    /* -222.9952 * 32 */
    mlib_d64 doff2 = vis_to_double_dup(0xe420e420);

    /* used with fmul8x16au/al to spread one chroma term over 4 lanes */
    mlib_f32 fscale = vis_to_float(0x80808080);

    /* loop variable */
    mlib_s32 i, j;

    /*
     * Scratch row. NOTE(review): BUFF holds 16 * 1024 mlib_d64 elements
     * while the threshold below is 16 * 1024 bytes, so the stack buffer
     * looks larger than required - confirm before shrinking it.
     */
    mlib_d64 *buf, BUFF[16 * 1024];
    mlib_d64 *ddp, dd01, dd11, dd21, dd02, dd12, dd22;

    /* heap block backing buf; only valid when width * 3 > 16 * 1024 */
    mlib_u8 *tmp;

    if (width * 3 > 16 * 1024) {
        /* 7 spare bytes so the start can be rounded up to 8-byte alignment */
        tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

        /* bail out before touching any state if the allocation failed */
        if (!tmp) {
            return (MLIB_FAILURE);
        }

        buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
    } else {
        buf = (mlib_d64 *)BUFF;
    }

    /*
     * initialize GSR scale factor
     */
    vis_write_gsr(3 << 3);

    sp1 = sl1 = (mlib_u8 *)y;
    sp2 = sl2 = (mlib_u8 *)u;
    sp3 = sl3 = (mlib_u8 *)v;

    dp = (mlib_u8 *)buf;
    dl = rgb;
    ddp = (mlib_d64 *)dp;

    /*
     * row loop
     */
    for (j = 0; j < height; j++) {
        /*
         * Prologue: load and convert the first 16 pixels of the row so
         * that the column loop can store results of the previous step
         * while computing the next one.
         */
        spy = (mlib_d64 *)vis_alignaddr(sp1, 0);

        /* first four u samples */
        dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
        fu0 = (*dfu++);
        fu1 = vis_ld_d64_nf(dfu);
        dfu++;
        fu = vis_read_hi(vis_faligndata(fu0, fu1));
        sp2 += 4;

        /* first four v samples */
        dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
        fv0 = (*dfv++);
        fv1 = vis_ld_d64_nf(dfv);
        dfv++;
        fv = vis_read_hi(vis_faligndata(fv0, fv1));
        sp3 += 4;

        /* first sixteen y samples (dy1, dy2) */
        dy0 = (*spy++);
        dy3 = vis_ld_d64_nf(spy);
        spy++;
        vis_alignaddr(sp1, 0);
        dy1 = vis_faligndata(dy0, dy3);
        dy0 = vis_ld_d64_nf(spy);
        spy++;
        dy2 = vis_faligndata(dy3, dy0);

        /* chroma contributions plus constant offsets */
        du0 = vis_fmul8x16al(fu, f1);
        db = vis_fpadd16(du0, doff0);

        du1 = vis_fmul8x16al(fu, f4);
        dv1 = vis_fmul8x16al(fv, f5);
        dtmp = vis_fpadd16(du1, dv1);
        dg = vis_fpadd16(dtmp, doff1);

        dv2 = vis_fmul8x16al(fv, f8);
        dr = vis_fpadd16(dv2, doff2);

        /* luma contributions, four pixels per register */
        ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
        ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

        ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
        ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

        /* replicate each chroma term across its four pixels and add luma */
        db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
        db1 = vis_fpadd16(ddy1, db1);

        db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
        db2 = vis_fpadd16(ddy2, db2);

        db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
        db3 = vis_fpadd16(ddy3, db3);

        db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
        db4 = vis_fpadd16(ddy4, db4);

        dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
        dg1 = vis_fpadd16(ddy1, dg1);

        dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
        dg2 = vis_fpadd16(ddy2, dg2);

        dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
        dg3 = vis_fpadd16(ddy3, dg3);

        dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
        dg4 = vis_fpadd16(ddy4, dg4);

        dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
        dr1 = vis_fpadd16(ddy1, dr1);

        dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
        dr2 = vis_fpadd16(ddy2, dr2);

        dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
        dr3 = vis_fpadd16(ddy3, dr3);

        dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
        dr4 = vis_fpadd16(ddy4, dr4);

        /* pack to bytes: (db, dr, dg) = pixels 0..7, (db1, dr1, dg1) = 8..15 */
        db = vis_fpack16_pair(db1, db2);
        db1 = vis_fpack16_pair(db3, db4);

        dr = vis_fpack16_pair(dr1, dr2);
        dr1 = vis_fpack16_pair(dr3, dr4);

        dg = vis_fpack16_pair(dg1, dg2);
        dg1 = vis_fpack16_pair(dg3, dg4);

        /* preload chroma for the next 16 pixels */
        dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
        fu0 = vis_ld_d64_nf(dfu);
        dfu++;
        fu1 = vis_ld_d64_nf(dfu);
        dfu++;
        fu = vis_read_hi(vis_faligndata(fu0, fu1));
        sp2 += 4;

        dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
        fv0 = vis_ld_d64_nf(dfv);
        dfv++;
        fv1 = vis_ld_d64_nf(dfv);
        dfv++;
        fv = vis_read_hi(vis_faligndata(fv0, fv1));
        sp3 += 4;

        /*
         * 16-pixel column loop
         */
#pragma pipeloop(0)
        for (i = 0; i <= width - 16; i += 16) {

            /* interleave the three packed channels into 48 output bytes */
            vis_write_bmask(0x0801902A, 0);
            dd01 = vis_bshuffle(dr, dg);
            dd02 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0x03B04C05, 0);
            dd11 = vis_bshuffle(dr, dg);
            dd12 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0xD06E07F0, 0);
            dd21 = vis_bshuffle(dr, dg);
            dd22 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0x01834967, 0);
            ddp[0] = vis_bshuffle(dd01, db);
            ddp[3] = vis_bshuffle(dd02, db1);
            vis_write_bmask(0xA12B45C7, 0);
            ddp[1] = vis_bshuffle(dd11, db);
            ddp[4] = vis_bshuffle(dd12, db1);
            vis_write_bmask(0x0D23E56F, 0);
            ddp[2] = vis_bshuffle(dd21, db);
            ddp[5] = vis_bshuffle(dd22, db1);

            /* convert the next 16 pixels (same steps as the prologue) */
            dy3 = vis_ld_d64_nf(spy);
            spy++;
            vis_alignaddr(sp1, 0);
            dy1 = vis_faligndata(dy0, dy3);
            dy0 = vis_ld_d64_nf(spy);
            spy++;
            dy2 = vis_faligndata(dy3, dy0);

            du0 = vis_fmul8x16al(fu, f1);
            db = vis_fpadd16(du0, doff0);

            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);

            dv2 = vis_fmul8x16al(fv, f8);
            dr = vis_fpadd16(dv2, doff2);

            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

            ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
            ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);

            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);

            db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
            db3 = vis_fpadd16(ddy3, db3);

            db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
            db4 = vis_fpadd16(ddy4, db4);

            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);

            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);

            dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
            dg3 = vis_fpadd16(ddy3, dg3);

            dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
            dg4 = vis_fpadd16(ddy4, dg4);

            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);

            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);

            dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
            dr3 = vis_fpadd16(ddy3, dr3);

            dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
            dr4 = vis_fpadd16(ddy4, dr4);

            db = vis_fpack16_pair(db1, db2);
            db1 = vis_fpack16_pair(db3, db4);

            dr = vis_fpack16_pair(dr1, dr2);
            dr1 = vis_fpack16_pair(dr3, dr4);

            dg = vis_fpack16_pair(dg1, dg2);
            dg1 = vis_fpack16_pair(dg3, dg4);

            dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
            fu0 = vis_ld_d64_nf(dfu);
            dfu++;
            fu1 = vis_ld_d64_nf(dfu);
            dfu++;
            fu = vis_read_hi(vis_faligndata(fu0, fu1));
            sp2 += 4;

            dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
            fv0 = vis_ld_d64_nf(dfv);
            dfv++;
            fv1 = vis_ld_d64_nf(dfv);
            dfv++;
            fv = vis_read_hi(vis_faligndata(fv0, fv1));
            sp3 += 4;

            ddp += 6;
        }

        /* at least 8 pixels left: emit the first half of the pending 16 */
        if (i <= width - 8) {
            vis_write_bmask(0x0801902A, 0);
            dd01 = vis_bshuffle(dr, dg);
            vis_write_bmask(0x03B04C05, 0);
            dd11 = vis_bshuffle(dr, dg);
            vis_write_bmask(0xD06E07F0, 0);
            dd21 = vis_bshuffle(dr, dg);
            vis_write_bmask(0x01834967, 0);
            ddp[0] = vis_bshuffle(dd01, db);
            vis_write_bmask(0xA12B45C7, 0);
            ddp[1] = vis_bshuffle(dd11, db);
            vis_write_bmask(0x0D23E56F, 0);
            ddp[2] = vis_bshuffle(dd21, db);

            /* the second half becomes the pending data for the tail */
            db = db1;
            dr = dr1;
            dg = dg1;
            ddp += 3;
            i += 8;
        }

        dp = (mlib_u8 *)ddp;

        /*
         * Fewer than 8 pixels remain. Rotate each channel register by
         * (width - i) bytes, point dp at the last remaining pixel and
         * store the pixels from last to first (dp steps back 3 bytes per
         * pixel). STORE_PIXEL is defined elsewhere; presumably it stores
         * one byte per channel and rotates the registers by the amount
         * set with vis_alignaddr(7) - TODO confirm.
         */
        vis_alignaddr((void *)(width - i), 0);
        db = vis_faligndata(db, db);
        dg = vis_faligndata(dg, dg);
        dr = vis_faligndata(dr, dr);
        dp += ((width - i - 1) * 3);

        vis_alignaddr((void *)7, 0);
        for (; i < width; i++) {
            STORE_PIXEL(0, 1, 2);
            dp -= 3;
        }

        /* next source row */
        sp1 = sl1 = sl1 + y_stride;
        sp2 = sl2 = sl2 + uv_stride;
        sp3 = sl3 = sl3 + uv_stride;

        /* flush the finished row from the scratch buffer */
        __mlib_VectorCopy_U8(dl, (mlib_u8 *)buf, width * 3);

        /* next destination row; restart at the head of the scratch buffer */
        dl += rgb_stride;
        dp = (mlib_u8 *)buf;
        ddp = (mlib_d64 *)dp;
    }

    /* same condition as the allocation above */
    if (width * 3 > 16 * 1024)
        __mlib_free(tmp);

    return (MLIB_SUCCESS);
}
/*
 * 8x8 inverse DCT from 16-bit coefficients (coeffs) to 16-bit samples
 * (block), "Q1 / Mismatch" variant.
 *
 * Both arrays are accessed through mlib_d64 pointers, so they are
 * presumably required to be 8-byte aligned - TODO confirm. The result is
 * written as sixteen mlib_d64 stores outPtr[0..15]; assuming a contiguous
 * row-major 8x8 layout, outPtr[2 * k] and outPtr[2 * k + 1] are the left
 * and right halves of row k.
 *
 * Most of the arithmetic lives in macros defined elsewhere
 * (LOAD_DATA_AA44, LOAD_MISMATCH, TRANSPOSE, TRANSPOSE1, IDCT44, IDCTS,
 * IDCTS1). They read and write locals of this function by name (dPtr, the
 * dx* and t* temporaries and the COS_* constants are not otherwise
 * referenced here), so the local names must not be changed.
 *
 * TRANSPOSE / TRANSPOSE1 are invoked without a trailing ';', which is why
 * the statement following each of them appears over-indented; that
 * indentation carries no meaning.
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoIDCT8x8_S16_S16_Q1_Mismatch(
	mlib_s16 *block,
	const mlib_s16 *coeffs)
{
	mlib_d64 *dPtr = (mlib_d64 *)coeffs;
	mlib_d64 *outPtr = (mlib_d64 *)block;
	mlib_d64 dx0, dx1, dx2, dx3, dx4, dx6, dx7, dx8;
	mlib_d64 p00, p10, p20, p30, p01, p11, p21, p31, p40, p50, p60, p70;
	mlib_d64 p41, p51, p61, p71;
	mlib_d64 t0, t1;
	mlib_d64 d0, d1, d2, d3, d7, zero = vis_fzero();

	/* multiplier constants, filled from mlib_cTable below */
	mlib_f32 COS_1_16;
	mlib_f32 COS_2_16;
	mlib_f32 COS_6_16;
	mlib_f32 COS_7_16;
	mlib_f32 COS_4_16;
	mlib_f32 C_1_4;

/* First pass */

	LOAD_DATA_AA44;

	COS_1_16 = ((mlib_f32 *)mlib_cTable)[0];
	COS_2_16 = ((mlib_f32 *)mlib_cTable)[1];
	COS_6_16 = ((mlib_f32 *)mlib_cTable)[2];
	COS_7_16 = ((mlib_f32 *)mlib_cTable)[3];
	COS_4_16 = ((mlib_f32 *)mlib_cTable)[4];
	C_1_4 = ((mlib_f32 *)mlib_cTable)[5];

	TRANSPOSE(p00, p10, p20, p30, d0, d1, d2, d3)

		LOAD_MISMATCH;

	IDCT44(d0, d1, d2, d3);

	/* final butterfly of the first pass: sums above, differences below */
	p00 = vis_fpadd16(dx7, dx1);
	p10 = vis_fpadd16(dx3, dx2);
	p20 = vis_fpadd16(dx0, dx4);
	p30 = vis_fpadd16(dx8, dx6);
	p40 = vis_fpsub16(dx8, dx6);
	p50 = vis_fpsub16(dx0, dx4);
	p60 = vis_fpsub16(dx3, dx2);
	p70 = vis_fpsub16(dx7, dx1);

/* Special case when element#63 == 1 */

	/* general path first; the special case is the else branch */
	if (coeffs[63] != 1) {
		IDCTS(zero, d7);
		/* second group of rows: dx values and their negations, mirrored */
		p01 = dx1;
		p11 = dx2;
		p21 = dx4;
		p31 = dx6;
		p41 = vis_fpsub16(zero, dx6);
		p51 = vis_fpsub16(zero, dx4);
		p61 = vis_fpsub16(zero, dx2);
		p71 = vis_fpsub16(zero, dx1);

		TRANSPOSE(p00, p10, p20, p30, d0, d1, d2, d3)
			TRANSPOSE1(p01, p11, p21, p31, d7)

/* Second pass */
			IDCTS1(d0, d1, d2, d3, d7);
		TRANSPOSE(p40, p50, p60, p70, d0, d1, d2, d3)
			/* even outPtr slots, scaled by C_1_4 */
			outPtr[0] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
		outPtr[2] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
		outPtr[4] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
		outPtr[6] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
		outPtr[8] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
		outPtr[10] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
		outPtr[12] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
		outPtr[14] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

		TRANSPOSE1(p41, p51, p61, p71, d7)
			IDCTS1(d0, d1, d2, d3, d7);
		/* odd outPtr slots */
		outPtr[1] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
		outPtr[3] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
		outPtr[5] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
		outPtr[7] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
		outPtr[9] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
		outPtr[11] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
		outPtr[13] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
		outPtr[15] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

		return (MLIB_SUCCESS);
	} else {
		/*
		 * coeffs[63] == 1: instead of computing d7 with IDCTS and
		 * TRANSPOSE1, it is loaded from val0 / val1, which are defined
		 * outside this function (presumably precomputed tables for
		 * this case - TODO confirm).
		 */
/* Second pass */
		TRANSPOSE(p00, p10, p20, p30, d0, d1, d2, d3)
			d7 = *((mlib_d64 *)&val0);

		IDCTS1(d0, d1, d2, d3, d7);
		TRANSPOSE(p40, p50, p60, p70, d0, d1, d2, d3)
			/* even outPtr slots, scaled by C_1_4 */
			outPtr[0] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
		outPtr[2] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
		outPtr[4] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
		outPtr[6] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
		outPtr[8] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
		outPtr[10] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
		outPtr[12] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
		outPtr[14] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

		d7 = *((mlib_d64 *)&val1);
		IDCTS1(d0, d1, d2, d3, d7);
		/* odd outPtr slots */
		outPtr[1] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
		outPtr[3] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
		outPtr[5] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
		outPtr[7] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
		outPtr[9] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
		outPtr[11] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
		outPtr[13] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
		outPtr[15] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

		return (MLIB_SUCCESS);
	}
}
/* Example #30 */
/* 0 */
mlib_status mlib_convMxN_8nw_mask(mlib_image       *dst,
                                  const mlib_image *src,
                                  mlib_s32         m,
                                  mlib_s32         n,
                                  mlib_s32         dm,
                                  mlib_s32         dn,
                                  const mlib_s32   *kern,
                                  mlib_s32         scale,
                                  mlib_s32         cmask)
{
  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
  mlib_d64 dd, d0, d1;
  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
  mlib_u8 *sl, *sp, *dl;
  mlib_s32 hgt = mlib_ImageGetHeight(src);
  mlib_s32 wid = mlib_ImageGetWidth(src);
  mlib_s32 sll = mlib_ImageGetStride(src);
  mlib_s32 dll = mlib_ImageGetStride(dst);
  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
  mlib_s32 ssize, xsize, dsize, esize, buff_ind;
  mlib_d64 *pbuff, *dp;
  mlib_f32 *karr = (mlib_f32 *) kern;
  mlib_s32 gsr_scale = (31 - scale) << 3;
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
  mlib_s32 i, j, l, chan, testchan;
  mlib_s32 nchan = mlib_ImageGetChannels(dst);
  void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
  void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);

  if (n > MAX_N) {
    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));

    if (buffs == NULL)
      return MLIB_FAILURE;
  }

  buff = buffs + 2 * (n + 1);

  adr_dst += dn * dll + dm * nchan;

  ssize = wid;
  dsize = (ssize + 7) / 8;
  esize = dsize + 4;
  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));

  if (pbuff == NULL) {
    if (buffs != buffs_local)
      mlib_free(buffs);
    return MLIB_FAILURE;
  }

  for (i = 0; i < (n + 1); i++)
    buffs[i] = pbuff + i * esize;
  for (i = 0; i < (n + 1); i++)
    buffs[(n + 1) + i] = buffs[i];
  buffd = buffs[n] + esize;
  buffe = buffd + 2 * esize;

  hgt -= (n - 1);
  xsize = ssize - (m - 1);

  vis_write_gsr(gsr_scale + 7);

  if (nchan == 2) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1;
  }
  else if (nchan == 3) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1;
  }
  else {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1;
  }

  testchan = 1;
  for (chan = 0; chan < nchan; chan++) {
    buff_ind = 0;
    sl = adr_src;
    dl = adr_dst;

    if ((cmask & testchan) == 0) {
      testchan <<= 1;
      continue;
    }

    for (l = 0; l < n; l++) {
      mlib_d64 *buffn = buffs[l];
      sp = sl + l * sll;

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);
    }

    /* init buffer */
#pragma pipeloop(0)
    for (i = 0; i < (xsize + 7) / 8; i++) {
      buffd[2 * i] = drnd;
      buffd[2 * i + 1] = drnd;
    }

    for (j = 0; j < hgt; j++) {
      mlib_d64 **buffc = buffs + buff_ind;
      mlib_f32 *pk = karr, k0, k1, k2, k3;
      sp = sl + n * sll;

      for (l = 0; l < n; l++) {
        buff[l] = buffc[l];
      }

      buffn = buffc[n];

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);

      ik_last = (m - 1);

      for (jk = 0; jk < n; jk += jk_size) {
        jk_size = n - jk;

        if (jk_size >= 6)
          jk_size = 4;

        if (jk_size == 5)
          jk_size = 3;

        coff = 0;

        if (jk_size == 1) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];

            doff = coff / 8;
            buff0 = buff[jk] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s01 = buff0[i + 1];
              s0 = vis_faligndata(s00, s01);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d1 = vis_fpadd16(d01, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += m;
        }
        else if (jk_size == 2) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
            s11 = buff1[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s10 = s11;
              s01 = buff0[i + 1];
              s11 = buff1[i + 1];
              s0 = vis_faligndata(s00, s01);
              s1 = vis_faligndata(s10, s11);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d0 = vis_fpadd16(d10, d0);
              d1 = vis_fpadd16(d01, d1);
              d1 = vis_fpadd16(d11, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += 2 * m;
        }
        else if (jk_size == 3) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 3 * m;
        }
        else {                              /* jk_size == 4 */

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];
            k3 = pk[ik + 3 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;
            buff3 = buff[jk + 3] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {

#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];
                s3 = buff3[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
                d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
                d31 = vis_fmul8x16au(vis_read_hi(s31), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);
                s3 = vis_faligndata(s30, s31);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 4 * m;
        }
      }

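      /*****************************************
       *****************************************
       **          Final iteration            **
       *****************************************
       *****************************************
       * Adds the kernel taps of column ik_last
       * for rows 0..jk_size-1 (the taps that the
       * accumulation loops above skip when
       * jk == 0 and ik == ik_last) to the sums
       * held in buffd, packs the 16-bit sums
       * with vis_fpack16_pair into buffe, and
       * reloads buffd with drnd.
       *****************************************/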

      jk_size = n;

      if (jk_size >= 6)
        jk_size = 4;

      if (jk_size == 5)
        jk_size = 3;

      k0 = karr[ik_last];
      k1 = karr[ik_last + m];
      k2 = karr[ik_last + 2 * m];
      k3 = karr[ik_last + 3 * m];

      off = ik_last;
      doff = off / 8;
      off &= 7;
      buff0 = buff[0] + doff;
      buff1 = buff[1] + doff;
      buff2 = buff[2] + doff;
      buff3 = buff[3] + doff;
      vis_write_gsr(gsr_scale + off);

      if (jk_size == 1) {
        dp = buffe;

        s01 = buff0[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s01 = buff0[i + 1];
          s0 = vis_faligndata(s00, s01);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d1 = vis_fpadd16(d1, d01);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 2) {
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 3) {

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else {                                /* if (jk_size == 4) */

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
        s31 = buff3[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s30 = s31;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s31 = buff3[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);
          s3 = vis_faligndata(s30, s31);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
          d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
          d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d0 = vis_fpadd16(d0, d30);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);
          d1 = vis_fpadd16(d1, d31);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }

      (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan);

      sl += sll;
      dl += dll;

      buff_ind++;

      if (buff_ind >= (n + 1))
        buff_ind = 0;
    }

    testchan <<= 1;
  }

  mlib_free(pbuff);

  if (buffs != buffs_local)
    mlib_free(buffs);

  return MLIB_SUCCESS;
}