/*
 * Line worker for table-driven affine interpolation.  The name suggests
 * u8 pixels, a 2x2 kernel and a single channel -- NOTE(review): inferred
 * from the naming only, confirm against the macro definitions.
 *
 * buff is the output buffer (written through dstPixelPtr and buff1).  The
 * other parameters are not referenced directly in this body; presumably the
 * DECLAREVAR, CALC_xxx and LOAD_xxx macros consume them -- TODO confirm.
 */
void
mlib_v_ImageAffineTableLine_8nw_2_2_1(
    mlib_d64 *buff,
    const mlib_d64 *filterX,
    const mlib_d64 *filterY,
    const mlib_u8 **lineAddr,
    mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;

	/* program the 64-bit GSR; field meaning of the constant is not visible here */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;

	/* main loop: two pixels per iteration, result stored through buff1 */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_2x2(row00, row10);
		FILTER_MERGE;
		MAKE_2x2;
		*buff1 = res1;
		buff1++;
	}

	/* continue writing 16-bit results right after the last 8-byte store */
	dstPixelPtr = (mlib_s16 *)buff1;

	/* remaining pixel(s), handled one at a time */
#pragma pipeloop(0)
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, fy0);
		xFilter = vis_write_lo(xFilter, fx0);

		/* gather the 2x2 neighbourhood: two bytes from each of two rows */
		row00 = vis_fpmerge(LD_U8(sPtr, 0), LD_U8(sPtr, 1));
		row10 =
		    vis_fpmerge(LD_U8(sPtr, srcStride), LD_U8(sPtr,
		    srcStride + 1));

		/* weight the two rows with the y filter and add them */
		v0 = vis_fmul8x16au(vis_read_lo(row00), fy0);
		v1 = vis_fmul8x16al(vis_read_lo(row10), fy0);
		sum = vis_fpadd16(v0, v1);
		/* 16x16 multiply by the x filter, built from the sux/ulx halves */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_lo(v3));
		/* add the two 32-bit partial results into the low half of res */
		res =
		    vis_write_lo(res, vis_fpadd32s(vis_read_hi(v2),
		    vis_read_lo(v2)));

		vis_st_u16(res, dstPixelPtr++);
	}
}
/*
 * Nearest-neighbour upsampling of one source row into two destination rows:
 * every source byte is emitted twice in a row and the same doubled data is
 * stored to both dst0 and dst1.
 *
 * dst0, dst1 - destination rows, 2 * n bytes each
 * src        - source row, n bytes
 * n          - number of source bytes; n <= 0 yields MLIB_FAILURE
 *
 * All three pointers are accessed as 8-byte words, so they are presumably
 * required to be 8-byte aligned -- TODO confirm against the API contract.
 */
mlib_status
__mlib_VideoUpSample420_Nearest(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out0 = (mlib_d64 *)dst0;
	mlib_d64 *out1 = (mlib_d64 *)dst1;
	mlib_u8 *last0;
	mlib_d64 pix, dbl;
	mlib_s32 mask, done;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* last valid byte of dst0; drives the edge masks of the tail stores */
	last0 = dst0 + 2 * n - 1;

	/* full groups: 8 source bytes become 16 bytes in each destination */
#pragma pipeloop(0)
	for (done = 0; done <= (n - 8); done += 8) {
		pix = in[0];

		dbl = vis_fpmerge(vis_read_hi(pix), vis_read_hi(pix));
		out1[0] = dbl;
		out0[0] = dbl;

		dbl = vis_fpmerge(vis_read_lo(pix), vis_read_lo(pix));
		out1[1] = dbl;
		out0[1] = dbl;

		in++;
		out0 += 2;
		out1 += 2;
	}

	if (done >= n)
		return (MLIB_SUCCESS);

	/* tail: fewer than 8 source bytes left, stores are edge-masked */
	pix = vis_ld_d64_nf(in);

	dbl = vis_fpmerge(vis_read_hi(pix), vis_read_hi(pix));
	mask = vis_edge8(out0, last0);
	vis_pst_8(dbl, out0, mask);
	vis_pst_8(dbl, out1, mask);
	done += 4;
	out0++;
	out1++;

	if (done < n) {
		/* more than 4 bytes were left: also emit the low half */
		dbl = vis_fpmerge(vis_read_lo(pix), vis_read_lo(pix));
		mask = vis_edge8(out0, last0);
		vis_pst_8(dbl, out0, mask);
		vis_pst_8(dbl, out1, mask);
	}

	return (MLIB_SUCCESS);
}
/*
 * Horizontal 2:1 downsampling: each output byte is the average of two
 * adjacent source bytes.  The rounding bias alternates 0, 1, 0, 1, ... from
 * one output to the next.
 *
 * dst - destination, one byte per source pair
 * src - source, n bytes; read as 8-byte words in the main loop
 * n   - number of source bytes; n <= 0 yields MLIB_FAILURE
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_f32 *out = (mlib_f32 *)dst;
	mlib_d64 word, mixed, split;
	mlib_d64 sum_hi, sum_lo;
	/* presumably 16-bit lanes 0,1,0,1: the vector form of the alternating bias */
	mlib_d64 rnd = vis_to_double_dup(0x1);
	mlib_f32 unit = vis_to_float(0x1000000);
	mlib_s32 pos, carry = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(6 << 3);

	/* 8 source bytes -> 4 averaged bytes per iteration */
#pragma pipeloop(0)
	for (pos = 0; pos <= n - 8; pos += 8) {
		word = *in;
		in++;

		/* two merges gather even bytes in the high half, odd in the low */
		mixed = vis_fpmerge(vis_read_hi(word), vis_read_lo(word));
		split = vis_fpmerge(vis_read_hi(mixed), vis_read_lo(mixed));

		/* widen both halves to 16 bit and add the pairs */
		sum_hi = vis_fmul8x16au(vis_read_hi(split), unit);
		sum_lo = vis_fmul8x16au(vis_read_lo(split), unit);
		sum_hi = vis_fpadd16(sum_hi, sum_lo);

		*out = vis_fpack16(vis_fpadd16(sum_hi, rnd));
		out++;
	}

	/* scalar tail continues right after the last vector store */
	dst = (mlib_u8 *)out;

	while (pos < n) {
		*dst = (src[pos] + src[pos + 1] + carry) >> 1;
		dst++;
		/* toggle the rounding bias between 0 and 1 */
		carry ^= 1;
		pos += 2;
	}

	return (MLIB_SUCCESS);
}
/*
 * Splits interleaved three-component 16-bit data (colors) into three
 * separate arrays color1, color2 and color3.
 *
 * n is the number of elements written to each output array; the source
 * supplies three 8-byte words for every four output elements.
 *
 * All four pointers are accessed as 8-byte words, so they are presumably
 * required to be 8-byte aligned -- TODO confirm against the API contract.
 */
mlib_status
__mlib_VideoColorSplit3_S16(
	mlib_s16 *color1,
	mlib_s16 *color2,
	mlib_s16 *color3,
	const mlib_s16 *colors,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 *dp2 = (mlib_d64 *)color3;
	mlib_d64 sd0, sd1, sd2, dd0, dd1, dd2, dd3;
	mlib_s32 i;

	/* GSR align offset and byte-shuffle mask used by faligndata / bshuffle below */
	vis_write_gsr(4);
	vis_write_bmask(0x02CE13DF, 0);
#pragma pipeloop(0)
#pragma unroll(4)
	for (i = 0; i <= (n - 4); i += 4) {
		/* three source words hold four interleaved triples */
		sd0 = sp[0];
		sd1 = sp[1];
		sd2 = sp[2];
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		(*dp0++) = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		(*dp1++) = vis_bshuffle(dd3, dd3);
		(*dp2++) = vis_bshuffle(dd1, dd2);
		sp += 3;
	}

/*
 * last 4 pixels
 */

	if (i < n) {
		/* partial-store mask; presumably keeps the first (n & 3) 16-bit lanes */
		mlib_s32 emask = 0xF0 >> (n & 3);
		mlib_d64 st0, st1, st2;

		/* only the first word is loaded normally; the next two use the _nf loads */
		sd0 = sp[0];
		sd1 = vis_ld_d64_nf(sp + 1);
		sd2 = vis_ld_d64_nf(sp + 2);
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		st0 = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		st1 = vis_bshuffle(dd3, dd3);
		st2 = vis_bshuffle(dd1, dd2);
		vis_pst_16(st0, dp0, emask);
		vis_pst_16(st1, dp1, emask);
		vis_pst_16(st2, dp2, emask);
	}
/* Example #5 (0) */
/*
 * Adds a block of 16-bit values to an 8x8 block of u8 pixels in place.
 *
 * curr_block - u8 block, updated in place; rows are stride bytes apart and
 *              each row is accessed as one 8-byte word
 * mc_block   - 16-bit addends, read contiguously (16 bytes per row)
 * stride     - distance in bytes between rows of curr_block
 *
 * The sums are packed back to bytes with vis_fpack16_pair under the GSR
 * scale programmed below.  Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoAddBlock_U8_S16(
	mlib_u8 *curr_block,
	const mlib_s16 *mc_block,
	mlib_s32 stride)
{
	mlib_d64 *cur = (mlib_d64 *)curr_block;
	mlib_d64 *ref = (mlib_d64 *)mc_block;
	mlib_d64 pix, add_hi, add_lo, wide_hi, wide_lo;
	mlib_f32 fzero = vis_fzeros();
	/* multiplier used to widen the low four pixels to 16 bit */
	mlib_f32 widen = vis_to_float(0x100);
	mlib_s32 row;

	vis_write_gsr(7 << 3);

#pragma pipeloop(0)
	for (row = 0; row < 8; row++) {
		pix = cur[0];
		add_hi = ref[0];
		add_lo = ref[1];
		ref += 2;

		/* widen the 8 pixels of this row into two 4 x 16-bit words */
		wide_hi = vis_fpmerge(fzero, vis_read_hi(pix));
		wide_lo = vis_fmul8x16al(vis_read_lo(pix), widen);

		add_hi = vis_fpadd16(add_hi, wide_hi);
		add_lo = vis_fpadd16(add_lo, wide_lo);

		cur[0] = vis_fpack16_pair(add_hi, add_lo);

		/* step to the next row of the current block */
		cur = (mlib_d64 *)((mlib_u8 *)cur + stride);
	}

	return (MLIB_SUCCESS);
}
/*
 * Converts interleaved YUV 4:4:4 pixels (3 bytes each) to packed UYVY 4:2:2.
 * Every pair of input pixels becomes one 32-bit output word; U and V of the
 * pair are averaged with truncation ((a + b) >> 1).
 *
 * uyvy - destination, one mlib_u32 per pixel pair
 * yuv  - source, 3 bytes per pixel
 * w, h - image width and height in pixels (w is halved to count pairs)
 * dlb  - destination line stride in bytes (converted to 32-bit words below)
 * slb  - source line stride in bytes
 */
void
__mlib_VideoColorYUV444int_to_UYVY422int(
	mlib_u32 *uyvy,
	const mlib_u8 *yuv,
	mlib_s32 w,
	mlib_s32 h,
	mlib_s32 dlb,
	mlib_s32 slb)
{
	mlib_s32 i, val_y0, val_y1, val_u0, val_v0, count, left;

	/* from here on: dlb in 32-bit words, w in pixel pairs */
	dlb >>= 2;
	w >>= 1;
	/* count = groups of 4 pairs (8 pixels), left = remaining pairs */
	count = w >> 2;
	left = w - (count << 2);

	/* w was already halved, so a one-pixel-wide image also returns here */
	if (w == 0 || h == 0)
		return;

	vis_write_gsr(6 << 3);

	for (i = 0; i < h; i++, yuv += slb, uyvy += dlb) {
		/* fast path: source and destination rows are both 8-byte aligned */
		if ((((mlib_addr)yuv | (mlib_addr)uyvy) & 7) == 0) {
			mlib_d64 w_y, w_u, w_v, w_uv, w_tmp0, w_tmp1, w_acc0,
				w_acc1;
			mlib_d64 w_ld0, w_ld1, w_ld2;
			mlib_f32 v_one = vis_to_float(0x1000000);
			mlib_f32 v_u, v_v;
			mlib_s32 j;

#pragma pipeloop(0)
			for (j = 0; j < count; j++) {
				/* 24 source bytes = 8 pixels */
				w_ld0 = ((mlib_d64 *)yuv)[3 * j];
				w_ld1 = ((mlib_d64 *)yuv)[3 * j + 1];
				w_ld2 = ((mlib_d64 *)yuv)[3 * j + 2];

				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1,
					w_ld2);

				/* gather even and odd U samples, then add the pairs */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

				/* same pairwise averaging for V */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				/* interleave chroma, then interleave it with luma */
				w_uv = vis_fpmerge(v_u, v_v);

				((mlib_d64 *)uyvy)[2 * j] =
					VIS_FPMERGE_HI(w_uv, w_y);
				((mlib_d64 *)uyvy)[2 * j + 1] =
					VIS_FPMERGE_LO(w_uv, w_y);
			}

			if (left) {
				/* staging area: only `left` 32-bit words get copied out */
				mlib_d64 res_buf[2];

				w_ld0 = vis_ld_d64_nf((mlib_d64 *)yuv + 3 *
					count);
				w_ld1 = vis_ld_d64_nf((mlib_d64 *)yuv + 3 *
					count + 1);
				w_ld2 = vis_ld_d64_nf((mlib_d64 *)yuv + 3 *
					count + 2);

				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1,
					w_ld2);

				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				res_buf[0] = VIS_FPMERGE_HI(w_uv, w_y);
				res_buf[1] = VIS_FPMERGE_LO(w_uv, w_y);

				for (j = 0; j < left; j++) {
					((mlib_f32 *)uyvy)[4 * count + j] =
						((mlib_f32 *)res_buf)[j];
				}
			}
		} else {
			/* general path: source may be unaligned, destination at most 4 bytes off */
			mlib_d64 w_y, w_u, w_v, w_uv, w_tmp0, w_tmp1, w_acc0,
				w_acc1;
			mlib_d64 w_ld0, w_ld1, w_ld2;
			mlib_f32 v_one = vis_to_float(0x1000000);
			mlib_f32 v_u, v_v;
			mlib_s32 j;
			mlib_d64 *al_addr;
			mlib_d64 l0, l1, l2, l3;
			const mlib_u8 *pyuv = yuv;
			mlib_u32 *puyvy = uyvy;

			if ((mlib_addr)puyvy & 7) {
				/*
				 * Destination is not 8-byte aligned: convert the
				 * first pixel pair in scalar code so that the
				 * vector stores below hit aligned addresses.
				 */
				val_y0 = yuv[0];
				val_y1 = yuv[3];
				val_u0 = (yuv[1] + yuv[4]) >> 1;
				val_v0 = (yuv[2] + yuv[5]) >> 1;
				puyvy[0] =
					(val_u0 << 24) | (val_y0 << 16) |
					(val_v0 << 8) | val_y1;
				pyuv += 6;
				puyvy++;
				/* one pair is already done for this row */
				count = (w - 1) >> 2;
				left = (w - 1) - (count << 2);
			} else {
				count = w >> 2;
				left = w - (count << 2);
			}

			/* set up unaligned source reads via alignaddr / faligndata */
			al_addr = vis_alignaddr((void *)pyuv, 0);
			l0 = vis_ld_d64_nf(al_addr); al_addr++;
#pragma pipeloop(0)
			for (j = 0; j < count; j++) {
				l1 = (*al_addr++);
				l2 = (*al_addr++);
				l3 = vis_ld_d64_nf(al_addr); al_addr++;
				w_ld0 = vis_faligndata(l0, l1);
				w_ld1 = vis_faligndata(l1, l2);
				w_ld2 = vis_faligndata(l2, l3);
				/* the last word read is the first word of the next group */
				l0 = l3;
				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1,
					w_ld2);

				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				((mlib_d64 *)puyvy)[2 * j] =
					VIS_FPMERGE_HI(w_uv, w_y);
				((mlib_d64 *)puyvy)[2 * j + 1] =
					VIS_FPMERGE_LO(w_uv, w_y);
			}

			if (left) {
				/* staging area: only `left` 32-bit words get copied out */
				mlib_d64 res_buf[2];

				l1 = vis_ld_d64_nf(al_addr); al_addr++;
				l2 = vis_ld_d64_nf(al_addr); al_addr++;
				l3 = vis_ld_d64_nf(al_addr);
				w_ld0 = vis_faligndata(l0, l1);
				w_ld1 = vis_faligndata(l1, l2);
				w_ld2 = vis_faligndata(l2, l3);

				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1,
					w_ld2);

				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				res_buf[0] = VIS_FPMERGE_HI(w_uv, w_y);
				res_buf[1] = VIS_FPMERGE_LO(w_uv, w_y);

				for (j = 0; j < left; j++) {
					((mlib_f32 *)puyvy)[4 * count + j] =
						((mlib_f32 *)res_buf)[j];
				}
			}

			/* restore the full-row counts for the next row's fast path */
			count = w >> 2;
			left = w - (count << 2);
		}
/* Example #7 (0) */
/*
 * Converts a vector of signed 8-bit values to signed 32-bit values.
 *
 * z - destination, n elements
 * x - source, n elements
 * n - element count; n <= 0 yields MLIB_FAILURE
 *
 * One leading element is converted in scalar code when z is not 8-byte
 * aligned, then 8 elements are handled per iteration with 8-byte stores,
 * and whatever remains is converted in a scalar tail.
 */
mlib_status
__mlib_VectorConvert_S32_S8_Mod(
	mlib_s32 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *in = (mlib_s8 *)x;
	mlib_s32 *out = (mlib_s32 *)z;
	mlib_f32 one32 = vis_to_float(0x10001);
	mlib_d64 one16 = vis_to_double_dup(0x1000100);
	mlib_d64 *win, *o64;
	mlib_d64 prev, next, cur, q0, q1, q2, q3;
	mlib_s32 k = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* bring the destination onto an 8-byte boundary */
	if ((mlib_addr)out & 7) {
		*out = *in;
		out++;
		in++;
		k = 1;
	}

	win = (mlib_d64 *)vis_alignaddr(in, 0);
	cur = vis_ld_d64_nf(win);
	/* byte selection used by the vis_bshuffle widening steps */
	vis_write_bmask(0x00012223, 0);

	if (((mlib_addr)in & 7) == 0) {
		/* source is 8-byte aligned: use the loaded words directly */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; k <= (n - 8); k += 8) {
			q1 = vis_fpmerge(vis_read_hi(cur),
				vis_read_hi(cur));
			q1 = vis_fmul8sux16(q1, one16);
			q0 = vis_bshuffle(q1, q1);
			q1 = vis_fmuld8ulx16(one32, vis_read_lo(q1));
			q3 = vis_fpmerge(vis_read_lo(cur),
				vis_read_lo(cur));
			q3 = vis_fmul8sux16(q3, one16);
			q2 = vis_bshuffle(q3, q3);
			q3 = vis_fmuld8ulx16(one32, vis_read_lo(q3));

			cur = vis_ld_d64_nf(win + 1);

			o64 = (mlib_d64 *)out;
			o64[0] = q0;
			o64[1] = q1;
			o64[2] = q2;
			o64[3] = q3;

			out += 8;
			in += 8;
			win++;
		}
	} else {
		/* source is unaligned: realign each word with vis_faligndata */
		next = vis_ld_d64_nf(win + 1);
		cur = vis_faligndata(cur, next);
#pragma pipeloop(1)
#pragma unroll(1)
		for (; k <= (n - 8); k += 8) {
			q1 = vis_fpmerge(vis_read_hi(cur),
				vis_read_hi(cur));
			q1 = vis_fmul8sux16(q1, one16);
			q0 = vis_bshuffle(q1, q1);
			q1 = vis_fmuld8ulx16(one32, vis_read_lo(q1));
			q3 = vis_fpmerge(vis_read_lo(cur),
				vis_read_lo(cur));
			q3 = vis_fmul8sux16(q3, one16);
			q2 = vis_fmuld8ulx16(one32, vis_read_hi(q3));
			q3 = vis_fmuld8ulx16(one32, vis_read_lo(q3));

			/* slide the two-word window forward by one word */
			prev = next;
			next = vis_ld_d64_nf(win + 2);
			cur = vis_faligndata(prev, next);

			o64 = (mlib_d64 *)out;
			o64[0] = q0;
			o64[1] = q1;
			o64[2] = q2;
			o64[3] = q3;

			out += 8;
			in += 8;
			win++;
		}
	}

	/* scalar tail for the last (fewer than 8) elements */
	while (k < n) {
		*out = *in;
		out++;
		in++;
		k++;
	}

	return (MLIB_SUCCESS);
}
/* Example #8 (0) */
/*
 * Converts a vector of unsigned 8-bit values to signed 32-bit values.
 *
 * z - destination, n elements
 * x - source, n elements
 * n - element count; n <= 0 yields MLIB_FAILURE
 *
 * One leading element is converted in scalar code when z is not 8-byte
 * aligned, then 8 elements are handled per iteration with 8-byte stores,
 * and whatever remains is converted in a scalar tail.
 */
mlib_status
__mlib_VectorConvert_S32_U8_Mod(
	mlib_s32 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *psrc = (mlib_u8 *)x;
	mlib_s32 *pdst = (mlib_s32 *)z;
	mlib_f32 fzero = vis_fzero(), fone1 = vis_to_float(0x100), fone2 =
		vis_to_float(0x10001);
	mlib_d64 *dpsrc, dsrc0, dsrc1, dsrc, dst0, dst1, dst2, dst3;
	mlib_s32 i = 0, off;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* bring the destination onto an 8-byte boundary */
	if ((mlib_addr)pdst & 7) {
		(*pdst++) = (*psrc++);
		i = 1;
	}

	dpsrc = (mlib_d64 *)vis_alignaddr(psrc, 0);
	/*
	 * Priming load.  psrc may already point one past the source here
	 * (n == 1 with a misaligned destination), so use the _nf load like
	 * every other load in this routine instead of a plain dereference.
	 */
	dsrc = vis_ld_d64_nf(dpsrc);

	off = (mlib_addr)psrc & 7;

	if (off) {
		/* unaligned source: bshuffle realigns and interleaves in one step */
		dsrc1 = dsrc;
		vis_alignaddr((void *)0, 7);
		vis_write_bmask(0x11111111 * off, 0x40516273);
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i <= (n - 8); i += 8) {
			dsrc0 = dsrc1;
			dsrc1 = vis_ld_d64_nf(dpsrc + 1);
			dsrc = vis_bshuffle(dsrc0, dsrc1);
			dst0 = vis_fmuld8ulx16(vis_read_hi(dsrc), fone2);
			dst1 = vis_fmuld8ulx16(vis_read_lo(dsrc), fone2);
			/* rotate by 7 bytes so the other four source bytes land in the low byte of each lane */
			dsrc = vis_faligndata(dsrc, dsrc);
			dst2 = vis_fmuld8ulx16(vis_read_hi(dsrc), fone2);
			dst3 = vis_fmuld8ulx16(vis_read_lo(dsrc), fone2);
			((mlib_d64 *)pdst)[0] = dst0;
			((mlib_d64 *)pdst)[1] = dst1;
			((mlib_d64 *)pdst)[2] = dst2;
			((mlib_d64 *)pdst)[3] = dst3;
			pdst += 8;
			psrc += 8;
			dpsrc++;
		}
	} else {
		/* aligned source: widen 8 -> 16 -> 32 bit directly */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; i <= (n - 8); i += 8) {
			dst1 = vis_fmul8x16al(vis_read_hi(dsrc), fone1);
			dst0 = vis_fpmerge(fzero, vis_read_hi(dst1));
			dst1 = vis_fpmerge(fzero, vis_read_lo(dst1));
			dst3 = vis_fpmerge(vis_read_lo(dsrc),
				vis_read_lo(dsrc));
			dst2 = vis_fmuld8ulx16(vis_read_hi(dst3), fone2);
			dst3 = vis_fmuld8ulx16(vis_read_lo(dst3), fone2);
			dsrc = vis_ld_d64_nf(dpsrc + 1);
			((mlib_d64 *)pdst)[0] = dst0;
			((mlib_d64 *)pdst)[1] = dst1;
			((mlib_d64 *)pdst)[2] = dst2;
			((mlib_d64 *)pdst)[3] = dst3;
			pdst += 8;
			psrc += 8;
			dpsrc++;
		}
	}

	/* scalar tail for the last (fewer than 8) elements */
	for (; i < n; i++)
		(*pdst++) = (*psrc++);

	return (MLIB_SUCCESS);
}
/*
 * Converts planar Y, U, V samples (one of each per pixel) to interleaved
 * 3-byte RGB, four pixels per iteration.
 *
 * rgb  - destination, 3 * size bytes
 * y, u, v - source planes, size bytes each; read as 32-bit words
 * size - number of pixels
 *
 * The input is consumed in chunks of at most 2 * BUFF_SIZE pixels.  The
 * coefficient words hold the usual YUV->RGB factors scaled by 8192
 * (0x2543 ~ 1.1644, 0x3317 ~ 1.5966, 0xf375 ~ -0.3920, 0xe5fa ~ -0.8132,
 * 0x4097 ~ 2.0184).
 *
 * NOTE(review): this routine calls vis_fpack16, vis_faligndata and
 * vis_bshuffle without programming the GSR or the bmask itself, so the
 * caller presumably sets them up -- confirm against the call site.
 */
void
mlib_v_VideoColorYUV2RGB444_all_align(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;
	mlib_d64 tmp_arr64[2];
	mlib_d64 k01 = vis_to_double_dup(0x0000f375);
	mlib_d64 k02 = vis_to_double_dup(0x3317e5fa);
	mlib_d64 k11 = vis_to_double_dup(0xf3754097);
	mlib_d64 k12 = vis_to_double_dup(0xe5fa0000);
	mlib_d64 k21 = vis_to_double_dup(0x40970000);
	mlib_d64 k22 = vis_to_double_dup(0x00003317);
	mlib_d64 c_0 = vis_to_double_dup(0xe42010f4);
	mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60);
	mlib_d64 c_2 = vis_to_double_dup(0xdd60e420);
	mlib_d64 k_0 = vis_to_double_dup(0x25432543);

	do {
/* loop on buffer size */

		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

		/* m = number of complete 4-pixel groups in this chunk */
		m = n >> 2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)u;
		sf2 = (mlib_f32 *)v;
		/* last destination byte of this chunk */
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			/* per-sample products with the coefficient words */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			/* fold in the constant offsets */
			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

/*
 * merge buff values to 3-channel array
 */

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			/* 12 output bytes = 4 RGB pixels */
			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_23xx);
			pfd[2] = vis_read_lo(d_0145);

			pfd += 3;
		}

		/* tail: 1..3 pixels left, stored through edge-masked writes */
		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			emask = vis_edge8(pfd, dend);

			/*
			 * If the write position sits 4 bytes past an 8-byte
			 * boundary, shift both the store address and the
			 * staging position by one 32-bit word so the masked
			 * 8-byte stores line up.
			 */
			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_23xx);
			tmp_arr32[2] = vis_read_lo(d_0145);

			vis_pst_8(tmp_arr64[0], pfd, emask);

			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

		/* advance to the next chunk */
		y += n;
		u += n;
		v += n;
		rgb += 3 * n;
		size -= n;
	} while (size);
}
/*
 * Converts interleaved ARGB pixels to separate Y, Cb and Cr arrays with
 * 4:2:2 chroma layout: n bytes are produced for y and n / 2 bytes each for
 * cb and cr.
 *
 * y, cb, cr - destination arrays
 * argb      - source, 4 bytes per pixel, read as 8-byte words
 * n         - number of pixels; n <= 0 yields MLIB_FAILURE
 *
 * The weights K11..K33 and the CHANNELSEPARATE_U8_422 / CHANNELWEIGHT_U8_2p
 * macros are defined outside this function.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
	/* one-past-the-end markers, taken before n is rescaled below */
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	/* sd04, sd26, sd15, sd37 are presumably scratch for the separate macro -- TODO confirm */
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;

	/* fixed-point weights: luma row scaled by 8192, chroma rows by 4096 */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(2 << 3);

	/* from here on n counts groups of 8 pixels */
	n = n >> 3;

#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		/* 32 source bytes = 8 ARGB pixels */
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		/* luma: 8 values */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		/* Cb: the two partial results are summed into 4 values */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

		/* Cr: same scheme with the third weight row */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

		py++;
		pcb++;
		pcr++;
	}

	/* tail: fewer than 8 pixels left, results stored with edge masks */
	if ((mlib_u8 *)pcb < cbend) {
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

		/* only the first word is loaded normally; the rest use the _nf loads */
		sd01 = (*sp++);
		sd23 = vis_ld_d64_nf(sp); sp++;
		sd45 = vis_ld_d64_nf(sp); sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);
		cmask = vis_edge8(pcb, cbend - 1);

		/*
		 * The chroma pointers advance in 4-byte steps, so the write
		 * position is either at the start or in the middle of an
		 * 8-byte word; place the 4 chroma bytes in the matching half.
		 */
		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1,
				cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1,
				cmask);
		}
	}
	return (MLIB_SUCCESS);
}
/* Example #11 (0) */
void ADD_SUFF(ByteGrayToIntArgbScaleConvert)(SCALE_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 d0, d1, d2, d3, dd;
    mlib_f32 ff, aa = vis_fones();
    mlib_s32 i, j, x;

    if (width < 16) {
        for (j = 0; j < height; j++) {
            mlib_u8  *src = srcBase;
            mlib_s32 *dst = dstBase;
            mlib_s32 tmpsxloc = sxloc;

            PTR_ADD(src, (syloc >> shift) * srcScan);

            for (i = 0; i < width; i++) {
                x = src[tmpsxloc >> shift];
                tmpsxloc += sxinc;
                dst[i] = Gray2Argb(x);
            }

            PTR_ADD(dstBase, dstScan);
            syloc += syinc;
        }
        return;
    }

    vis_alignaddr(NULL, 7);

    for (j = 0; j < height; j++) {
        mlib_u8  *src = srcBase;
        mlib_s32 *dst = dstBase;
        mlib_s32 *dst_end;
        mlib_s32 tmpsxloc = sxloc;

        PTR_ADD(src, (syloc >> shift) * srcScan);

        dst_end = dst + width;

#pragma pipeloop(0)
        for (; dst <= (dst_end - 4); dst += 4) {
            LOAD_NEXT_U8(dd, src + ((tmpsxloc + 3*sxinc) >> shift));
            LOAD_NEXT_U8(dd, src + ((tmpsxloc + 2*sxinc) >> shift));
            LOAD_NEXT_U8(dd, src + ((tmpsxloc +   sxinc) >> shift));
            LOAD_NEXT_U8(dd, src + ((tmpsxloc          ) >> shift));
            tmpsxloc += 4*sxinc;
            ff = vis_read_hi(dd);
            d0 = vis_fpmerge(aa, ff);
            d1 = vis_fpmerge(ff, ff);
            d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1));
            d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1));
            ((mlib_f32*)dst)[0] = vis_read_hi(d2);
            ((mlib_f32*)dst)[1] = vis_read_lo(d2);
            ((mlib_f32*)dst)[2] = vis_read_hi(d3);
            ((mlib_f32*)dst)[3] = vis_read_lo(d3);
        }

        while (dst < dst_end) {
            x = src[tmpsxloc >> shift];
            tmpsxloc += sxinc;
            *dst++ = Gray2Argb(x);
        }

        PTR_ADD(dstBase, dstScan);
        syloc += syinc;
    }
}
static mlib_status
mlib_v_VideoColorYUV2RGB420_nonalign(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 rgb_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to src address */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* pointers to src address */
	mlib_u8 *sp11, *sp12, *sl11, *sl12;

/* pointers to dst address */
	mlib_u8 *dp1, *dl1;

/* pointers to dst address */
	mlib_u8 *dp2, *dl2;

/* all. pointer to y */
	mlib_d64 *spy1, *spy2;

/* all. pointers to u, v */
	mlib_f32 *dfu, *dfv;

/* y data */
	mlib_d64 dy0, dy1, dy2, dy3, dy4, dy5;

/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;
	mlib_d64 du, dv, du0, du1, dv0, dv1;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 z_11644_hi, z_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
/* loop variables */
	mlib_s32 i, j;
	mlib_s32 y_stride2 = 2 * y_stride;
	mlib_s32 rgb_stride2 = 2 * rgb_stride;
	mlib_s32 off2, off3;
	mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21;
	mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22;
	mlib_d64 *buf1, BUFF1[16 * 1024];
	mlib_d64 *buf2, BUFF2[16 * 1024];
	mlib_u8 *tmp1, *tmp2;

	if (width * 3 > 16 * 1024) {
		tmp1 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);
		tmp2 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);
		buf1 = (mlib_d64 *)((mlib_addr)(tmp1 + 7) & ~7);
		buf2 = (mlib_d64 *)((mlib_addr)(tmp2 + 7) & ~7);
	} else {
		buf1 = (mlib_d64 *)BUFF1;
		buf2 = (mlib_d64 *)BUFF2;
	}

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp11 = sl11 = (mlib_u8 *)y;
	sp12 = sl12 = (mlib_u8 *)y + y_stride;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dp1 = (mlib_u8 *)buf1;
	dp2 = (mlib_u8 *)buf2;
	dl1 = (mlib_u8 *)rgb;
	dl2 = (mlib_u8 *)(rgb + rgb_stride);
	ddp1 = (mlib_d64 *)dp1;
	ddp2 = (mlib_d64 *)dp2;

/*
 * row loop
 */
	for (j = 0; j < height / 2; j++) {
		spy1 = (mlib_d64 *)vis_alignaddr(sp11, 0);
		spy2 = (mlib_d64 *)vis_alignaddr(sp12, 0);

		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;

		vis_alignaddr((void *)off2, 0);
		fu0 = (*dfu++);
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

		vis_alignaddr((void *)off3, 0);
		fv0 = (*dfv++);
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		dy0 = (*spy1++);
		dy4 = (*spy2++);

		dy3 = vis_ld_d64_nf(spy1); spy1++;
		vis_alignaddr(sp11, 0);
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

		dy5 = vis_ld_d64_nf(spy2); spy2++;
		vis_alignaddr(sp12, 0);
		dy2 = vis_faligndata(dy4, dy5);
		dy4 = dy5;

/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* Z*1.1644 */
		z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* Z*1.1644 */
		z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

		temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
		temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

		green1 = vis_fpack16_to_hi(green1, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

		blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
		vis_alignaddr((void *)off2, 0);
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

		red1 = vis_fpack16_to_hi(red1, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
		vis_alignaddr((void *)off3, 0);
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

		green1 = vis_fpack16_to_lo(green1, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

		blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
		red1 = vis_fpack16_to_lo(red1, temp_r_lo);

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

		green2 = vis_fpack16_to_hi(green2, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

		blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

		red2 = vis_fpack16_to_hi(red2, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

		green2 = vis_fpack16_to_lo(green2, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

		blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
		red2 = vis_fpack16_to_lo(red2, temp_r_lo);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/*
 * 16-pixel column loop
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

			vis_write_bmask(0x0801902A, 0);
			dd01 = vis_bshuffle(red1, green1);
			dd02 = vis_bshuffle(red2, green2);
			vis_write_bmask(0x03B04C05, 0);
			dd11 = vis_bshuffle(red1, green1);
			dd12 = vis_bshuffle(red2, green2);
			vis_write_bmask(0xD06E07F0, 0);
			dd21 = vis_bshuffle(red1, green1);
			dd22 = vis_bshuffle(red2, green2);
			vis_write_bmask(0x01834967, 0);
			ddp1[0] = vis_bshuffle(dd01, blue1);
			ddp2[0] = vis_bshuffle(dd02, blue2);
			vis_write_bmask(0xA12B45C7, 0);
			ddp1[1] = vis_bshuffle(dd11, blue1);
			ddp2[1] = vis_bshuffle(dd12, blue2);
			vis_write_bmask(0x0D23E56F, 0);
			ddp1[2] = vis_bshuffle(dd21, blue1);
			ddp2[2] = vis_bshuffle(dd22, blue2);

			dy3 = vis_ld_d64_nf(spy1); spy1++;
			vis_alignaddr(sp11, 0);
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;

			dy5 = vis_ld_d64_nf(spy2); spy2++;
			vis_alignaddr(sp12, 0);
			dy2 = vis_faligndata(dy4, dy5);
			dy4 = dy5;

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* Z*1.1644 */
			z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* Z*1.1644 */
			z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green1 = vis_fpack16_to_hi(green1, temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu); dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			red1 = vis_fpack16_to_hi(red1, temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
			vis_alignaddr((void *)off3, 0);
			fv1 = vis_ld_f32_nf(dfv); dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

			green1 = vis_fpack16_to_lo(green1, temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
			red1 = vis_fpack16_to_lo(red1, temp_r_lo);

/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

			green2 = vis_fpack16_to_hi(green2, temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

			blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

			red2 = vis_fpack16_to_hi(red2, temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

			green2 = vis_fpack16_to_lo(green2, temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

			blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
			red2 = vis_fpack16_to_lo(red2, temp_r_lo);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

			ddp1 += 3;
			ddp2 += 3;
		}

		dp1 = (mlib_u8 *)ddp1;
		dp2 = (mlib_u8 *)ddp2;

		vis_alignaddr((void *)(width - i), 0);
		blue1 = vis_faligndata(blue1, blue1);
		green1 = vis_faligndata(green1, green1);
		red1 = vis_faligndata(red1, red1);
		dp1 += ((width - i - 1) * 3);

		blue2 = vis_faligndata(blue2, blue2);
		green2 = vis_faligndata(green2, green2);
		red2 = vis_faligndata(red2, red2);
		dp2 += ((width - i - 1) * 3);

		vis_alignaddr((void *)7, 0);
		for (; i < width; i++) {
			STORE_PIXEL1(0, 1, 2);
			STORE_PIXEL2(0, 1, 2);
			dp1 -= 3;
			dp2 -= 3;
		}

		sp11 = sl11 = sl11 + y_stride2;
		sp12 = sl12 = sl12 + y_stride2;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;
		__mlib_VectorCopy_U8(dl1, (mlib_u8 *)buf1, width * 3);
		__mlib_VectorCopy_U8(dl2, (mlib_u8 *)buf2, width * 3);

		dl1 = dp1 = dl1 + rgb_stride2;
		dl2 = dp2 = dl2 + rgb_stride2;
		dp1 = (mlib_u8 *)buf1;
		dp2 = (mlib_u8 *)buf2;
		ddp1 = (mlib_d64 *)dp1;
		ddp2 = (mlib_d64 *)dp2;
	}

	if (width * 3 > 16 * 1024) {
		__mlib_free(tmp1);
		__mlib_free(tmp2);
	}
	return (MLIB_SUCCESS);
}
/* Example #13 (scraper marker; original trailing line: 0) */
/*
 * __mlib_VectorConvert_S8_U8_Sat
 *
 * Converts a vector of unsigned 8-bit values into signed 8-bit values
 * with saturation: the scalar paths below store MLIB_S8_MAX for every
 * source element greater than MLIB_S8_MAX and copy all other elements
 * unchanged.
 *
 *   z - destination vector (mlib_s8)
 *   x - source vector (mlib_u8), not modified
 *   n - number of elements
 *
 * Returns MLIB_SUCCESS when the vectorized path runs to completion.
 * Vectors shorter than 16 elements are handed to the PACK_U_S macro,
 * which is defined outside this function.
 * NOTE(review): PACK_U_S presumably converts the whole short vector and
 * returns from the function (including the n <= 0 case) - confirm against
 * the macro definition.
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
/* working copies of the source/destination pointers (const is cast away) */
	mlib_u8 *src = (void *)x;
	mlib_s8 *dst = z;
/*
 * NOTE(review): fzero is declared mlib_d64 but is passed to vis_fpmerge()
 * next to vis_read_hi() results - confirm the intended type.
 */
	mlib_d64 fzero = vis_fzeros();
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_u8 c;
/* bias added to every widened value before packing */
	mlib_d64 dsp = vis_to_double_dup(0x800080);
/* mask XOR-ed into every packed result before it is stored */
	mlib_d64 rst = vis_to_double_dup(0x80808080);
/* multiplier used with vis_fmul8x16al() to widen source bytes */
	mlib_f32 fm = vis_to_float(0x100);

/* short vectors are handled by the PACK_U_S macro */
	if (length < 16) {
		PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
	}

/*
 * First, try to align destination address for 8 bytes.
 * Elements are converted one by one until dst is 8-byte aligned;
 * length >= 16 here, so this loop cannot exhaust the vector.
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
		length--;
	}

/* number of leftover elements handled by the scalar tail loop */
	rest_64 = length & 7;
/* number of whole 8-byte groups handled by the VIS loops */
	len_64 = length >> 3;
/* element count covered by the VIS loops (offset of the scalar tail) */
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
/* NOTE(review): presumably sets the GSR scale field used by fpack16 */
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 * If the number of 8-byte groups is odd, one group is converted here so
 * that the main loop can always process two groups per iteration.
 * i is set to 1 when the peeled iteration ran, otherwise 0.
 */

		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2: two 8-byte groups (16 elements) per iteration.
 * Each group is widened, biased by dsp, packed back to bytes and
 * XOR-ed with rst before being stored.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d4 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			d2 = vis_fpmerge(fzero, vis_read_hi(d4));
			d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
			(*ddst++) = vis_fxor(d4, rst);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions.
 * d2 always holds the most recently loaded aligned source word; each
 * output group is assembled from the previous and the next word.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration (only when the group count is odd).
 */

		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d1 = vis_faligndata(d1, d2);
			d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
			d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpadd16(dsp, d4);
			d1 = vis_fpack16_pair(d3, d4);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2: two 8-byte groups per iteration.
 * The look-ahead loads use vis_ld_d64_nf().
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d6 = vis_faligndata(d1, d2);
			d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d3 = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d6 = vis_fpack16_pair(d4, d5);
			(*ddst++) = vis_fxor(d3, rst);
			(*ddst++) = vis_fxor(d6, rst);
		}
	}

/*
 * Scalar tail: the last rest_64 (< 8) elements. src and dst still point
 * at the first 8-byte-aligned destination element, so the tail starts
 * at offset even_length from both.
 */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c =
			src[even_length + i]) > MLIB_S8_MAX ? MLIB_S8_MAX : c;

	return (MLIB_SUCCESS);
}
/*
 * mlib_v_VideoColorYUV2ABGR422_nonalign
 *
 * Converts a Y/U/V image (separate planes) into a 4-bytes-per-pixel
 * destination without requiring any particular alignment of the source
 * or destination pointers.  All sources are read through
 * vis_alignaddr()/vis_faligndata(), and the destination is written with
 * partial stores (vis_pst_8) whose mask never enables the alpha byte of
 * a pixel.
 *
 *   abgr        - destination image
 *   y, u, v     - source planes
 *   width       - row width; the destination row spans width * 4 bytes
 *   height      - number of rows
 *   abgr_stride - byte step between destination rows
 *   y_stride    - byte step between Y rows
 *   uv_stride   - byte step between U rows and between V rows
 *
 * Always returns MLIB_SUCCESS.
 *
 * NOTE(review): the 4:2:2 layout (one U/V sample per two Y samples in a
 * row) is implied by the function name and by each loaded chroma word
 * being duplicated with vis_fpmerge(f, f) - confirm against the caller.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR422_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to the current position / row start in the u and v planes */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* pointers to the current position / row start in the y plane */
	mlib_u8 *sp1, *sl1;

/* pointers to dst: current position, row start, last byte of the row */
	mlib_u8 *dp, *dl, *dend;

/* aligned pointer to y */
	mlib_d64 *spy;

/* aligned pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;

/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/*
 * 2.0184*8192 in the lower half; the upper half (0x0100) is the factor
 * used by vis_fmul8x16au() on the packed green bytes below
 */
	mlib_f32 k5 = vis_to_float(0x1004097);
/* offsets subtracted from / added to the chroma terms of r, g and b */
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
	mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo;
	mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo;
/*
 * dd0/dd1 alternate as "previous" and "current" output words for
 * vis_faligndata(); dd is the realigned word that gets stored.
 * NOTE(review): dd0 is read by the first vis_faligndata() of a row
 * before it has been assigned; the affected bytes appear to be either
 * masked out or overwritten by the following store - confirm.
 */
	mlib_d64 dd, dd0, dd1;

/* loop variable */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
/* mask for the first (and the very last) partial store of a row */
	mlib_s32 emask1;
/* value handed to vis_alignaddr() before realigning output words */
	mlib_s32 off;
/* 4-byte-aligned pointers into the u and v rows */
	mlib_f32 *dfu, *dfv;
	mlib_d64 du0, du1, dv0, dv1;
/* values handed to vis_alignaddr() before realigning u and v data */
	mlib_s32 off2, off3;
/* 0 or 1: how far dpp advances after the first store of a row */
	mlib_s32 inc;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/*
 * u and v are loaded as aligned 4-byte words; the byte misalignment is
 * doubled because every loaded word is widened from mlib_f32 to
 * mlib_d64 by vis_fpmerge(f, f) before vis_faligndata() is applied
 */
		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;

/* last byte of this destination row and the mask of the first store */
		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);
/* i = byte offset of dp inside its aligned 8-byte word */
		i = dp - (mlib_u8 *)dpp;
/* shift the "skip alpha" pattern by the same offset */
		emask >>= i;
/* dpp stays in place after the first store if that store was a full word */
		inc = (emask1 != 0xff);
		emask1 &= emask;
		off = 8 - i;

/* prime the u pipeline: du holds the first realigned 8 bytes */
		vis_alignaddr((void *)off2, 0);
		fu0 = vis_ld_f32_nf(dfu); dfu++;
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* prime the v pipeline in the same way */
		vis_alignaddr((void *)off3, 0);
		fv0 = vis_ld_f32_nf(dfv); dfv++;
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* prime the y pipeline: dy1 holds the first 8 realigned y bytes */
		vis_alignaddr(sp1, 0);
		dy0 = vis_ld_d64_nf(spy); spy++;
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/*
 * column loop: i advances by 8 per iteration; each iteration issues
 * four 8-byte partial stores to the destination.
 * The GSR align field is switched between the u, v, dst and y offsets
 * inside the loop, so the order of the vis_alignaddr()/vis_faligndata()
 * calls is significant.
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

/* add the luma term to each chroma term and pack to bytes */
			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

/* fetch and realign the next 8 bytes of (duplicated) u data */
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu); dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);
			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

/* fetch and realign the next 8 bytes of (duplicated) v data */
			vis_alignaddr((void *)off3, 0);

			fv1 = vis_ld_f32_nf(dfv); dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* switch the align offset to the destination for the four stores below */
			vis_alignaddr((void *)off, 0);
/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
/* first store of the iteration uses emask1 (row-start mask on pass one) */
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_hi),
				vis_read_lo(blue_red_hi));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_lo),
				vis_read_hi(blue_red_lo));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_lo),
				vis_read_lo(blue_red_lo));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

/* fetch and realign the next 8 bytes of y */
			vis_alignaddr(sp1, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;
/* after the first iteration every store uses the plain alpha-skip mask */
			emask1 = emask;
		}

/*
 * tail: fewer than 8 columns remain.  One more block is computed from
 * the already loaded y/u/v data and up to three 8-byte words are stored,
 * i advancing by 2 per stored word.
 */
		if (i < width) {

			vis_alignaddr((void *)off, 0);
/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);

			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dd0 = dd1;
			dpp += inc;

			i += 2;

			if (i < width) {

				dd1 = vis_fpmerge(vis_read_lo(x_green_hi),
					vis_read_lo(blue_red_hi));
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 2;

				if (i < width) {
					dd1 = vis_fpmerge(vis_read_hi
						(x_green_lo),
						vis_read_hi(blue_red_lo));
					dd = vis_faligndata(dd0, dd1);
					vis_pst_8(dd, dpp++, emask);
					dd0 = dd1;
				}
			}
		}

/*
 * flush: store the bytes still pending in dd0/dd1, limited by the edge
 * mask up to dend and by the alpha-skip mask
 */
		vis_alignaddr((void *)off, 0);
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

/* advance all planes to the next row */
		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
/* emask was shifted for this row's alignment; restore it */
		emask = 0x7777;
	}

	return (MLIB_SUCCESS);
}
/* Example #15 (scraper marker; original trailing line: 0) */
/*
 * __mlib_VideoUpSample420
 *
 * Produces two output rows of 2 * n bytes each (dst0 and dst1) from
 * three input rows of n bytes each (src0, src1, src2).
 *
 * Vertically, src1 is weighted by 3 and combined with src0 (for dst0)
 * or with src2 (for dst1), each weighted by 1.  These column sums are
 * then combined horizontally with the weights 3 and 1 (fthree1/fone1)
 * and the rounding terms 8 and 7 (deight/dseven).
 *
 *   dst0, dst1       - destination rows, 2 * n bytes each
 *   src0, src1, src2 - source rows, n bytes each
 *   n                - number of source samples per row
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The work is done in two passes: the first pass writes the interpolated
 * samples to dst0/dst1, the second pass re-reads both rows and rewrites
 * them through vis_faligndata() after the GSR has been reprogrammed.
 * Finally the first and the last sample of both rows are recomputed in
 * scalar code.
 *
 * NOTE(review): all five pointers are cast to mlib_d64 * and most
 * accesses are plain loads/stores, so 8-byte alignment of the buffers
 * appears to be required - confirm against the public API contract.
 */
mlib_status
__mlib_VideoUpSample420(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src0,
	const mlib_u8 *src1,
	const mlib_u8 *src2,
	mlib_s32 n)
{
/* last valid byte of dst0; also used to build the edge masks for dst1 */
	mlib_u8 *dend0 = dst0 + 2 * n - 1;
	mlib_d64 *dp0 = (mlib_d64 *)dst0;
	mlib_d64 *dp1 = (mlib_d64 *)dst1;
	mlib_d64 *sp0 = (mlib_d64 *)src0;
	mlib_d64 *sp1 = (mlib_d64 *)src1;
	mlib_d64 *sp2 = (mlib_d64 *)src2;
	mlib_d64 d00, d01, d10, d11, d20, d21;
/*
 * column sums (src0 + 3 * src1 for row 0, src2 + 3 * src1 for row 1):
 * "last" is the block being output, "this" is the block just loaded,
 * "shift" is the block realigned with vis_faligndata()
 */
	mlib_d64 thiscolsum0_hi, thiscolsum0_lo, lastcolsum0_hi, lastcolsum0_lo;
	mlib_d64 shiftcolsum0_hi, shiftcolsum0_lo;
	mlib_d64 thiscolsum1_hi, thiscolsum1_lo, lastcolsum1_hi, lastcolsum1_lo;
	mlib_d64 shiftcolsum1_hi, shiftcolsum1_lo;
	mlib_d64 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
	mlib_d64 ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7;
	mlib_d64 data0, data1, data2, data3, tmp0, tmp1;
/* vertical weights for vis_fmul8x16au(): 1 for src0/src2, 3 for src1 */
	mlib_f32 fone = vis_to_float(0x4000000);
	mlib_f32 fthree = vis_to_float(0xC000000);
/* horizontal weights for vis_fmul8x16() */
	mlib_f32 fone1 = vis_to_float(0x40404040);
	mlib_f32 fthree1 = vis_to_float(0xC0C0C0C0);
/* rounding terms added before packing */
	mlib_d64 dseven = vis_to_double_dup(0x70007);
	mlib_d64 deight = vis_to_double_dup(0x80008);
	mlib_s32 i, emask;

	if (n <= 0)
		return (MLIB_FAILURE);

/*
 * NOTE(review): presumably GSR scale = 3 (for vis_fpack16_pair) and
 * align offset = 2 (so vis_faligndata shifts the column sums by one
 * 16-bit element) - confirm
 */
	vis_write_gsr((3 << 3) + 2);

/* prime the pipeline with the column sums of the first 8 source bytes */
	d00 = vis_ld_d64_nf(sp0);
	d10 = vis_ld_d64_nf(sp1);
	d20 = vis_ld_d64_nf(sp2);
	sp0++;
	sp1++;
	sp2++;
	lastcolsum0_hi = vis_fmul8x16au(vis_read_hi(d00), fone);
	lastcolsum0_lo = vis_fmul8x16au(vis_read_lo(d00), fone);
	lastcolsum1_hi = vis_fmul8x16au(vis_read_hi(d20), fone);
	lastcolsum1_lo = vis_fmul8x16au(vis_read_lo(d20), fone);
	tmp0 = vis_fmul8x16au(vis_read_hi(d10), fthree);
	tmp1 = vis_fmul8x16au(vis_read_lo(d10), fthree);
	lastcolsum0_hi = vis_fpadd16(lastcolsum0_hi, tmp0);
	lastcolsum0_lo = vis_fpadd16(lastcolsum0_lo, tmp1);
	lastcolsum1_hi = vis_fpadd16(lastcolsum1_hi, tmp0);
	lastcolsum1_lo = vis_fpadd16(lastcolsum1_lo, tmp1);

/*
 * main loop: 8 source samples -> 16 output bytes per row per iteration.
 * The loop stops while at least one block is still pending, so the block
 * after this loop always runs.
 */
#pragma pipeloop(0)
	for (i = 0; i < n - 8; i += 8) {
		d01 = *sp0;
		d11 = *sp1;
		d21 = *sp2;
		sp0++;
		sp1++;
		sp2++;

/* column sums of the block that follows the one being output */
		thiscolsum0_hi = vis_fmul8x16au(vis_read_hi(d01), fone);
		thiscolsum0_lo = vis_fmul8x16au(vis_read_lo(d01), fone);
		thiscolsum1_hi = vis_fmul8x16au(vis_read_hi(d21), fone);
		thiscolsum1_lo = vis_fmul8x16au(vis_read_lo(d21), fone);

		tmp0 = vis_fmul8x16au(vis_read_hi(d11), fthree);
		tmp1 = vis_fmul8x16au(vis_read_lo(d11), fthree);

		thiscolsum0_hi = vis_fpadd16(thiscolsum0_hi, tmp0);
		thiscolsum0_lo = vis_fpadd16(thiscolsum0_lo, tmp1);
		thiscolsum1_hi = vis_fpadd16(thiscolsum1_hi, tmp0);
		thiscolsum1_lo = vis_fpadd16(thiscolsum1_lo, tmp1);

/* current column sums weighted by 1 (acc0/1/4/5) and by 3 (acc2/3/6/7) */
		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

/* shifted column sums; the low half pulls from the next block */
		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, thiscolsum0_hi);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, thiscolsum1_hi);

/* rounding: 8 for the 1-weighted terms, 7 for the 3-weighted terms */
		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

/* shifted column sums get the complementary weight (3 with 1, 1 with 3) */
		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

/* pack the 16-bit sums back to bytes */
		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

/* interleave the two result phases into 16 output bytes per row */
		dp0[0] = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		dp0[1] = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0));
		dp1[0] = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));
		dp1[1] = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2));

		dp0 += 2;
		dp1 += 2;
		lastcolsum0_hi = thiscolsum0_hi;
		lastcolsum0_lo = thiscolsum0_lo;
		lastcolsum1_hi = thiscolsum1_hi;
		lastcolsum1_lo = thiscolsum1_lo;
	}

/*
 * last block: same arithmetic as above, but there is no following block,
 * so the low shifted sums reuse lastcolsum*_lo, and the results are
 * written with edge-masked partial stores.
 */
	if (i < n) {

		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, lastcolsum0_lo);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, lastcolsum1_lo);

		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

		acc0 = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		acc1 = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));

/* the mask is derived from dst0 only and applied to both rows */
		emask = vis_edge8(dp0, dend0);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
/* one 8-byte output word covers 4 source samples */
		i += 4;
		dp0++;
		dp1++;

		if (i < n) {
			acc0 = vis_fpmerge(vis_read_lo(data1),
				vis_read_lo(data0));
			acc1 = vis_fpmerge(vis_read_lo(data3),
				vis_read_lo(data2));

			emask = vis_edge8(dp0, dend0);
			vis_pst_8(acc0, dp0, emask);
			vis_pst_8(acc1, dp1, emask);
		}
	}

/*
 * second pass: re-read both output rows and rewrite them in place
 * through vis_faligndata(previous word, current word).
 * NOTE(review): presumably GSR align offset 7 moves every output byte
 * one position to the right - confirm
 */
	vis_write_gsr(7);

	dp0 = (mlib_d64 *)dst0;
	dp1 = (mlib_d64 *)dst1;

	ac0 = *dp0;
	ac2 = *dp1;

#pragma pipeloop(0)
	for (i = 0; i < 2 * n - 8; i += 8) {
		ac1 = *dp0;
		ac3 = *dp1;
		*dp0 = vis_faligndata(ac0, ac1);
		*dp1 = vis_faligndata(ac2, ac3);
		dp0++;
		dp1++;
		ac0 = ac1;
		ac2 = ac3;
	}

/* last (possibly partial) output word of each row */
	if (i < 2 * n) {
		ac1 = vis_ld_d64_nf(dp0);
		ac3 = vis_ld_d64_nf(dp1);
		emask = vis_edge8(dp0, dend0);
		acc0 = vis_faligndata(ac0, ac1);
		acc1 = vis_faligndata(ac2, ac3);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
	}

/*
 * the first and the last output sample of each row are recomputed in
 * scalar code from the first and the last source column respectively
 */
	dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4;
	dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4;
	dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4;
	dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4;

	return (MLIB_SUCCESS);
}
static mlib_status
mlib_v_VideoColorYUV2ABGR411_dst_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to src address */
	mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;

/* pointers to dst address */
	mlib_u8 *dp, *dl, *dend;

/* all. pointer to y */
	mlib_d64 *spy;

/* all. pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy0, dy1, dy2;
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
	mlib_d64 dd, dd0, dd1, dtmp;

/* used to load u, v into mlib_f32 */
	mlib_f32 ffu[1], ffv[1];

/* used to load u, v into mlib_f32 */
	mlib_u8 *ufu, *vfu;

/* 1.1644  * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);

/* 2.0184  * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);

/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);

/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);

/* 1.5966  * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);

/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

/* 135.6352  * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
	mlib_f32 fscale = vis_to_float(0x80808080);

/* loop variables */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_d64 *buf;
	mlib_s32 inc;

	ufu = (mlib_u8 *)ffu;
	vfu = (mlib_u8 *)ffv;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(3 << 3);

	buf = (mlib_d64 *)__mlib_malloc((width / 8 + 1) * sizeof (mlib_d64));

	if (buf == NULL)
		return (MLIB_FAILURE);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = buf;
		dy0 = vis_ld_d64_nf(spy); spy++;

#pragma pipeloop(0)
		for (i = 0; i < width; i += 8) {
			dy1 = vis_ld_d64_nf(spy); spy++;
			(*dpp++) = vis_faligndata(dy0, dy1);
			dy0 = dy1;
		}

		spy = buf;

		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);

		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
		i = dp - (mlib_u8 *)dpp;
		emask >>= i;
		vis_alignaddr((void *)(8 - i), 0);
		inc = (emask1 != 0xff);
		emask1 &= emask;

		ufu[0] = vis_ld_u8_nf(sp2);
		ufu[1] = vis_ld_u8_nf(sp2 + 1);
		ufu[2] = vis_ld_u8_nf(sp2 + 2);
		ufu[3] = vis_ld_u8_nf(sp2 + 3);
		vfu[0] = vis_ld_u8_nf(sp3);
		vfu[1] = vis_ld_u8_nf(sp3 + 1);
		vfu[2] = vis_ld_u8_nf(sp3 + 2);
		vfu[3] = vis_ld_u8_nf(sp3 + 3);
		sp2 += 4;
		sp3 += 4;

		fu = ffu[0];
		fv = ffv[0];

/*
 * 16-pixel column loop
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 16; i += 16) {

			dy1 = (*spy++);
			dy2 = (*spy++);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ufu[0] = vis_ld_u8_nf(sp2);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);
			ufu[1] = vis_ld_u8_nf(sp2 + 1);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			ufu[2] = vis_ld_u8_nf(sp2 + 2);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);
			ufu[3] = vis_ld_u8_nf(sp2 + 3);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);
			vfu[0] = vis_ld_u8_nf(sp3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);
			vfu[1] = vis_ld_u8_nf(sp3 + 1);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);
			vfu[2] = vis_ld_u8_nf(sp3 + 2);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);
			vfu[3] = vis_ld_u8_nf(sp3 + 3);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
			fu = ffu[0];
			fv = ffv[0];
			sp2 += 4;
			sp3 += 4;
			emask1 = emask;
		}

		if (i <= width - 8) {

			dy1 = (*spy++);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ufu[0] = ufu[2];

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			vfu[0] = vfu[2];

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr = vis_fpack16_pair(dr1, dr2);
			dg = vis_fpack16_pair(dg1, dg2);
			db = vis_fpack16_pair(db1, db2);

			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			fu = ffu[0];
			fv = ffv[0];

			i += 8;
			emask1 = emask;
		}

		if (i < width) {

			dy1 = vis_ld_d64_nf(spy);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			fu = vis_fpack16(db1);

			dg2 = vis_fpmerge(fu, vis_fpack16(dg1));
			dg3 = vis_fpmerge(fu, vis_fpack16(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
		}

		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
		emask = 0x7777;
	}
	__mlib_free(buf);
	return (MLIB_SUCCESS);
}
/*
 * JFIF YCbCr -> RGB conversion of two luma rows (y0, y1) that share a single
 * row of chroma samples (cb, cr).  Row y0 is written to rgb0 and row y1 to
 * rgb1 as interleaved 3-byte pixels.
 *
 * Each 32-bit chroma load is merged with itself (vis_fpmerge(fu, fu)), so
 * every chroma byte is used for two neighbouring luma pixels, and the same
 * chroma terms (r_*, g_*, b_*) are added to both luma rows.
 *
 * The code is software pipelined: the prologue before the main loop computes
 * the first eight pixels of both rows, and each loop iteration stores the
 * eight pixels computed previously while computing the next eight.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): the main loop dereferences rgb0/rgb1 and y0/y1 as mlib_d64
 * and cb/cr as mlib_f32 without any realignment, so it presumably relies on
 * the caller passing suitably aligned buffers -- confirm against the public
 * API contract.
 * NOTE(review): the loop counter advances by 8 for every 8 output pixels per
 * row and is compared against n; confirm which array length n denotes.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB420_Nearest(
	mlib_u8 *rgb0,
	mlib_u8 *rgb1,
	const mlib_u8 *y0,
	const mlib_u8 *y1,
	const mlib_u8 *cb,
	const mlib_u8 *cr,
	mlib_s32 n)
{
/* pointers to dst address */
	mlib_u8 *dp1, *dp2;

/* pointers to the two luma rows, read 8 bytes at a time */
	mlib_d64 *spy1, *spy2;

/* pointers to cb, cr, read 4 bytes at a time */
	mlib_f32 *dfu, *dfv;

/* cb, cr data */
	mlib_f32 fu, fv;

/* y data (dy1: row y0, dy2: row y1) and duplicated chroma */
	mlib_d64 dy1, dy2;
	mlib_d64 du, dv;

/* (1.00000, 1.40200)*8192 */
	mlib_f32 k12 = vis_to_float(0x20002cdd);

/* (-.34414, -.71414)*8192 */
	mlib_f32 k34 = vis_to_float(0xf4fde926);

/* 1.77200*8192 (low 16 bits; only the low half is used, via fmul8x16al) */
	mlib_f32 k5 = vis_to_float(0x10038b4);

/* (179.45600 - 0.5)*32 */
	mlib_d64 k_179_456 = vis_to_double(0x165f165f, 0x165f165f);

/* (135.45984 + 0.5)*32 */
	mlib_d64 k_135_45984 = vis_to_double(0x10ff10ff, 0x10ff10ff);

/* (226.81600 - 0.5)*32 */
	mlib_d64 k_226_816 = vis_to_double(0x1c4a1c4a, 0x1c4a1c4a);
/*
 * Intermediate products.  The digits in these names (3920, 20184, 15966,
 * 8132, 11644) do not match the coefficients actually applied; the values
 * are the products with k34, k5 and k12 declared above.
 */
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 z_11644_hi, z_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
/* loop variable */
	mlib_s32 i;
/* packed colour planes and output staging: *1 for rgb0, *2 for rgb1 */
	mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21;
	mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22;

	if (n <= 0)
		return (MLIB_FAILURE);

/*
 * initialize GSR: (2 << 3) places 2 in the scale field used by the
 * vis_fpack16 family, the low three bits (alignment offset) are set to 7
 */
	vis_write_gsr((2 << 3) + 7);

	dp1 = (mlib_u8 *)rgb0;
	dp2 = (mlib_u8 *)rgb1;
	ddp1 = (mlib_d64 *)dp1;
	ddp2 = (mlib_d64 *)dp2;

	spy1 = (mlib_d64 *)y0;
	spy2 = (mlib_d64 *)y1;
	dfu = (mlib_f32 *)cb;
	dfv = (mlib_f32 *)cr;

/*
 * Pipeline prologue: compute the first 8 pixels of both rows.
 * The "_nf" loads are the non-faulting variants.
 */
	fu = vis_ld_f32_nf(dfu);
	dfu++;
	fv = vis_ld_f32_nf(dfv);
	dfv++;

/* duplicate every chroma byte: 4 samples -> 8 bytes */
	du = vis_fpmerge(fu, fu);
	dv = vis_fpmerge(fv, fv);

/* Cb * (-.34414): upper 16 bits of k34 */
	u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* Cr * (-.71414): lower 16 bits of k34 */
	v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* Cb * (-.34414) */
	u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* Cr * (-.71414) */
	v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

	dy1 = vis_ld_d64_nf(spy1);
	spy1++;
	dy2 = vis_ld_d64_nf(spy2);
	spy2++;

/* Cb * 1.77200: lower 16 bits of k5 */
	u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
	g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

	u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
	g_hi = vis_fpadd16(g_hi, k_135_45984);

/* Cr * 1.40200: lower 16 bits of k12 */
	v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
	g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

	v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
	g_lo = vis_fpadd16(g_lo, k_135_45984);

/* Y (row y0) * 1.00000: upper 16 bits of k12 */
	y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
	b_hi = vis_fpsub16(u_20184_hi, k_226_816);

/* Y (row y0) * 1.00000 */
	y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
	b_lo = vis_fpsub16(u_20184_lo, k_226_816);

/* Y (row y1) * 1.00000 */
	z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
	r_hi = vis_fpsub16(v_15966_hi, k_179_456);

/* Y (row y1) * 1.00000 */
	z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
	r_lo = vis_fpsub16(v_15966_lo, k_179_456);

/* row y0: add luma to the chroma terms and pack to bytes */
	temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
	temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

	green1 = vis_fpack16_to_hi(green1, temp_g_hi);
	temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

	blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
	temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
/* chroma for the next group of 8 pixels */
	fu = vis_ld_f32_nf(dfu);
	dfu++;

	red1 = vis_fpack16_to_hi(red1, temp_r_hi);
	temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
	fv = vis_ld_f32_nf(dfv);
	dfv++;

	green1 = vis_fpack16_to_lo(green1, temp_g_lo);
	temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

	blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
	du = vis_fpmerge(fu, fu);

	red1 = vis_fpack16_to_lo(red1, temp_r_lo);
	dv = vis_fpmerge(fv, fv);

/* Cb * (-.34414) for the next group; row y1 of the current group follows */
	u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
	temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* Cr * (-.71414) for the next group */
	v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
	temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

	green2 = vis_fpack16_to_hi(green2, temp_g_hi);
	temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

	blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
	temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

	red2 = vis_fpack16_to_hi(red2, temp_r_hi);
	temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

	green2 = vis_fpack16_to_lo(green2, temp_g_lo);
	temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

	blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
	red2 = vis_fpack16_to_lo(red2, temp_r_lo);

	u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
	v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* luma for the next group of 8 pixels */
	dy1 = vis_ld_d64_nf(spy1);
	spy1++;
	dy2 = vis_ld_d64_nf(spy2);
	spy2++;

/*
 * main loop: each iteration stores 8 finished pixels per row (3 x 8 bytes
 * to each of rgb0 and rgb1) and computes the following 8 pixels
 */
#pragma pipeloop(0)
	for (i = 0; i <= n - 8; i += 8) {

/*
 * Interleave the red/green/blue byte planes into 24 output bytes per row:
 * three shuffles combine red and green, three more insert blue.
 */
		vis_write_bmask(0x0801902A, 0);
		dd01 = vis_bshuffle(red1, green1);
		dd02 = vis_bshuffle(red2, green2);
		vis_write_bmask(0x03B04C05, 0);
		dd11 = vis_bshuffle(red1, green1);
		dd12 = vis_bshuffle(red2, green2);
		vis_write_bmask(0xD06E07F0, 0);
		dd21 = vis_bshuffle(red1, green1);
		dd22 = vis_bshuffle(red2, green2);
		vis_write_bmask(0x01834967, 0);
		ddp1[0] = vis_bshuffle(dd01, blue1);
		ddp2[0] = vis_bshuffle(dd02, blue2);
		vis_write_bmask(0xA12B45C7, 0);
		ddp1[1] = vis_bshuffle(dd11, blue1);
		ddp2[1] = vis_bshuffle(dd12, blue2);
		vis_write_bmask(0x0D23E56F, 0);
		ddp1[2] = vis_bshuffle(dd21, blue1);
		ddp2[2] = vis_bshuffle(dd22, blue2);

/* Cb * 1.77200: lower 16 bits of k5 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_45984);

/* Cr * 1.40200: lower 16 bits of k12 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_45984);

/* Y (row y0) * 1.00000: upper 16 bits of k12 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_226_816);

/* Y (row y0) * 1.00000 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_226_816);

/* Y (row y1) * 1.00000 */
		z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
		r_hi = vis_fpsub16(v_15966_hi, k_179_456);

/* Y (row y1) * 1.00000 */
		z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
		r_lo = vis_fpsub16(v_15966_lo, k_179_456);

/* row y0: add luma to the chroma terms and pack to bytes */
		temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
		temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

		green1 = vis_fpack16_to_hi(green1, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

		blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
/* chroma for the next group of 8 pixels */
		fu = vis_ld_f32_nf(dfu);
		dfu++;

		red1 = vis_fpack16_to_hi(red1, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
		fv = vis_ld_f32_nf(dfv);
		dfv++;

		green1 = vis_fpack16_to_lo(green1, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

		blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
		du = vis_fpmerge(fu, fu);

		red1 = vis_fpack16_to_lo(red1, temp_r_lo);
		dv = vis_fpmerge(fv, fv);

/* Cb * (-.34414) for the next group; row y1 of the current group follows */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* Cr * (-.71414) for the next group */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

		green2 = vis_fpack16_to_hi(green2, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

		blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

		red2 = vis_fpack16_to_hi(red2, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

		green2 = vis_fpack16_to_lo(green2, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

		blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
		red2 = vis_fpack16_to_lo(red2, temp_r_lo);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* luma for the next group of 8 pixels */
		dy1 = vis_ld_d64_nf(spy1);
		spy1++;
		dy2 = vis_ld_d64_nf(spy2);
		spy2++;

		ddp1 += 3;
		ddp2 += 3;
	}

/*
 * Tail: red/green/blue still hold one computed group of 8 pixels, of which
 * only the first (n - i) are wanted.  Each plane is rotated by (n - i)
 * bytes and the destination pointers are moved to the last leftover pixel;
 * the loop below then walks backwards one 3-byte pixel at a time.
 * STORE_PIXEL1 / STORE_PIXEL2 are macros defined outside this function --
 * presumably they store one pixel from the *1 / *2 planes and advance the
 * planes using the alignment offset of 7 set just before the loop; verify
 * against their definition.
 */
	dp1 = (mlib_u8 *)ddp1;
	dp2 = (mlib_u8 *)ddp2;

	vis_alignaddr((void *)(n - i), 0);
	blue1 = vis_faligndata(blue1, blue1);
	green1 = vis_faligndata(green1, green1);
	red1 = vis_faligndata(red1, red1);
	dp1 += ((n - i - 1) * 3);

	blue2 = vis_faligndata(blue2, blue2);
	green2 = vis_faligndata(green2, green2);
	red2 = vis_faligndata(red2, red2);
	dp2 += ((n - i - 1) * 3);

	vis_alignaddr((void *)7, 0);
	for (; i < n; i++) {
		STORE_PIXEL1(0, 1, 2);
		STORE_PIXEL2(0, 1, 2);
		dp1 -= 3;
		dp2 -= 3;
	}

	return (MLIB_SUCCESS);
}
/* Example #18 */
/* 0 */
/*
 * Saturating conversion of a signed 8-bit vector into an unsigned 8-bit
 * vector: every negative element becomes 0, every other element is copied.
 *
 *   z - destination vector (mlib_u8)
 *   x - source vector (mlib_s8)
 *   n - number of elements
 *
 * Vectors shorter than 16 elements are handled by the PACK_S_U macro, which
 * is defined outside this function (presumably a scalar loop that returns
 * from the function -- verify against its definition).  Longer vectors are
 * processed in three phases: a scalar head until dst is 8-byte aligned, a VIS
 * loop producing 8 bytes per store, and a scalar tail for the remaining
 * (length & 7) elements.
 *
 * Returns MLIB_SUCCESS on the path visible here.
 */
mlib_status
__mlib_VectorConvert_U8_S8_Sat(
	mlib_u8 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i, off;
	mlib_s8 c;
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 zero = vis_fzeros();

	if (length < 16) {
		PACK_S_U(mlib_s8, mlib_u8);
	}

/*
 * First, try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		c = (*src++);
		(*dst++) = c < 0 ? 0 : c;
		length--;
	}

/*
 * src and dst now stay fixed at the start of the vectorised region;
 * the VIS loops advance dsrc/ddst and the scalar tail indexes from
 * even_length.
 */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
/* GSR scale factor 7 for vis_fpack16_pair */
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration when the number of 8-byte groups is odd,
 * so that the loop below can always consume two groups.
 */

		i = len_64 & 1;
		if (i != 0) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}

/*
 * Then loop with step==2. Unroll for 2 iterations.
 * Each source byte is widened into a 16-bit lane (merged with a zero
 * byte), and vis_fpack16_pair packs the lanes back into 8 bytes.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions.
 * The byte-shuffle mask is offset by the source misalignment
 * (0x11111111 * off), so vis_bshuffle() both realigns the two
 * neighbouring 8-byte words and reorders their bytes in one step.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		off = (mlib_addr)src & 7;
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		i = len_64 & 1;
		if (i != 0) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}

/*
 * Then loop with step==2.
 * The look-ahead loads use the non-faulting variant because the last
 * one may touch the aligned word following the source data.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d6 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8sux16(d6, four_16_ones);
			d6 = vis_faligndata(d6, d6);
			d5 = vis_fmul8sux16(d6, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}
	}

/*
 * Scalar tail: the last (length & 7) elements.
 */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];
		dst[even_length + i] = c < 0 ? 0 : c;
	}

	return (MLIB_SUCCESS);
}
/* Example #19 */
/* 0 */
void ADD_SUFF(ByteGrayToIntArgbConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 d0, d1, d2, d3;
    mlib_f32 ff, aa = vis_fones();
    mlib_s32 i, j, x;

    if (width < 8) {
        for (j = 0; j < height; j++) {
            mlib_u8  *src = srcBase;
            mlib_s32 *dst = dstBase;

            for (i = 0; i < width; i++) {
                x = src[i];
                dst[i] = Gray2Argb(x);
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
        return;
    }

    if (srcScan == width && dstScan == 4*width) {
        width *= height;
        height = 1;
    }

    for (j = 0; j < height; j++) {
        mlib_u8  *src = srcBase;
        mlib_s32 *dst = dstBase;
        mlib_s32 *dst_end;

        dst_end = dst + width;

        while (((mlib_s32)src & 3) && dst < dst_end) {
            x = *src++;
            *dst++ = Gray2Argb(x);
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 4); dst += 4) {
            ff = *(mlib_f32*)src;
            d0 = vis_fpmerge(aa, ff);
            d1 = vis_fpmerge(ff, ff);
            d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1));
            d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1));
            ((mlib_f32*)dst)[0] = vis_read_hi(d2);
            ((mlib_f32*)dst)[1] = vis_read_lo(d2);
            ((mlib_f32*)dst)[2] = vis_read_hi(d3);
            ((mlib_f32*)dst)[3] = vis_read_lo(d3);
            src += 4;
        }

        while (dst < dst_end) {
            x = *src++;
            *dst++ = Gray2Argb(x);
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}
/* Example #20 */
/* 0 */
/*
 * Saturating conversion of a signed 16-bit vector into a signed 8-bit
 * vector: every element is clamped to [MLIB_S8_MIN, MLIB_S8_MAX].
 *
 *   z - destination vector (mlib_s8)
 *   x - source vector (mlib_s16)
 *   n - number of elements
 *
 * Vectors shorter than 16 elements are handled by the PACK_S_S macro, which
 * is defined outside this function (presumably a scalar loop that returns
 * from the function -- verify against its definition).  Longer vectors are
 * processed in three phases: a scalar head until dst is 8-byte aligned, a VIS
 * loop producing 8 output bytes (from 16 source bytes) per store, and a
 * scalar tail for the remaining (length & 7) elements.
 *
 * Returns MLIB_SUCCESS on the path visible here.
 */
mlib_status
__mlib_VectorConvert_S8_S16_Sat(
	mlib_s8 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	mlib_s16 *src = (void *)x;
	mlib_s8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6, d7;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_s16 c;

	if (n < 16) {
		PACK_S_S(mlib_s16, mlib_s8, MLIB_S8_MAX, MLIB_S8_MIN);
	}

/*
 * First try to align destination address for 8 bytes .
 * The pointer is converted through mlib_addr (as for src below) rather
 * than mlib_s32, so it is not narrowed where pointers are wider than
 * 32 bits.
 */

	while ((mlib_addr)dst & 7) {
		c = (*src++);
		(*dst++) = c < MLIB_S8_MIN ? MLIB_S8_MIN
			: (c > MLIB_S8_MAX ? MLIB_S8_MAX : c);
		length--;
	}

/*
 * src and dst now stay fixed at the start of the vectorised region;
 * the VIS loops advance dsrc/ddst and the scalar tail indexes from
 * even_length.
 */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
/*
 * GSR: upper word 0x082A4C6E is the byte mask used by vis_bshuffle(),
 * (8 << 3) is the scale factor, 2 is the alignment offset used by
 * vis_faligndata() in the aligned-source path.
 */
	vis_write_gsr64(((mlib_u64)0x082A4C6E << 32) | (8 << 3) | 2);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling of 1 iteration when the number of 8-element groups is odd.
 */

		i = len_64 & 1;
		if (i != 0) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = vis_fpackfix_pair(d1, d2);
			d1 = vis_faligndata(d1, d1);
			d2 = vis_faligndata(d2, d2);
			d4 = vis_fpackfix_pair(d1, d2);
			(*ddst++) = vis_bshuffle(d3, d4);
		}

/*
 * Then loop with step==2.
 */
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = vis_fpackfix_pair(d1, d2);
			d1 = vis_faligndata(d1, d1);
			d2 = vis_faligndata(d2, d2);
			d4 = vis_fpackfix_pair(d1, d2);
			(*ddst++) = vis_bshuffle(d3, d4);
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = vis_fpackfix_pair(d1, d2);
			d1 = vis_faligndata(d1, d1);
			d2 = vis_faligndata(d2, d2);
			d4 = vis_fpackfix_pair(d1, d2);
			(*ddst++) = vis_bshuffle(d3, d4);
		}
	} else {

/*
 * Source address is arbitrary aligned. Use vis_alignaddr() and
 * vis_faligndata() functions.
 * vis_alignaddr() replaces the alignment offset programmed above,
 * which is presumably why this path rearranges the 16-bit lanes with
 * vis_fpack32()/vis_fpmerge() instead of vis_faligndata(d, d).
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		i = len_64 & 1;
		if (i != 0) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d4 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d5 = vis_faligndata(d1, d2);

			d3 = vis_fpackfix_pair(d4, d5);
			d4 = vis_fpack32(d4, d4);
			d4 = vis_fpack32(d4, d4);
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d5 = vis_fpmerge(vis_read_lo(d5), vis_read_hi(d5));
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d4 = vis_fpackfix_pair(d4, d5);
			(*ddst++) = vis_bshuffle(d3, d4);
		}

/*
 * Then loop with step==2.
 * Only the last look-ahead load uses the non-faulting variant.
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = (*dsrc++);
			d4 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d5 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d6 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d7 = vis_faligndata(d1, d2);

			d3 = vis_fpackfix_pair(d4, d5);
			d4 = vis_fpack32(d4, d4);
			d4 = vis_fpack32(d4, d4);
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d5 = vis_fpmerge(vis_read_lo(d5), vis_read_hi(d5));
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d4 = vis_fpackfix_pair(d4, d5);

			d5 = vis_fpackfix_pair(d6, d7);
			d6 = vis_fpack32(d6, d6);
			d6 = vis_fpack32(d6, d6);
			d7 = vis_fpmerge(vis_read_hi(d7), vis_read_lo(d7));
			d7 = vis_fpmerge(vis_read_lo(d7), vis_read_hi(d7));
			d7 = vis_fpmerge(vis_read_hi(d7), vis_read_lo(d7));
			d6 = vis_fpackfix_pair(d6, d7);

			(*ddst++) = vis_bshuffle(d3, d4);
			(*ddst++) = vis_bshuffle(d5, d6);
		}
	}

/*
 * Scalar tail: the last (length & 7) elements.
 */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];
		dst[even_length + i] = c < MLIB_S8_MIN ? MLIB_S8_MIN
			: (c > MLIB_S8_MAX ? MLIB_S8_MAX : c);
	}

	return (MLIB_SUCCESS);
}
/* Example #21 */
/* 0 */
void ADD_SUFF(UshortGrayToByteGrayConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u8  *dst_end;
    mlib_d64 s0, s1, ss;
    mlib_s32 i, j;

    if (width <= 8) {
        for (j = 0; j < height; j++) {
            mlib_u8 *src = srcBase;
            mlib_u8 *dst = dstBase;

            for (i = 0; i < width; i++) {
                dst[i] = src[2*i];
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
        return;
    }

    if (srcScan == 2*width && dstScan == width) {
        width *= height;
        height = 1;
    }

    for (j = 0; j < height; j++) {
        mlib_u8 *src = srcBase;
        mlib_u8 *dst = dstBase;
        mlib_d64 *sp;

        dst_end = dst + width;

        while (((mlib_s32)dst & 3) && dst < dst_end) {
            *dst++ = *src;
            src += 2;
        }

        if ((mlib_s32)src & 7) {
            sp = vis_alignaddr(src, 0);
            s1 = *sp++;

#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                s0 = s1;
                s1 = *sp++;
                ss = vis_faligndata(s0, s1);
                ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss));
                ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss));
                *(mlib_f32*)dst = vis_read_hi(ss);
                src += 2*4;
            }
        } else {
#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                ss = *(mlib_d64*)src;
                ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss));
                ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss));
                *(mlib_f32*)dst = vis_read_hi(ss);
                src += 2*4;
            }
        }

        while (dst < dst_end) {
            *dst++ = *src;
            src += 2;
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}
/* Example #22 */
/* 0 */
/*
 * Saturating conversion of a signed 32-bit vector into an unsigned 8-bit
 * vector: negative elements become 0 and elements above MLIB_U8_MAX become
 * MLIB_U8_MAX.
 *
 *   z - destination vector (mlib_u8)
 *   x - source vector (mlib_s32)
 *   n - number of elements
 *
 * Vectors shorter than 8 elements are handled by the PACK_S_S macro, which
 * is defined outside this function (presumably a scalar loop that returns
 * from the function -- verify against its definition).  Longer vectors are
 * processed in three phases: a scalar head until dst is 8-byte aligned, a VIS
 * loop that turns 32 source bytes into 8 destination bytes per iteration,
 * and a scalar tail for the remaining (length & 7) elements.
 *
 * Returns MLIB_SUCCESS on the path visible here.
 */
mlib_status
__mlib_VectorConvert_U8_S32_Sat(
	mlib_u8 *z,
	const mlib_s32 *x,
	mlib_s32 n)
{
	mlib_s32 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 w0, w1, w2, w3, carry, packed;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_s32 c;

	if (n < 8) {
		PACK_S_S(mlib_s32, mlib_u8, MLIB_U8_MAX, 0);
	}

/*
 * Scalar head: convert single elements until the destination address
 * is 8-byte aligned.
 */
	for (; ((mlib_addr)dst & 7) != 0; length--) {
		c = (*src++);
		if (c < 0)
			c = 0;
		else if (c > MLIB_U8_MAX)
			c = MLIB_U8_MAX;
		(*dst++) = c;
	}

	len_64 = length >> 3;
	rest_64 = length & 7;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;

/* GSR scale factor 23 for vis_fpack32 */
	vis_write_gsr(23 << 3);

	if (((mlib_addr)src & 7) != 0) {

/*
 * Source address is not 8-byte aligned: every group of four source
 * words is assembled with vis_faligndata() from five neighbouring
 * aligned words.  The last look-ahead load is non-faulting.
 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		carry = (*dsrc++);

#pragma pipeloop(0)
#pragma unroll(4)
		for (i = 0; i < len_64; i++) {
			w0 = dsrc[0];
			w1 = dsrc[1];
			w2 = dsrc[2];
			w3 = vis_ld_d64_nf(dsrc + 3);
			dsrc += 4;

			packed = vis_faligndata(carry, w0);
			w0 = vis_faligndata(w0, w1);
			w1 = vis_faligndata(w1, w2);
			w2 = vis_faligndata(w2, w3);
			carry = w3;

/* accumulate one packed byte per 32-bit lane from each word */
			packed = vis_fpack32(packed, packed);
			packed = vis_fpack32(packed, w0);
			packed = vis_fpack32(packed, w1);
			packed = vis_fpack32(packed, w2);
/* interleave the two lanes back into element order */
			(*ddst++) = vis_fpmerge(vis_read_hi(packed),
				vis_read_lo(packed));
		}
	} else {

/*
 * Source address is also 8-byte aligned: load four words directly.
 */
		dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
#pragma unroll(4)
		for (i = 0; i < len_64; i++) {
			w0 = dsrc[0];
			w1 = dsrc[1];
			w2 = dsrc[2];
			w3 = dsrc[3];
			dsrc += 4;

/* accumulate one packed byte per 32-bit lane from each word */
			packed = vis_fpack32(w0, w0);
			packed = vis_fpack32(packed, w1);
			packed = vis_fpack32(packed, w2);
			packed = vis_fpack32(packed, w3);
/* interleave the two lanes back into element order */
			(*ddst++) = vis_fpmerge(vis_read_hi(packed),
				vis_read_lo(packed));
		}
	}

/*
 * Scalar tail: the last (length & 7) elements.
 */
	src += even_length;
	dst += even_length;
	for (i = 0; i < rest_64; i++) {
		c = src[i];
		if (c < MLIB_U8_MIN)
			c = MLIB_U8_MIN;
		else if (c > MLIB_U8_MAX)
			c = MLIB_U8_MAX;
		dst[i] = c;
	}

	return (MLIB_SUCCESS);
}
/*
 * JFIF YCbCr -> RGB conversion with one cb and one cr sample per y sample.
 * Reads `size` bytes from each of y, cb and cr and writes 3 * size bytes to
 * rgb.
 *
 * The input is processed in chunks of at most 2 * BUFF_SIZE pixels
 * (BUFF_SIZE is defined outside this function).  Within a chunk the main
 * loop converts four pixels (12 output bytes) per iteration and always
 * leaves 1..4 pixels for the tail, which is written with edge-masked
 * partial stores so that no byte past the chunk is modified.
 *
 * Returns MLIB_FAILURE when size <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): y, cb, cr and rgb are accessed through mlib_f32 pointers
 * without realignment, so the caller presumably has to pass 4-byte aligned
 * buffers -- confirm against the public API contract.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
    /* last byte of the current output chunk */
    mlib_u8 *dend;
    mlib_f32 *sf0, *sf1, *sf2, *pfd;
    mlib_f32 fzero = vis_fzeros();
    mlib_s32 i, n, m, emask;
    /* staging area for the tail: 12 result bytes in a 16-byte buffer */
    mlib_d64 tmp_arr64[2];
    /*
     * Per-lane coefficients scaled by 8192; each constant holds two 16-bit
     * values that are duplicated into lanes (0, 1, 2, 3) as (a, b, a, b):
     *   0xf4fd = -0.34414, 0xe926 = -0.71414,
     *   0x2cdd =  1.40200, 0x38b4 =  1.77200, 0x2000 = 1.0.
     * k?1 multiply cb, k?2 multiply cr, k_0 multiplies y.
     */
    mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
    mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
    mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
    mlib_d64 k12 = vis_to_double_dup(0xe9260000);
    mlib_d64 k21 = vis_to_double_dup(0x38b40000);
    mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
    /*
     * Per-lane offsets scaled by 32:
     *   0xe9a1 = -(179.456 - 0.5), 0x10ff = 135.45984 + 0.5,
     *   0xe3b6 = -(226.816 - 0.5).
     * Together with the coefficients this makes the lanes of
     *   s00 = (R, G, R, G), s10 = (G, B, G, B), s20 = (B, R, B, R)
     * for pixels (0, 1, 2, 3) of a group.
     */
    mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
    mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
    mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
    mlib_d64 k_0 = vis_to_double_dup(0x20002000);

    if (size <= 0)
        return (MLIB_FAILURE);

    /* GSR: scale factor 2 for the pack instructions, alignment offset 2 */
    vis_write_gsr((2 << 3) + 2);
    /* byte selector used by every vis_bshuffle() below */
    vis_write_bmask(0x0489AB37, 0);

    do {
        /* loop on buffer size */

        if (size > 2 * BUFF_SIZE) {
            n = 2 * BUFF_SIZE;
        } else {
            n = size;
        }

        /* full 4-pixel groups; (n - 1) keeps 1..4 pixels for the tail */
        m = (n - 1) >> 2;
        sf0 = (mlib_f32 *)y;
        sf1 = (mlib_f32 *)cb;
        sf2 = (mlib_f32 *)cr;
        dend = rgb + 3 * n - 1;
        pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < m; i++) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_0145;
            mlib_f32 x0, x1, x2;

            /* four y, cb and cr samples */
            x0 = (*sf0++);
            x1 = (*sf1++);
            x2 = (*sf2++);

            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            /* y + cb term */
            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            /* cr term + offset */
            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            /* pack to bytes: d_0235 = s00 | s10, s20 = s20 | zeros */
            d_0235 = vis_fpack16_pair(s00, s10);
            s20 = vis_freg_pair(vis_fpack16(s20), fzero);

            /*
             * Reorder the packed components into output order:
             * d_0145 supplies the first and the third output word,
             * the repacked d_0235 supplies the middle one.
             */
            d_0145 = vis_bshuffle(d_0235, s20);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            pfd[0] = vis_read_hi(d_0145);
            pfd[1] = vis_read_hi(d_0235);
            pfd[2] = vis_read_lo(d_0145);

            pfd += 3;
        }

        /*
         * last pixels
         *
         * Same arithmetic as above for one more group of four samples,
         * but the 12 result bytes go to tmp_arr64 and are copied out
         * with edge-masked 8-byte partial stores.
         */

        if ((mlib_u8 *)pfd <= dend) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_xx14, d_0145;
            mlib_f32 x0, x1, x2;
            mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

            x0 = *sf0;
            x1 = *sf1;
            x2 = *sf2;

            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            d_0235 = vis_fpack16_pair(s00, s10);
            d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero);

            d_0145 = vis_bshuffle(d_0235, d_xx14);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            /* mask of bytes from pfd up to dend within its 8-byte word */
            emask = vis_edge8(pfd, dend);

            /*
             * When pfd is only 4-byte aligned, step both the destination
             * and the staging pointer so that the data keeps its offset
             * inside the 8-byte word addressed by the partial store; the
             * edge mask computed above protects the 4 bytes before it.
             */
            if ((mlib_addr)pfd & 7) {
                pfd--;
                tmp_arr32++;
            }

            tmp_arr32[0] = vis_read_hi(d_0145);
            tmp_arr32[1] = vis_read_hi(d_0235);
            tmp_arr32[2] = vis_read_lo(d_0145);

            vis_pst_8(tmp_arr64[0], pfd, emask);

            /* second 8-byte word, only if it still contains chunk bytes */
            pfd += 2;
            emask = vis_edge8(pfd, dend);

            if ((mlib_u8 *)pfd <= dend)
                vis_pst_8(tmp_arr64[1], pfd, emask);
        }

        /* advance to the next chunk */
        y += n;
        cb += n;
        cr += n;
        rgb += 3 * n;
        size -= n;

    } while (size);

    return (MLIB_SUCCESS);
}
/*
 * Convert one run of Y/U/V samples plus a per-pixel alpha array into
 * packed 4-byte pixels.
 *
 * Per main-loop iteration the code consumes two mlib_d64 of Y (16 bytes),
 * one mlib_f32 of U and one of V (4 bytes each) and 16 alpha bytes, and
 * stores eight mlib_d64 (16 four-byte pixels).  Each chroma lane is
 * combined with four consecutive luma values (ddy1..ddy4).
 *
 *   abgr    - destination; 8-byte aligned destinations take the mlib_d64
 *             store loop, others the mlib_f32 store loop
 *   y, u, v - source samples (y as mlib_d64, u/v as mlib_f32)
 *   a_array - alpha bytes; may be unaligned (read through
 *             vis_alignaddr / vis_faligndata)
 *   count   - number of full 16-pixel iterations
 *   left    - number of extra 4-byte pixels (copied from a scratch
 *             buffer of 16 pixels) written after the full iterations
 *   isrgb   - when non-zero, the U and V coefficient/offset sets are
 *             exchanged, so the 2nd and 4th output bytes trade roles
 *
 * Output byte order inside every pixel, as built by the vis_fpmerge
 * chain: alpha, "db" channel, "dg" channel, "dr" channel.
 *
 * NOTE(review): with isrgb set the V-style coefficients are applied to
 * the data read through `u` (and vice versa); presumably the caller
 * passes the chroma pointers swapped in that case -- confirm.
 * NOTE(review): vis_fpack16_pair depends on the GSR scale factor, which
 * is not set here; it is assumed to be configured by the caller.
 */
static void
mlib_v_VideoYUV2ABGR_aarray_411(
	mlib_u32 *abgr,
	const mlib_d64 *y,
	const mlib_f32 *u,
	const mlib_f32 *v,
	const mlib_d64 *a_array,
	mlib_s32 count,
	mlib_s32 left,
	mlib_s32 isrgb)
{
/* all. pointer to dst */
	mlib_d64 *dpp = (mlib_d64 *)abgr;

/* u, v data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy1, dy2;
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
	mlib_d64 *dpa, da0, da1, da2, da3, da4;
	mlib_d64 dtmp;

/* 1.1644  * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);

/* 2.0184  * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);

/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);

/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);

/* 1.5966  * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);

/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

/* 135.6352  * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
/* multiplier used to spread one chroma lane over four luma lanes */
	mlib_f32 fscale = vis_to_float(0x80808080);

/* loop variables */
	mlib_s32 i;

/* exchange the U-side and V-side constants (f1<->f8, f4<->f5, doff0<->doff2) */
	if (isrgb) {
		f0 = vis_to_float(0x12a1);
		f1 = vis_to_float(0x3317);
		f4 = vis_to_float(0xe5fa);
		f5 = vis_to_float(0xf375);
		f8 = vis_to_float(0x4097);
		doff0 = vis_to_double_dup(0xe420e420);
		doff1 = vis_to_double_dup(0x10f410f4);
		doff2 = vis_to_double_dup(0xdd60dd60);
	}

/* set the GSR alignment for the (possibly unaligned) alpha array */
	dpa = vis_alignaddr((void *)a_array, 0);

/*
 * Prime the pipeline: load the inputs of iteration 0.  After these
 * post-increments y, u, v and dpa point at the data of iteration 1,
 * which is why the loops below index them with plain `i`.
 */
	dy1 = (*y++);
	dy2 = vis_ld_d64_nf((mlib_d64 *)y); y++;
	fu = (*u++);
	fv = (*v++);
	da2 = (*dpa++);
	da3 = vis_ld_d64_nf(dpa); dpa++;
	da4 = vis_ld_d64_nf(dpa); dpa++;

	du0 = vis_fmul8x16al(fu, f1);
	du1 = vis_fmul8x16al(fu, f4);
	dv1 = vis_fmul8x16al(fv, f5);
	dv2 = vis_fmul8x16al(fv, f8);

/* destination is 8-byte aligned: store whole mlib_d64 words */
	if (!((mlib_addr)abgr & 7)) {
#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
/* realign the 16 alpha bytes of this iteration */
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

/* scaled luma for the four groups of four pixels */
			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

/* chroma contribution plus constant offset, one lane per chroma sample */
			db = vis_fpadd16(du0, doff0);

			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dr = vis_fpadd16(dv2, doff2);

/* spread each chroma lane over its luma group and add the luma term */
			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

/* pack the 16-bit sums down to bytes: 8 pixels per register */
			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

/* interleave planes: (alpha, dg) and (db, dr) pairs, merged again below */
			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

/* fetch the inputs of the next iteration (non-faulting loads past the end) */
			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

/* pixels 0..3 */
			dpp[8 * i] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 1] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

/* pixels 4..7 */
			dpp[8 * i + 2] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 3] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

/* pixels 8..11 */
			dpp[8 * i + 4] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 5] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

/* pixels 12..15 */
			dpp[8 * i + 6] =
				vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 7] =
				vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* chroma products for the next iteration */
			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	} else {
/*
 * Destination is only 4-byte aligned: same arithmetic as above, but
 * every result is written as two mlib_f32 halves.
 */
		mlib_d64 dd;

#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

			db = vis_fpadd16(du0, doff0);

			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dr = vis_fpadd16(dv2, doff2);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

/* fetch the inputs of the next iteration */
			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

/* pixels 0..3, one mlib_f32 (one pixel) per store */
			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 1] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 2] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 3] = vis_read_lo(dd);

			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

/* pixels 4..7 */
			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 4] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 5] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 6] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 7] = vis_read_lo(dd);

			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

/* pixels 8..11 */
			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 8] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 9] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 10] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 11] = vis_read_lo(dd);

			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

/* pixels 12..15 */
			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 12] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 13] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 14] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 15] = vis_read_lo(dd);

/* chroma products for the next iteration */
			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	}

/*
 * Tail: the inputs for one more iteration are already loaded.  Compute
 * all 16 pixels into a local buffer and copy only the first `left`
 * pixels to the destination, so nothing is written past the end.
 */
	if (left) {
		mlib_d64 res_buf[8];

		da0 = vis_faligndata(da2, da3);
		da1 = vis_faligndata(da3, da4);

		ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
		ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

		ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
		ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

		db = vis_fpadd16(du0, doff0);

		dtmp = vis_fpadd16(du1, dv1);
		dg = vis_fpadd16(dtmp, doff1);

		dr = vis_fpadd16(dv2, doff2);

		db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
		db1 = vis_fpadd16(ddy1, db1);

		db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
		db2 = vis_fpadd16(ddy2, db2);

		db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
		db3 = vis_fpadd16(ddy3, db3);

		db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
		db4 = vis_fpadd16(ddy4, db4);

		dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
		dg1 = vis_fpadd16(ddy1, dg1);

		dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
		dg2 = vis_fpadd16(ddy2, dg2);

		dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
		dg3 = vis_fpadd16(ddy3, dg3);

		dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
		dg4 = vis_fpadd16(ddy4, dg4);

		dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
		dr1 = vis_fpadd16(ddy1, dr1);

		dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
		dr2 = vis_fpadd16(ddy2, dr2);

		dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
		dr3 = vis_fpadd16(ddy3, dr3);

		dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
		dr4 = vis_fpadd16(ddy4, dr4);

		dr = vis_fpack16_pair(dr1, dr2);
		dr1 = vis_fpack16_pair(dr3, dr4);

		dg = vis_fpack16_pair(dg1, dg2);
		dg1 = vis_fpack16_pair(dg3, dg4);

		db = vis_fpack16_pair(db1, db2);
		db1 = vis_fpack16_pair(db3, db4);

		dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
		dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

		res_buf[0] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[1] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
		dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

		res_buf[2] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[3] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
		dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

		res_buf[4] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[5] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
		dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

		res_buf[6] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[7] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

/* copy pixel by pixel (mlib_f32 == one 4-byte pixel) */
		for (i = 0; i < left; i++)
			((mlib_f32 *)dpp)[16 * count + i] =
				((mlib_f32 *)res_buf)[i];
	}
}
/* Example #25 */
/* 0 */
/*
 * Widen a vector of unsigned 8-bit values into 16-bit values.
 *
 *   z - destination vector (mlib_s16), n elements
 *   x - source vector (mlib_u8), n elements
 *   n - number of elements
 *
 * Returns MLIB_SUCCESS on the vectorized path.  Vectors shorter than 16
 * elements are handed to the EXPAND macro, which is defined elsewhere;
 * presumably it performs the scalar conversion and returns -- confirm.
 *
 * Strategy: convert scalars until the destination is 8-byte aligned,
 * convert groups of 8 source bytes (one mlib_d64 in, two mlib_d64 out)
 * with VIS, then finish the remaining (< 8) elements with scalars.
 */
mlib_status
__mlib_VectorConvert_S16_U8_Mod(
	mlib_s16 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_u8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	mlib_s32 len_64, even_length, rest_64, length = n;
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 dd1, dd2, dd3, dd4;
/* 16-bit multiplier 0x100, used with vis_fmul8x16al as an alternative to the zero-merge */
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		EXPAND(mlib_u8, mlib_s16);
	}

/* scalar prologue until dst is 8-byte aligned */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

/* ddsrc is src rounded down to 8 bytes; GSR alignment is set for vis_faligndata */
	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
/* len_64 groups of 8 elements, rest_64 trailing elements */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	dd2 = ddsrc[0];

	if (!((mlib_addr)src & 7)) {

/*
 * Both vectors are 64-bit aligned. We can process without
 * vis_faligndata
 * Peeling the 1 iteration. Then loop with step==2.
 */

/* intentional assignment: i becomes 1 when one group was peeled, else 0 */
		if (i = (len_64 & 1)) {
			dd1 = (*ddsrc++);
			(*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd1));
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
		}
#pragma pipeloop(1)
#pragma unroll(1)
		for (; i < len_64; i += 2) {
			dd1 = (*ddsrc++);
			dd2 = (*ddsrc++);
/* the two widening forms are mixed, presumably to balance VIS execution units */
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd1), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd2), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd2));
		}
	} else {

/*
 * Source vector is not 64-bit aligned. Use vis_faligndata.
 * Peeling the 1 iteration. Then loop with step==2.
 */

/* i indexes the next aligned source word to load (word 0 is already in dd2) */
		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1); i++;
			dd3 = vis_faligndata(dd1, dd2);
			(*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd3));
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
		}
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i <= len_64; i += 2) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i);
			dd3 = vis_faligndata(dd1, dd2);
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_faligndata(dd1, dd2);
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd3), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd4), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd4));
		}
	}

/* scalar epilogue for the last rest_64 elements (src/dst were advanced by the prologue) */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = src[even_length + i];

	return (MLIB_SUCCESS);
}
/* Example #26 */
/* 0 */
/*
 * Multiply an m x l matrix x by an l x n matrix y (both of element type
 * STYPE, row-major) and store the m x n result in z.
 *
 *   z        - result; written as bytes when dst_type == 0, as mlib_s16
 *              otherwise
 *   x, y     - input matrices
 *   m, l, n  - dimensions; all must be > 0 or MLIB_FAILURE is returned
 *   dst_type - 0: clamp bounds are MLIB_S8_MIN/MAX;
 *              non-zero: clamp bounds are MLIB_S16_MIN/MAX
 *
 * y is first transposed into a scratch buffer of 16-bit values so that
 * each output element becomes a dot product over contiguous data; each
 * row of x is widened into the same scratch area.  The scratch buffer
 * lives on the stack when it fits into MAX_SIZE mlib_d64s, otherwise it
 * is heap-allocated; if that allocation fails the work is delegated to
 * mlib_MatrixMul_type().
 *
 * The LOAD / MUL / SUM / SATUR macros are defined elsewhere and operate
 * on the local variables of this function (px, py0, py1, x0, x1, y0, y1,
 * dr0..dr3, ds0, ds1, vmin, vmax presumably) -- do not rename them.
 *
 * NOTE(review): the fallback distinguishes dst_type == 2 (mode_Mod), but
 * the main path treats every non-zero dst_type as a saturating S16
 * result; presumably callers only pass 0 or 1 here -- confirm.
 */
static mlib_status
mlib_MatrixMul_S8xS8(
	void *z,
	const STYPE * x,
	const STYPE * y,
	mlib_s32 m,
	mlib_s32 l,
	mlib_s32 n,
	mlib_s32 dst_type)
{
	mlib_d64 *px, *buff_x, *buff_y, *pbuff_x, *pbuff_y;
	mlib_d64 array[MAX_SIZE];
	mlib_d64 xx, x0, x1, y0, y1, ds0, ds1, dr0, dr1, dr2, dr3;
	mlib_s32 size, i, j, k, l4;
	mlib_s32 vmin, vmax;

	if (!((m > 0) && (l > 0) && (n > 0))) {
		return (MLIB_FAILURE);
	}

/* clamp range for the result type */
	if (!dst_type) {
		vmin = MLIB_S8_MIN;
		vmax = MLIB_S8_MAX;
	} else {	/* if (dst_type == 1) */

		vmin = MLIB_S16_MIN;
		vmax = MLIB_S16_MAX;
	}

/* l4: mlib_d64 words per transposed column (four 16-bit values each) */
	l4 = (l + 3) / 4;
/* n transposed columns, plus room for one widened x row and slack */
	size = l4 * n + l4 + 3;

	if (size <= MAX_SIZE) {
		buff_y = array;
	} else {
		buff_y = (mlib_d64 *)__mlib_malloc(size * sizeof (mlib_d64));

/* out of memory: fall back to the generic implementation */
		if (buff_y == NULL) {
			mlib_s32 type_z, mode;

			if (!dst_type) {
				type_z = type_S8;
				mode = mode_Sat;
			} else if (dst_type == 1) {
				type_z = type_S16;
				mode = mode_Sat;
			} else {	/* if (dst_type == 2) */

				type_z = type_S16;
				mode = mode_Mod;
			}

			return mlib_MatrixMul_type(type_S8, type_z, mode, x, y,
				m, l, n, n, z);
		}
	}

/* the widened x row is stored right after the transposed y */
	buff_x = buff_y + l4 * n;
	pbuff_y = buff_y;

/* transpose y matrix */
	for (i = 0; i < n; i++) {
		STYPE *py = (STYPE *) y + i;
		mlib_s16 *pp = (mlib_s16 *)pbuff_y;

/* four column elements at a time, stored as two 32-bit words of 16-bit pairs */
		for (j = 0; j <= (l - 4); j += 4) {
			((mlib_s32 *)pp)[0] = (py[0] << 16) | (py[n] & 0xFFFF);
			((mlib_s32 *)pp)[1] =
				(py[2 * n] << 16) | (py[3 * n] & 0xFFFF);
			py += 4 * n;
			pp += 4;
		}

/* remaining column elements */
		for (; j < l; j++) {
			(*pp++) = *py;
			py += n;
		}

/* zero padding up to a whole number of mlib_d64 words */
		for (; j < 4 * l4; j++) {
			(*pp++) = 0;
		}

		pbuff_y += l4;
	}

/* one output row per iteration */
	for (j = 0; j < m; j++) {
		pbuff_x = buff_x;
		pbuff_y = buff_y;

/* copy x line */
		px = vis_alignaddr((void *)x, j * l);
		x1 = vis_ld_d64_nf(px);
		px++;
/* each source byte is duplicated into both halves of a 16-bit lane */
		for (i = 0; i < (l + 7) / 8; i++) {
			x0 = x1;
			x1 = vis_ld_d64_nf(px);
			px++;
			xx = vis_faligndata(x0, x1);
			pbuff_x[2 * i] =
				vis_fpmerge(vis_read_hi(xx), vis_read_hi(xx));
			pbuff_x[2 * i + 1] =
				vis_fpmerge(vis_read_lo(xx), vis_read_lo(xx));
		}

/* loop on y lines */
/* two output columns (py0, py1) are accumulated per iteration */
		for (i = 0; i < n; i += 2) {
/* shadows the outer px; the macros below read this one */
			mlib_d64 *px = pbuff_x;
			mlib_d64 *py0 = pbuff_y;
/* for an odd last column py1 aliases py0 and its result is discarded */
			mlib_d64 *py1 = (i + 1 < n) ? (py0 + l4) : py0;
			mlib_s32 s0, s1;

			ds0 = ds1 = vis_fzero();

/* software-pipeline prologue */
			LOAD;
			MUL;
			LOAD;

#pragma pipeloop(0)
			for (k = 0; k < l4; k++) {
				SUM;
				MUL;
				LOAD;
			}

/* fold the two 32-bit partial sums of the accumulator */
			s0 = ((mlib_s32 *)&ds0)[0] + ((mlib_s32 *)&ds0)[1];
			SATUR(s0);

			if (dst_type) {
				((mlib_s16 *)z)[i] = s0;
			} else {
				((mlib_u8 *)z)[i] = s0;
			}

			if (i + 1 < n) {
				s1 = ((mlib_s32 *)&ds1)[0] +
					((mlib_s32 *)&ds1)[1];
				SATUR(s1);

				if (dst_type) {
					((mlib_s16 *)z)[i + 1] = s1;
				} else {
					((mlib_u8 *)z)[i + 1] = s1;
				}
			}

			pbuff_y += 2 * l4;
		}

/* advance z by one output row (n elements of 1 or 2 bytes) */
		z = (mlib_u8 *)z + ((dst_type) ? (2 * n) : n);
	}

/* release the scratch buffer only when it came from the heap */
	if (size > MAX_SIZE) {
		__mlib_free(buff_y);
	}

	return (MLIB_SUCCESS);
}
/* Example #27 */
/* 0 */
/*
 * Widen a vector of signed 8-bit values into signed 16-bit values.
 *
 *   z - destination vector (mlib_s16), n elements
 *   x - source vector (mlib_s8), n elements
 *   n - number of elements
 *
 * Returns MLIB_SUCCESS on the vectorized path.  Vectors shorter than 16
 * elements are handed to the EXPAND macro, which is defined elsewhere;
 * presumably it performs the scalar conversion and returns -- confirm.
 *
 * Strategy: scalar conversion until the destination is 8-byte aligned,
 * VIS conversion of groups of 8 source bytes, scalar conversion of the
 * remaining (< 8) elements.  In the VIS paths every source byte is placed
 * in the upper half of a 16-bit lane and multiplied through
 * vis_fmul8sux16 by 0x0100, which (per the VIS definition of that
 * instruction) yields the sign-extended byte.
 */
mlib_status
__mlib_VectorConvert_S16_S8_Mod(
	mlib_s16 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_s8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
/* four 16-bit lanes, each holding 0x0100 */
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 fzero = vis_fzeros();
	mlib_s32 len_64, even_length, rest_64, length = n, off;
	mlib_d64 dd0, dd1, dd2, dd4, dd5, dd6, dd7;

	if (length < 16) {
		EXPAND(mlib_s8, mlib_s16);
	}

/* scalar prologue until dst is 8-byte aligned */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

/* ddsrc is src rounded down to 8 bytes */
	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
/* len_64 groups of 8 elements, rest_64 trailing elements */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	dd2 = ddsrc[0];
/* byte offset of src inside its aligned 8-byte word */
	off = (mlib_addr)src & 7;

	if (!off) {

/*
 * Both vectors are 64-bit aligned.
 */

/*
 * Peeling of 1 iteration.
 */

/* intentional assignment: i becomes 1 when one group was peeled, else 0 */
		if (i = (len_64 & 1)) {
			dd1 = (*ddsrc++);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
				fzero), four_16_ones);
		}
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			dd1 = (*ddsrc++);
			dd2 = (*ddsrc++);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd2),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd2),
				fzero), four_16_ones);
		}
	} else {

/*
 * Source vector is not 64-bit aligned.
 * Peeling of 1 iteration. Then loop with step==2.
 */

/*
 * GSR setup: faligndata offset 1 and a byte-shuffle mask biased by
 * `off`.  With this mask vis_bshuffle presumably yields the 8 wanted
 * source bytes in the order 0,4,1,5,2,6,3,7, so bytes 0..3 sit in the
 * upper halves of the 16-bit lanes; rotating that word by one byte
 * (vis_faligndata of the word with itself) brings bytes 4..7 into the
 * upper halves -- confirm against the VIS manual.
 */
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
/* i indexes the next aligned source word to load (word 0 is already in dd2) */
		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1); i++;
			dd4 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
		}
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i <= len_64; i += 2) {
			dd0 = dd2;
			dd1 = vis_ld_d64_nf(ddsrc + i);
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_bshuffle(dd0, dd1);
			dd6 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			dd7 = vis_faligndata(dd6, dd6);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd6, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd7, four_16_ones);
		}
	}

/* scalar epilogue for the last rest_64 elements (src/dst were advanced by the prologue) */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = src[even_length + i];

	return (MLIB_SUCCESS);
}
/* Example #28 */
/* 0 */
/*
 * Multiply an m x l matrix x by an l x n matrix y of signed 8-bit
 * elements and store the m x n result in z as 16-bit values with
 * modular (wrap-around) arithmetic.
 *
 *   z       - result matrix, m rows of n mlib_s16
 *   x, y    - input matrices, row-major
 *   m, l, n - dimensions; all must be > 0 or MLIB_FAILURE is returned
 *
 * y is transposed into a scratch buffer with every byte XOR-ed with
 * 0x80, i.e. stored as y + 128 in unsigned form.  The dot products are
 * therefore too large by 128 * sum(x row); that bias (x_sum) is
 * subtracted from every result of the row.  Padding bytes are stored as
 * 0, so they contribute nothing to the sums.
 *
 * The scratch buffer lives on the stack when it fits into MAX_SIZE
 * mlib_d64s, otherwise on the heap.  If the allocation fails the work is
 * delegated to the generic mlib_MatrixMul_type() with the type/mode
 * triple matching this entry point (S8 in, S16 out, modular).
 *
 * The LOAD / MUL / SUM macros are defined elsewhere and operate on the
 * local variables of this function (px, py0, py1, x0, x1, y0, y1,
 * dr0..dr3, ds0, ds1 presumably) -- do not rename them.
 */
mlib_status
__mlib_MatrixMul_S16_S8_Mod(
	mlib_s16 *z,
	const STYPE * x,
	const STYPE * y,
	mlib_s32 m,
	mlib_s32 l,
	mlib_s32 n)
{
	mlib_d64 *px, *buff_x, *buff_y, *pbuff_x, *pbuff_y;
	mlib_d64 array[MAX_SIZE];
	mlib_d64 xx, x0, x1, y0, y1, ds0, ds1, dr0, dr1, dr2, dr3;
	mlib_s32 size, i, j, k, l8;

	if (!((m > 0) && (l > 0) && (n > 0))) {
		return (MLIB_FAILURE);
	}

/* l8: mlib_d64 words per transposed column (eight bytes each) */
	l8 = (l + 7) / 8;
/* n transposed columns, plus room for one widened x row and slack */
	size = l8 * n + 2 * l8 + 4;

	if (size <= MAX_SIZE) {
		buff_y = array;
	} else {
		buff_y = (mlib_d64 *)__mlib_malloc(size * sizeof (mlib_d64));

/*
 * Out of memory: fall back to the generic implementation.  The
 * operands are signed 8-bit and the result is 16-bit modular, the same
 * triple mlib_MatrixMul_S8xS8 uses for its modular S16 fallback.
 */
		if (buff_y == NULL) {
			return mlib_MatrixMul_type(type_S8, type_S16, mode_Mod,
				x, y, m, l, n, n, z);
		}
	}

/* the widened x row is stored right after the transposed y */
	buff_x = buff_y + l8 * n;
	pbuff_y = buff_y;

/* transpose y matrix */
	for (i = 0; i < n; i++) {
		mlib_u8 *py = (mlib_u8 *)y + i;
		mlib_u8 *pp = (mlib_u8 *)pbuff_y;

/* four column elements at a time; XOR 0x80 biases each byte by +128 */
		for (j = 0; j <= (l - 4); j += 4) {
			((mlib_s16 *)pp)[0] = ((py[0] << 8) | py[n]) ^ 0x8080;
			((mlib_s16 *)pp)[1] =
				((py[2 * n] << 8) | py[3 * n]) ^ 0x8080;
			py += 4 * n;
			pp += 4;
		}

/* remaining column elements */
		for (; j < l; j++) {
			(*pp++) = *py ^ 0x80;
			py += n;
		}

/* zero padding up to a whole number of mlib_d64 words */
		for (; j < 8 * l8; j++) {
			(*pp++) = 0;
		}

		pbuff_y += l8;
	}

/* one output row per iteration */
	for (j = 0; j < m; j++) {
		mlib_s32 x_sum = 0;

/* bias correction for this row: 128 * sum of its elements */
		for (i = 0; i < l; i++) {
			x_sum += x[i];
		}

/* multiply instead of "<<= 7": shifting a negative value is undefined */
		x_sum *= 128;

		pbuff_x = buff_x;
		pbuff_y = buff_y;

/* copy x line */
		px = vis_alignaddr((void *)x, 0);
		x1 = vis_ld_d64_nf(px);
		px++;
		xx = 0;
/* each source byte goes into the upper half of a 16-bit lane */
		for (i = 0; i < l8; i++) {
			x0 = x1;
			x1 = vis_ld_d64_nf(px);
			px++;
			xx = vis_faligndata(x0, x1);
			pbuff_x[2 * i] =
				vis_fpmerge(vis_read_hi(xx), vis_fzeros());
			pbuff_x[2 * i + 1] =
				vis_fpmerge(vis_read_lo(xx), vis_fzeros());
		}

/* loop on y lines */
/* two output columns (py0, py1) are accumulated per iteration */
		for (i = 0; i < n; i += 2) {
/* shadows the outer px; the macros below read this one */
			mlib_d64 *px = pbuff_x;
			mlib_d64 *py0 = pbuff_y;
/* for an odd last column py1 aliases py0 and its result is discarded */
			mlib_d64 *py1 = (i + 1 < n) ? (py0 + l8) : py0;

			ds0 = ds1 = vis_fzero();

/* software-pipeline prologue */
			LOAD;
			MUL;
			LOAD;

#pragma pipeloop(0)
			for (k = 0; k < l8; k++) {
				SUM;
				MUL;
				LOAD;
			}

/* fold both accumulators: lanes 0..1 belong to column i, lanes 2..3 to i + 1 */
			ds0 = vis_freg_pair(vis_fpadd16s(vis_read_hi(ds0),
				vis_read_lo(ds0)),
				vis_fpadd16s(vis_read_hi(ds1),
				vis_read_lo(ds1)));

			z[i] = ((mlib_s16 *)&ds0)[0] + ((mlib_s16 *)&ds0)[1] -
				x_sum;

			if (i + 1 < n) {
				z[i + 1] =
					((mlib_s16 *)&ds0)[2] +
					((mlib_s16 *)&ds0)[3] - x_sum;
			}

			pbuff_y += 2 * l8;
		}

/* next output row / next x row */
		z += n;
		x += l;
	}

/* release the scratch buffer only when it came from the heap */
	if (size > MAX_SIZE) {
		__mlib_free(buff_y);
	}

	return (MLIB_SUCCESS);
}
/*
 * Interleave four single-channel 16-bit planes into one 4-channel
 * destination image: dst pixel k = { src0[k], src1[k], src2[k], src3[k] }.
 *
 *   dst_s16_0          - destination, width * 4 samples per row
 *   src_s16_0..3       - the four source planes
 *   height, width      - image size in rows / pixels
 *   dst_stride,
 *   src0..3_stride     - row pitches, added to the mlib_s16 pointers
 *                        (i.e. counted in 16-bit samples)
 *
 * Always returns MLIB_SUCCESS.  A non-positive width is a no-op.
 *
 * Per row: up to three leading samples are stored with scalars until the
 * destination is 8-byte aligned; the source pointers are then rotated so
 * that fi/se/th/fo name the channels in the order they appear at the
 * aligned destination.  Blocks of four pixels are merged with VIS, the
 * remainder with scalars.
 */
mlib_status
mlib_ImageChannelMerge4_S16(
    mlib_s16 *dst_s16_0,
    const mlib_s16 *src_s16_0,
    const mlib_s16 *src_s16_1,
    const mlib_s16 *src_s16_2,
    const mlib_s16 *src_s16_3,
    mlib_s32 height,
    mlib_s32 width,
    mlib_s32 dst_stride,
    mlib_s32 src0_stride,
    mlib_s32 src1_stride,
    mlib_s32 src2_stride,
    mlib_s32 src3_stride)
{
	mlib_s32 i, j, n = width << 2;
	mlib_s16 *fi_ptr, *se_ptr, *th_ptr, *fo_ptr;
	mlib_d64 *dp;

/*
 * An empty row must not reach the alignment prologue below: it stores
 * up to three samples unconditionally, which is only safe when the row
 * holds at least one pixel (four samples).
 */
	if (width <= 0)
		return (MLIB_SUCCESS);

	for (j = 0; j < height; j++) {

		i = 0;

/* scalar prologue: at most 3 samples until dst + i is 8-byte aligned */
		if ((mlib_addr)(dst_s16_0 + i) & 7) {
			dst_s16_0[i++] = src_s16_0[0];

			if ((mlib_addr)(dst_s16_0 + i) & 7) {
				dst_s16_0[i++] = src_s16_1[0];

				if ((mlib_addr)(dst_s16_0 + i) & 7) {
					dst_s16_0[i++] = src_s16_2[0];
				}
			}
		}

/*
 * Rotate the channel order by the number of samples already written;
 * channels whose first sample was consumed start at element 1.
 */
		if (i == 0) {
			fi_ptr = (mlib_s16 *)src_s16_0;
			se_ptr = (mlib_s16 *)src_s16_1;
			th_ptr = (mlib_s16 *)src_s16_2;
			fo_ptr = (mlib_s16 *)src_s16_3;
		} else if (i == 1) {
			fi_ptr = (mlib_s16 *)src_s16_1;
			se_ptr = (mlib_s16 *)src_s16_2;
			th_ptr = (mlib_s16 *)src_s16_3;
			fo_ptr = (mlib_s16 *)(src_s16_0 + 1);
		} else if (i == 2) {
			fi_ptr = (mlib_s16 *)src_s16_2;
			se_ptr = (mlib_s16 *)src_s16_3;
			th_ptr = (mlib_s16 *)(src_s16_0 + 1);
			fo_ptr = (mlib_s16 *)(src_s16_1 + 1);
		} else if (i == 3) {
			fi_ptr = (mlib_s16 *)src_s16_3;
			se_ptr = (mlib_s16 *)(src_s16_0 + 1);
			th_ptr = (mlib_s16 *)(src_s16_1 + 1);
			fo_ptr = (mlib_s16 *)(src_s16_2 + 1);
		}

		dp = (mlib_d64 *)(dst_s16_0 + i);

/* VIS path: 16 destination samples (4 pixels) per iteration */
		if ((n - i) > 16) {
			if (((mlib_addr)fi_ptr & 7) ||
			    ((mlib_addr)se_ptr & 7) ||
			    ((mlib_addr)th_ptr & 7) ||
			    ((mlib_addr)fo_ptr & 7)) {

/* at least one source is unaligned: realign each with faligndata */
				mlib_d64 sd0, sd1, sd2, sd3;
				mlib_d64 dd0, dd1, dd2, dd3, dr02, dr13;
				mlib_d64 s0h, s0l, s1h, s1l, s2h, s2l, s3h, s3l;
				mlib_d64 *sp0;
				mlib_d64 *sp1;
				mlib_d64 *sp2;
				mlib_d64 *sp3;

				sp0 = VIS_ALIGNADDR(fi_ptr, 0);
				s0h = (*sp0++);
				sp1 = VIS_ALIGNADDR(se_ptr, 0);
				s1h = (*sp1++);
				sp2 = VIS_ALIGNADDR(th_ptr, 0);
				s2h = (*sp2++);
				sp3 = VIS_ALIGNADDR(fo_ptr, 0);
				s3h = (*sp3++);

#pragma pipeloop(0)
				for (; i < (n - 15); i += 16) {
					s0l = vis_ld_d64_nf(sp0); sp0++;
					s1l = vis_ld_d64_nf(sp1); sp1++;
					s2l = vis_ld_d64_nf(sp2); sp2++;
					s3l = vis_ld_d64_nf(sp3); sp3++;

/* the sources may have different offsets, so GSR is reset per channel */
					VIS_ALIGNADDR(fi_ptr, 0);
					sd0 = vis_faligndata(s0h, s0l);

					VIS_ALIGNADDR(se_ptr, 0);
					sd1 = vis_faligndata(s1h, s1l);

					VIS_ALIGNADDR(th_ptr, 0);
					sd2 = vis_faligndata(s2h, s2l);

					VIS_ALIGNADDR(fo_ptr, 0);
					sd3 = vis_faligndata(s3h, s3l);

/* three byte-merge stages turn 4 planes x 4 samples into 4 pixels */
					dr02 =
					    vis_fpmerge(vis_read_hi(sd0),
					    vis_read_hi(sd2));
					dr13 =
					    vis_fpmerge(vis_read_hi(sd1),
					    vis_read_hi(sd3));
					dd0 =
					    vis_fpmerge(vis_read_hi(dr02),
					    vis_read_hi(dr13));
					dp[0] =
					    vis_fpmerge(vis_read_hi(dd0),
					    vis_read_lo(dd0));
					dd1 =
					    vis_fpmerge(vis_read_lo(dr02),
					    vis_read_lo(dr13));
					dp[1] =
					    vis_fpmerge(vis_read_hi(dd1),
					    vis_read_lo(dd1));
					dr02 =
					    vis_fpmerge(vis_read_lo(sd0),
					    vis_read_lo(sd2));
					dr13 =
					    vis_fpmerge(vis_read_lo(sd1),
					    vis_read_lo(sd3));
					dd2 =
					    vis_fpmerge(vis_read_hi(dr02),
					    vis_read_hi(dr13));
					dp[2] =
					    vis_fpmerge(vis_read_hi(dd2),
					    vis_read_lo(dd2));
					dd3 =
					    vis_fpmerge(vis_read_lo(dr02),
					    vis_read_lo(dr13));
					dp[3] =
					    vis_fpmerge(vis_read_hi(dd3),
					    vis_read_lo(dd3));
					dp += 4;

					s0h = s0l;
					s1h = s1l;
					s2h = s2l;
					s3h = s3l;

					fi_ptr += 4;
					se_ptr += 4;
					th_ptr += 4;
					fo_ptr += 4;
				}
			} else {

/* all four sources are 8-byte aligned: direct mlib_d64 loads */
				mlib_d64 sd0, sd1, sd2, sd3;
				mlib_d64 dd0, dd1, dd2, dd3, dr02, dr13;

#pragma pipeloop(0)
				for (; i < (n - 15); i += 16) {

					sd0 = ((mlib_d64 *)fi_ptr)[0];
					sd1 = ((mlib_d64 *)se_ptr)[0];
					sd2 = ((mlib_d64 *)th_ptr)[0];
					sd3 = ((mlib_d64 *)fo_ptr)[0];

					dr02 =
					    vis_fpmerge(vis_read_hi(sd0),
					    vis_read_hi(sd2));
					dr13 =
					    vis_fpmerge(vis_read_hi(sd1),
					    vis_read_hi(sd3));
					dd0 =
					    vis_fpmerge(vis_read_hi(dr02),
					    vis_read_hi(dr13));
					dp[0] =
					    vis_fpmerge(vis_read_hi(dd0),
					    vis_read_lo(dd0));
					dd1 =
					    vis_fpmerge(vis_read_lo(dr02),
					    vis_read_lo(dr13));
					dp[1] =
					    vis_fpmerge(vis_read_hi(dd1),
					    vis_read_lo(dd1));
					dr02 =
					    vis_fpmerge(vis_read_lo(sd0),
					    vis_read_lo(sd2));
					dr13 =
					    vis_fpmerge(vis_read_lo(sd1),
					    vis_read_lo(sd3));
					dd2 =
					    vis_fpmerge(vis_read_hi(dr02),
					    vis_read_hi(dr13));
					dp[2] =
					    vis_fpmerge(vis_read_hi(dd2),
					    vis_read_lo(dd2));
					dd3 =
					    vis_fpmerge(vis_read_lo(dr02),
					    vis_read_lo(dr13));
					dp[3] =
					    vis_fpmerge(vis_read_hi(dd3),
					    vis_read_lo(dd3));
					dp += 4;

					fi_ptr += 4;
					se_ptr += 4;
					th_ptr += 4;
					fo_ptr += 4;
				}
			}
		}
/* scalar remainder, four samples at a time */
#pragma pipeloop(0)
		for (; i < (n - 3); i += 4) {
			dst_s16_0[i + 0] = (*fi_ptr++);
			dst_s16_0[i + 1] = (*se_ptr++);
			dst_s16_0[i + 2] = (*th_ptr++);
			dst_s16_0[i + 3] = (*fo_ptr++);
		}

/* last 1..3 samples (the counterpart of the alignment prologue) */
		if (i < (n - 2)) {
			dst_s16_0[i + 0] = *fi_ptr;
			dst_s16_0[i + 1] = *se_ptr;
			dst_s16_0[i + 2] = *th_ptr;
		} else if (i < (n - 1)) {
			dst_s16_0[i + 0] = *fi_ptr;
			dst_s16_0[i + 1] = *se_ptr;
		} else if (i < n) {
			dst_s16_0[i + 0] = *fi_ptr;
		}

/* advance every plane to its next row */
		dst_s16_0 += dst_stride;
		src_s16_0 += src0_stride;
		src_s16_1 += src1_stride;
		src_s16_2 += src2_stride;
		src_s16_3 += src3_stride;
	}
	return (MLIB_SUCCESS);
}
/*
 * Convert planar ("sequential") YUV 4:4:4 into interleaved UYVY 4:2:2.
 *
 * Every output 32-bit word covers two horizontally adjacent pixels and
 * is laid out, most significant byte first, as
 *     U  Y0  V  Y1
 * where U and V are the averages ((a + b) >> 1) of the two pixels'
 * chroma samples.
 *
 *   uyvy    - destination
 *   y, u, v - source planes
 *   w, h    - image width / height in pixels; an odd trailing column is
 *             dropped (w is halved), non-positive sizes are a no-op
 *   dlb     - destination row pitch; shifted right by 2 before being
 *             added to the mlib_u32 pointer (so presumably given in
 *             bytes -- confirm)
 *   slb     - source row pitch, added to the byte pointers y, u and v
 *
 * Rows whose four pointers are all 8-byte aligned use the VIS path
 * (four output words per iteration); other rows use the scalar path.
 */
void
__mlib_VideoColorYUV444seq_to_UYVY422int(
	mlib_u32 *uyvy,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 w,
	mlib_s32 h,
	mlib_s32 dlb,
	mlib_s32 slb)
{
	mlib_s32 i, j2, val_y0, val_y1, val_u0, val_v0, count, left;

	dlb >>= 2;
/* from here on w counts output words (pixel pairs) */
	w >>= 1;

/*
 * Reject negative sizes as well as zero: a negative w would yield a
 * negative count and a non-zero `left`, making the VIS tail below index
 * in front of the buffers.
 */
	if (w <= 0 || h <= 0)
		return;

/* count full groups of four output words, left = remaining 0..3 words */
	count = w >> 2;
	left = w - (count << 2);

/* GSR scale factor for vis_fpack16; presumably chosen so the sum of two samples is halved */
	vis_write_gsr(6 << 3);

	for (i = 0; i < h; i++, y += slb, u += slb, v += slb, uyvy += dlb) {
		if ((((mlib_addr)u | (mlib_addr)v | (mlib_addr)y | (mlib_addr)
			uyvy) & 7) == 0) {
			mlib_d64 w_y, w_u, w_v, w_uv, w_tmp0, w_tmp1, w_acc0,
				w_acc1;
			mlib_f32 v_one = vis_to_float(0x1000000);
			mlib_f32 v_u, v_v;
			mlib_s32 j;

/* 8 pixels in, 4 UYVY words (two mlib_d64) out */
#pragma pipeloop(0)
			for (j = 0; j < count; j++) {
				w_y = ((mlib_d64 *)y)[j];
				w_u = ((mlib_d64 *)u)[j];
				w_v = ((mlib_d64 *)v)[j];

/* two merges regroup the U bytes so that pair members land in matching lanes */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

/* add the pair members and pack back to bytes: four averaged U samples */
				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

/* same for V */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
/* U V U V ... then interleave with luma */
				w_uv = vis_fpmerge(v_u, v_v);

				((mlib_d64 *)uyvy)[2 * j] =
					VIS_FPMERGE_HI(w_uv, w_y);
				((mlib_d64 *)uyvy)[2 * j + 1] =
					VIS_FPMERGE_LO(w_uv, w_y);
			}

/*
 * Tail: compute one more full group into a local buffer and copy
 * only the `left` valid output words.
 */
			if (left) {
				mlib_d64 res_buf[2];

				w_y = ((mlib_d64 *)y)[count];
				w_u = ((mlib_d64 *)u)[count];
				w_v = ((mlib_d64 *)v)[count];

				w_tmp0 = vis_fpmerge(vis_read_hi(w_u),
					vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));

				w_tmp0 = vis_fpmerge(vis_read_hi(w_v),
					vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0),
					vis_read_lo(w_tmp0));

				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1),
					v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1),
					v_one);

				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);

				res_buf[0] = VIS_FPMERGE_HI(w_uv, w_y);
				res_buf[1] = VIS_FPMERGE_LO(w_uv, w_y);

				for (j = 0; j < left; j++) {
					((mlib_f32 *)uyvy)[4 * count + j] =
						((mlib_f32 *)res_buf)[j];
				}
			}
		} else {

/* scalar path for rows with any unaligned pointer */
#pragma pipeloop(0)
			for (j2 = 0; j2 < w; j2++) {
				mlib_s32 j = 2 * j2;
				mlib_s32 j1 = j + 1;

				val_y0 = y[j];
				val_y1 = y[j1];
				val_u0 = (u[j] + u[j1]) >> 1;
				val_v0 = (v[j] + v[j1]) >> 1;
/* pack in unsigned arithmetic: U << 24 overflows a signed int for U >= 128 */
				uyvy[j2] =
					((mlib_u32)val_u0 << 24) |
					((mlib_u32)val_y0 << 16) |
					((mlib_u32)val_v0 << 8) |
					(mlib_u32)val_y1;
			}
		}
	}
}