/*
 * Copy `size` bytes from src to dst while XORing the data with the
 * constant 0x8000 replicated into every 16-bit lane of a 64-bit word.
 *
 * dst and src may have any alignment: stores go to 8-byte aligned
 * destination words (partial stores at both ends), and the matching
 * source bytes are assembled from two neighbouring aligned source words
 * with vis_faligndata().
 *
 *   src  - source line
 *   dst  - destination line
 *   size - number of bytes to process
 */
void
mlib_ImageLineXor8000(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 size)
{
	/* XOR pattern: 0x8000 in each 16-bit lane */
	mlib_d64 mask8000 = vis_to_double_dup(0x80008000);
	/* destination address rounded down to an 8-byte boundary */
	mlib_d64 *out = (mlib_d64 *)((mlib_addr)dst & (~7));
	/* aligned source pointer, paired with the GSR alignment offset */
	mlib_d64 *in;
	/* address of the last destination byte */
	mlib_u8 *last;
	/* two consecutive aligned source words and the result word */
	mlib_d64 cur, nxt, res;
	/* byte position relative to dst; zero or negative at the start */
	mlib_s32 pos;
	/* partial-store byte mask */
	mlib_s32 edge;

	pos = (mlib_addr)out - (mlib_addr)dst;
	last = (mlib_u8 *)dst + size - 1;

	/* shift the source by the same amount the destination was rounded */
	in = (mlib_d64 *)VIS_ALIGNADDR(src, pos);
	edge = vis_edge8(dst, last);

	nxt = vis_ld_d64_nf(in);

	/* leading destination word that is only partially covered */
	if (edge != 0xff) {
		cur = nxt;
		nxt = vis_ld_d64_nf(in + 1);
		res = vis_fxor(vis_faligndata(cur, nxt), mask8000);
		vis_pst_8(res, out, edge);
		out++;
		in++;
		pos += 8;
	}

	/* full words whose look-ahead load uses a plain (faulting) load */
#pragma pipeloop(0)
	for (; pos <= (size - 16); pos += 8) {
		cur = nxt;
		nxt = in[1];
		*out = vis_fxor(vis_faligndata(cur, nxt), mask8000);
		out++;
		in++;
	}

	/* last full word: the look-ahead uses the non-faulting load */
	if (pos <= (size - 8)) {
		cur = nxt;
		nxt = vis_ld_d64_nf(in + 1);
		*out = vis_fxor(vis_faligndata(cur, nxt), mask8000);
		out++;
		in++;
		pos += 8;
	}

	/* trailing partial word */
	if (pos < size) {
		cur = nxt;
		nxt = vis_ld_d64_nf(in + 1);
		res = vis_fxor(vis_faligndata(cur, nxt), mask8000);
		edge = vis_edge8(out, last);
		vis_pst_8(res, out, edge);
	}
}
Example #2
0
/*
 * Write the bitwise complement of `size` source bytes to the destination.
 *
 * Neither pointer has to be 8-byte aligned: the destination is written
 * in aligned 64-bit words (masked partial stores at both ends) and the
 * source bytes for each word are extracted from two adjacent aligned
 * source words with vis_faligndata().
 *
 *   sa   - source bytes
 *   da   - destination bytes
 *   size - number of bytes to process
 */
void
mlib_v_ImageNot_na(
    mlib_u8 *sa,
    mlib_u8 *da,
    mlib_s32 size)
{
	/* destination rounded down to an 8-byte boundary */
	mlib_d64 *out = (mlib_d64 *)((mlib_addr)da & (~7));
	/* aligned source pointer matching the GSR alignment offset */
	mlib_d64 *in;
	/* last destination byte */
	mlib_u8 *last = da + size - 1;
	/* two consecutive aligned source words */
	mlib_d64 cur, nxt;
	/* byte position relative to da; zero or negative at the start */
	mlib_s32 pos = (mlib_addr)out - (mlib_addr)da;
	/* partial-store byte mask */
	mlib_s32 edge;

	/* apply the destination rounding offset to the source as well */
	in = (mlib_d64 *)vis_alignaddr(sa, pos);
	edge = vis_edge8(da, last);

	nxt = vis_ld_d64_nf(in);

	/* leading partial destination word */
	if (edge != 0xff) {
		cur = nxt;
		nxt = vis_ld_d64_nf(in + 1);
		cur = vis_faligndata(cur, nxt);
		vis_pst_8(vis_fnot(cur), out, edge);
		out++;
		in++;
		pos += 8;
	}

	/* complete destination words */
#pragma pipeloop(0)
	for (; pos <= (size - 8); pos += 8) {
		cur = nxt;
		nxt = vis_ld_d64_nf(in + 1);
		*out = vis_fnot(vis_faligndata(cur, nxt));
		out++;
		in++;
	}

	/* trailing partial destination word */
	if (pos < size) {
		cur = vis_faligndata(nxt, vis_ld_d64_nf(in + 1));
		edge = vis_edge8(out, last);
		vis_pst_8(vis_fnot(cur), out, edge);
	}
}
/*
 * Upsample one source row by pixel replication into two destination rows.
 *
 * Every source byte is emitted twice in a row (vis_fpmerge of a 32-bit
 * half with itself) and the identical result is stored to both dst0 and
 * dst1, so each destination row receives 2 * n bytes.
 *
 * src, dst0 and dst1 are accessed through mlib_d64 pointers without any
 * realignment.  The edge mask for the tail is computed from dst0 only
 * and reused for dst1.
 *
 *   dst0, dst1 - destination rows
 *   src        - source row
 *   n          - number of source bytes
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoUpSample420_Nearest(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *row0 = (mlib_d64 *)dst0;
	mlib_d64 *row1 = (mlib_d64 *)dst1;
	/* last byte of the first destination row */
	mlib_u8 *last0 = dst0 + 2 * n - 1;
	mlib_d64 pix, twice;
	mlib_s32 edge, i;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* 8 source bytes -> 16 bytes in each destination row */
#pragma pipeloop(0)
	for (i = 0; i <= (n - 8); i += 8) {
		pix = *in;
		in++;

		twice = vis_fpmerge(vis_read_hi(pix), vis_read_hi(pix));
		row1[0] = twice;
		row0[0] = twice;

		twice = vis_fpmerge(vis_read_lo(pix), vis_read_lo(pix));
		row1[1] = twice;
		row0[1] = twice;

		row0 += 2;
		row1 += 2;
	}

	/* nothing left over */
	if (i >= n)
		return (MLIB_SUCCESS);

	/* 1..7 remaining source bytes: non-faulting load, masked stores */
	pix = vis_ld_d64_nf(in);

	twice = vis_fpmerge(vis_read_hi(pix), vis_read_hi(pix));
	edge = vis_edge8(row0, last0);
	vis_pst_8(twice, row0, edge);
	vis_pst_8(twice, row1, edge);
	i += 4;
	row0++;
	row1++;

	/* more than four bytes remained: emit the low half too */
	if (i < n) {
		twice = vis_fpmerge(vis_read_lo(pix), vis_read_lo(pix));
		edge = vis_edge8(row0, last0);
		vis_pst_8(twice, row0, edge);
		vis_pst_8(twice, row1, edge);
	}

	return (MLIB_SUCCESS);
}
mlib_status
__mlib_VideoColorSplit3(
	mlib_u8 *color1,
	mlib_u8 *color2,
	mlib_u8 *color3,
	const mlib_u8 *colors,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 *dp2 = (mlib_d64 *)color3;
	mlib_d64 sd0, sd1, sd2, dd0, dd1, dd2;
	mlib_d64 sda, sdb, sdc, sdd, sde;
	mlib_s32 i;

/*
 * 8-pixels loop
 */
#pragma pipeloop(0)
	for (i = 0; i < (n / 8); i++) {
		sd0 = (*sp++);
		sd1 = (*sp++);
		sd2 = (*sp++);
		MLIB_SPLIT3_U8(sd0, sd1, sd2, dd0, dd1, dd2);
		(*dp0++) = dd0;
		(*dp1++) = dd1;
		(*dp2++) = dd2;
	}

/*
 * last 8 pixels
 */

	if (n & 7) {
		mlib_s32 emask = (0xFF00 >> (n & 7)) & 0xFF;

		sd0 = (*sp++);
		sd1 = vis_ld_d64_nf(sp); sp++;
		sd2 = vis_ld_d64_nf(sp);
		MLIB_SPLIT3_U8(sd0, sd1, sd2, dd0, dd1, dd2);
		vis_pst_8(dd0, dp0, emask);
		vis_pst_8(dd1, dp1, emask);
		vis_pst_8(dd2, dp2, emask);
	}
mlib_status
__mlib_VideoColorSplit4(
	mlib_u8 *color1,
	mlib_u8 *color2,
	mlib_u8 *color3,
	mlib_u8 *color4,
	const mlib_u8 *colors,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 *dp2 = (mlib_d64 *)color3;
	mlib_d64 *dp3 = (mlib_d64 *)color4;
	mlib_d64 sd01, sd23, sd45, sd67, dd0, dd1, dd2, dd3;
	mlib_d64 sd04, sd26, sd15, sd37, dh0, dh1, dl0, dl1;
	mlib_s32 i;

	MLIB_LOAD_PREP_U8(sp);
/*
 * 8-pixels loop
 */
#pragma pipeloop(0)
	for (i = 0; i < (n / 8); i++) {
		MLIB_LOAD_SPLIT4_U8(sp, dd0, dd1, dd2, dd3);
		(*dp0++) = dd0;
		(*dp1++) = dd1;
		(*dp2++) = dd2;
		(*dp3++) = dd3;
	}

/*
 * last 8 pixels
 */

	if (n & 7) {
		mlib_s32 emask = (0xFF00 >> (n & 7)) & 0xFF;

		MLIB_LOAD_SPLIT4_U8(sp, dd0, dd1, dd2, dd3);
		vis_pst_8(dd0, dp0, emask);
		vis_pst_8(dd1, dp1, emask);
		vis_pst_8(dd2, dp2, emask);
		vis_pst_8(dd3, dp3, emask);
	}
mlib_status
__mlib_SignalConvertShift_U8_F32_Sat(
    mlib_u8 *dst,
    const mlib_f32 *src,
    mlib_s32 shift,
    mlib_s32 xsize)
{
	mlib_s32 i, off;
	mlib_d64 *sp, *dp;
	mlib_d64 dd, dd_old;
	type_union_mlib_d64 sd0, sd1, sd2, sd3;
	mlib_f32 fl_c;

	if (xsize <= 0)
		return (MLIB_FAILURE);
	if (!src || !dst)
		return (MLIB_NULLPOINTER);

	PREPARE_CONST(fl_c, shift + 8);

	if ((mlib_addr)src & 7) {
		mlib_f32 x = (*src++) * fl_c;

		if (x >= MLIB_U8_MAX)
			x = MLIB_U8_MAX;
		if (x <= MLIB_U8_MIN)
			x = MLIB_U8_MIN;
		(*dst++) = x;
		xsize--;
	}

	vis_write_gsr(23 << 3);

	off = ((mlib_addr)dst & 7);
	sp = (mlib_d64 *)src;
	dp = (mlib_d64 *)(dst - off);

	if (off == 0) {
#pragma pipeloop(0)
		for (i = 0; i <= (xsize - 8); i += 8) {
			CONVERT_U8_F32();
			(*dp++) = dd;
		}

		if (i < xsize) {
			mlib_s32 emask = 0xFF00 >> (xsize - i);

			CONVERT_U8_F32();
			vis_pst_8(dd, dp, emask);
		}
	} else {
Example #7
0
void ADD_SUFF(IntArgbBmToIntArgbConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dd, dmask, dFF;
    mlib_s32 i, i0, j, x, mask;

    if (dstScan == 4*width && srcScan == 4*width) {
	width *= height;
	height = 1;
    }

    dmask = vis_to_double_dup(0xFFFFFF);
    dFF = vis_to_double_dup(0xFFFFFFFF);

    for (j = 0; j < height; j++) {
	mlib_s32 *src = srcBase;
	mlib_s32 *dst = dstBase;

	i = i0 = 0;

	if ((mlib_s32)dst & 7) {
	    x = src[i];
	    dst[i] = (x << 7) >> 7;
	    i0 = 1;
	}

#pragma pipeloop(0)
	for (i = i0; i <= (mlib_s32)width - 2; i += 2) {
	    mlib_u8 *pp0 = (mlib_u8*)(src + i);
	    mlib_u8 *pp1 = (mlib_u8*)(src + i + 1);
	    dd = vis_freg_pair(*(mlib_f32*)pp0, *(mlib_f32*)pp1);
	    dd = vis_fand(dd, dmask);
#if 1
	    mask = ((*pp0 & 1) << 7) | ((*pp1 & 1) << 3);
	    *(mlib_d64*)(dst + i) = dd;
	    vis_pst_8(dFF, dst + i, mask);
#else
	    mask = ((*pp0 & 1) << 1) | (*pp1 & 1);
	    dd = vis_for(dd, ((mlib_d64*)vis_amask_arr)[mask]);
	    *(mlib_d64*)(dst + i) = dd;
#endif
	}

	if (i < width) {
	    x = src[i];
	    dst[i] = (x << 7) >> 7;
	}
/*
 * 3x3 convolution driver built on VIS intrinsics and file-level helper
 * macros.
 *
 * Visible structure:
 *   1. program the GSR scale field from scalef_expon and build the
 *      rounding constant;
 *   2. replicate the low 4 bits of cmask into a 16-bit pattern;
 *   3. load two source rows into intermediate buffers, then for each of
 *      the dh output rows load a third row, accumulate the nine taps
 *      into the intermediate destination buffer, and copy that buffer
 *      to the destination row with edge- and channel-masked stores;
 *   4. free the intermediate buffer and return MLIB_SUCCESS.
 *
 * NOTE(review): GET_SRC_DST_PARAMETERS, LOAD_KERNEL_INTO_FLOAT,
 * PREPARE_INTERM_BUFFERS, PREPARE_TO_LOAD_LINE, LOAD_LINE_INTO_BUFFER,
 * LOOP_INI, CONV_AU and CONV_AL are defined outside this view. They
 * evidently assign many of the locals below (adr_src, adr_dst, slb, dlb,
 * dw, dh, k1k2..k9k9, buff_src, sbuf1..3, dbuf, s1..s3, ddst, da, dend,
 * out0/out1, tmp0/tmp1), because none of those is assigned in the
 * visible code. Confirm against the macro definitions before changing
 * any local name.
 */
mlib_status
mlib_v_conv3x3_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data */
	mlib_d64 d1, d2, d_1, d_2, d21, d22;

/* data */
	mlib_d64 d3, d_3, d23;
	mlib_f32 k1k2, k3k4, k5k6, k7k8, k9k9;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1, sd00;
	mlib_s32 emask, cmask1;
	mlib_s32 rval, gsr_scale, i, j;

/* GSR scale field (used by vis_fpack16_pair) derived from the exponent */
	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
/* rounding term looked up per scale, duplicated into both halves */
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

/* replicate the 4-bit channel mask four times (16 bits in total) */
	cmask = ((cmask & 0xf) << 4) + (cmask & 0xf);
	cmask = (cmask << 8) + (cmask);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

/* row stride of the intermediate buffer, in 8-byte units (4*dw+24 bytes) */
	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

/* output area is 2 pixels narrower and 2 rows shorter; dw becomes bytes */
	dw -= 2;
	dw *= 4;
	dh -= 2;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
/* first output byte: one row down and 4 bytes in from the dst origin */
	d_a = adr_dst + dlb + 4;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

/* fetch the next source row into the intermediate buffer */
		PREPARE_TO_LOAD_LINE(sbuf3, sa2);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(8);

/* set the GSR align offset from (s1 + 4) for the vis_faligndata calls below */
		vis_alignaddr(s1, 4);
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;

/* accumulate the 3x3 taps for 8 output bytes per iteration */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k3k4);
			CONV_AU(d3, k7k8);
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			d23 = vis_faligndata(d3, d_3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k5k6);
			CONV_AL(d23, k7k8);
			CONV_AU(d_1, k3k4);
			CONV_AL(d_2, k5k6);
			CONV_AU(d_3, k9k9);
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

/* copy the packed row from dbuf to the (possibly unaligned) dst row */
		ddst = dbuf;
/* prepare the destination addresses */
		dp = (mlib_d64 *)((mlib_addr)da & (~7));
		i = (mlib_addr)dp - (mlib_addr)da;
/* i <= 0 here: shift the channel mask by the rounding distance */
		cmask1 = cmask >> (-i);
		ddst = vis_alignaddr(ddst, i);
/* generate edge mask for the start point */
		emask = vis_edge8(da, dend);
		sd1 = ddst[0];

/* leading partial word: both edge and channel masks apply */
		if (emask != 0xff) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd0 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd0, dp++, emask & cmask1);
			ddst++;
			i += 8;
		}
/* full words: only the channel mask applies */
#pragma pipeloop(0)
		for (; i <= (dw - 8); i += 8) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd00 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd00, dp++, cmask1);
			ddst++;
		}

/* trailing partial word */
		if (i < dw) {
			sd0 = vis_faligndata(sd1, ddst[1]);
			emask = vis_edge8(dp, dend);
			vis_pst_8(sd0, dp, emask & cmask1);
		}

/* advance to the next source and destination rows */
		sa2 = sa2 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * Table look-up of a line of signed 16-bit values into unsigned bytes.
 *
 * Element k of the line is translated through table (k mod 4):
 * positions 0, 4, ... use table0, positions 1, 5, ... use table1, and
 * so on.  Each table is indexed directly with the signed source value.
 *
 * The destination is written through an mlib_d64 pointer without any
 * realignment.
 *
 * Result bytes are pushed one at a time into 64-bit accumulators with
 * vis_faligndata() at a GSR align offset of 7.  The accumulators are
 * zero-initialized so that the first vis_faligndata() on each never
 * reads an indeterminate value; the bytes that come from the initial
 * contents are never stored (vis_bshuffle picks only the four freshly
 * shifted-in bytes of each accumulator, and the tail uses a masked
 * store), so the output is unaffected.
 *
 *   src     - source line of mlib_s16 values
 *   dst     - destination line of bytes
 *   xsize   - number of elements to process
 *   table0..table3 - look-up tables for positions 0..3 modulo 4
 */
void
mlib_v_ImageLookUp_S16_U8_124_D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 *table0,
    const mlib_u8 *table1,
    const mlib_u8 *table2,
    const mlib_u8 *table3)
{
/* pointer to source data */
	mlib_s16 *sp;

/* source data */
	mlib_s32 s0, s1, s2, s3;

/* source data */
	mlib_s32 s4, s5, s6, s7;

/* pointer to start of destination */
	mlib_u8 *dl;

/* pointer to end of destination */
	mlib_u8 *dend;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2;

/* destination data */
	mlib_d64 t3, t4, t5;

/* destination data; acc0 collects elements 0..3 of a group */
	mlib_d64 t6, t7, acc0 = 0.0;

/* edge mask */
	mlib_s32 emask;

/* loop variable */
	mlib_s32 i, num;

/* destination data; acc1 collects elements 4..7 of a group */
	mlib_d64 acc1 = 0.0;

	dl = dst;
	dp = (mlib_d64 *)dl;
	dend = dl + xsize - 1;
	sp = (void *)src;

/* align offset 7: each vis_faligndata(t, acc) shifts one byte of t in */
	vis_alignaddr((void *)0, 7);

	if (xsize >= 8) {

/* preload the first group of eight indices */
		s0 = sp[0];
		s1 = sp[1];
		s2 = sp[2];
		s3 = sp[3];
		s4 = sp[4];
		s5 = sp[5];
		s6 = sp[6];
		s7 = sp[7];
		sp += 8;

/* select bytes 0..3 of acc0 followed by bytes 0..3 of acc1 */
		vis_write_bmask(0x012389ab, 0);

/* look up the current group while loading the indices of the next one */
#pragma pipeloop(0)
		for (i = 0; i <= xsize - 16; i += 8, sp += 8) {
			t7 = VIS_LD_U8_I(table3, s7);
			t6 = VIS_LD_U8_I(table2, s6);
			t5 = VIS_LD_U8_I(table1, s5);
			t4 = VIS_LD_U8_I(table0, s4);
			t3 = VIS_LD_U8_I(table3, s3);
			t2 = VIS_LD_U8_I(table2, s2);
			t1 = VIS_LD_U8_I(table1, s1);
			t0 = VIS_LD_U8_I(table0, s0);
			acc1 = vis_faligndata(t7, acc1);
			acc1 = vis_faligndata(t6, acc1);
			acc1 = vis_faligndata(t5, acc1);
			acc1 = vis_faligndata(t4, acc1);
			acc0 = vis_faligndata(t3, acc0);
			acc0 = vis_faligndata(t2, acc0);
			acc0 = vis_faligndata(t1, acc0);
			acc0 = vis_faligndata(t0, acc0);
			s0 = sp[0];
			s1 = sp[1];
			s2 = sp[2];
			s3 = sp[3];
			s4 = sp[4];
			s5 = sp[5];
			s6 = sp[6];
			s7 = sp[7];
			(*dp++) = vis_bshuffle(acc0, acc1);
		}

/* last preloaded group: same look-up, but no further index loads */
		t7 = VIS_LD_U8_I(table3, s7);
		t6 = VIS_LD_U8_I(table2, s6);
		t5 = VIS_LD_U8_I(table1, s5);
		t4 = VIS_LD_U8_I(table0, s4);
		t3 = VIS_LD_U8_I(table3, s3);
		t2 = VIS_LD_U8_I(table2, s2);
		t1 = VIS_LD_U8_I(table1, s1);
		t0 = VIS_LD_U8_I(table0, s0);
		acc1 = vis_faligndata(t7, acc1);
		acc1 = vis_faligndata(t6, acc1);
		acc1 = vis_faligndata(t5, acc1);
		acc1 = vis_faligndata(t4, acc1);
		acc0 = vis_faligndata(t3, acc0);
		acc0 = vis_faligndata(t2, acc0);
		acc0 = vis_faligndata(t1, acc0);
		acc0 = vis_faligndata(t0, acc0);
		(*dp++) = vis_bshuffle(acc0, acc1);
	}

/* 1..7 elements left: walk them backwards, pushing bytes into acc0 */
	if ((mlib_addr)dp <= (mlib_addr)dend) {

		num = (mlib_addr)dend - (mlib_addr)dp;
/* sp now addresses the last remaining element */
		sp += num;
		num++;

/* handle the (num mod 4) trailing elements with the matching tables */
		if ((num & 3) == 1) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table0, s0);
			acc0 = vis_faligndata(t0, acc0);
			num--;
		} else if ((num & 3) == 2) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table1, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table0, s0);
			acc0 = vis_faligndata(t0, acc0);
			num -= 2;
		} else if ((num & 3) == 3) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table2, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table1, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table0, s0);
			acc0 = vis_faligndata(t0, acc0);
			num -= 3;
		}

/* num is now 0 or 4: one complete group of four still to push */
		if (num != 0) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table3, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table2, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table1, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table0, s0);
			acc0 = vis_faligndata(t0, acc0);
		}

/* store only the bytes up to dend */
		emask = vis_edge8(dp, dend);
		vis_pst_8(acc0, dp, emask);
	}
}
/*
 * Upsample a source row to two destination rows of 2 * n bytes each,
 * using the two neighbouring source rows for vertical filtering and the
 * horizontal neighbours for horizontal filtering.
 *
 *   dst0 is computed from src1 (weighted by fthree) and src0 (fone),
 *   dst1 is computed from src1 (weighted by fthree) and src2 (fone).
 *
 * The work is done in two passes:
 *   1. a vector pass that writes 2 * n bytes per destination row,
 *      pairing each column sum with the column to its right
 *      (vis_faligndata with a 2-byte align offset);
 *   2. an in-place pass over both destination rows with a 7-byte align
 *      offset, i.e. each 8-byte word is rebuilt from the last byte of
 *      the previous word and the first seven bytes of the current one.
 * The first and last byte of each destination row are then written by
 * the scalar expressions at the end of the function.
 *
 * All five pointers are accessed through mlib_d64 pointers without any
 * realignment.  The edge masks are computed from dst0 only and reused
 * for dst1.
 *
 *   dst0, dst1       - destination rows
 *   src0, src1, src2 - source rows; src1 carries the larger weight
 *   n                - number of source bytes per row
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoUpSample420(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src0,
	const mlib_u8 *src1,
	const mlib_u8 *src2,
	mlib_s32 n)
{
/* last byte of the first destination row */
	mlib_u8 *dend0 = dst0 + 2 * n - 1;
	mlib_d64 *dp0 = (mlib_d64 *)dst0;
	mlib_d64 *dp1 = (mlib_d64 *)dst1;
	mlib_d64 *sp0 = (mlib_d64 *)src0;
	mlib_d64 *sp1 = (mlib_d64 *)src1;
	mlib_d64 *sp2 = (mlib_d64 *)src2;
	mlib_d64 d00, d01, d10, d11, d20, d21;
	mlib_d64 thiscolsum0_hi, thiscolsum0_lo, lastcolsum0_hi, lastcolsum0_lo;
	mlib_d64 shiftcolsum0_hi, shiftcolsum0_lo;
	mlib_d64 thiscolsum1_hi, thiscolsum1_lo, lastcolsum1_hi, lastcolsum1_lo;
	mlib_d64 shiftcolsum1_hi, shiftcolsum1_lo;
	mlib_d64 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
	mlib_d64 ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7;
	mlib_d64 data0, data1, data2, data3, tmp0, tmp1;
/* multipliers for the vertical step (fone : fthree is 1 : 3) */
	mlib_f32 fone = vis_to_float(0x4000000);
	mlib_f32 fthree = vis_to_float(0xC000000);
/* multipliers for the horizontal step (fone1 : fthree1 is 1 : 3) */
	mlib_f32 fone1 = vis_to_float(0x40404040);
	mlib_f32 fthree1 = vis_to_float(0xC0C0C0C0);
/* rounding terms 7 and 8; the scalar edge formulas below use the same pair */
	mlib_d64 dseven = vis_to_double_dup(0x70007);
	mlib_d64 deight = vis_to_double_dup(0x80008);
	mlib_s32 i, emask;

	if (n <= 0)
		return (MLIB_FAILURE);

/* GSR: scale field 3 (for vis_fpack16_pair), align offset 2 bytes */
	vis_write_gsr((3 << 3) + 2);

/* prime the pipeline with the first 8 columns (non-faulting loads) */
	d00 = vis_ld_d64_nf(sp0);
	d10 = vis_ld_d64_nf(sp1);
	d20 = vis_ld_d64_nf(sp2);
	sp0++;
	sp1++;
	sp2++;
/* vertical sums: src0 (or src2) times fone plus src1 times fthree */
	lastcolsum0_hi = vis_fmul8x16au(vis_read_hi(d00), fone);
	lastcolsum0_lo = vis_fmul8x16au(vis_read_lo(d00), fone);
	lastcolsum1_hi = vis_fmul8x16au(vis_read_hi(d20), fone);
	lastcolsum1_lo = vis_fmul8x16au(vis_read_lo(d20), fone);
	tmp0 = vis_fmul8x16au(vis_read_hi(d10), fthree);
	tmp1 = vis_fmul8x16au(vis_read_lo(d10), fthree);
	lastcolsum0_hi = vis_fpadd16(lastcolsum0_hi, tmp0);
	lastcolsum0_lo = vis_fpadd16(lastcolsum0_lo, tmp1);
	lastcolsum1_hi = vis_fpadd16(lastcolsum1_hi, tmp0);
	lastcolsum1_lo = vis_fpadd16(lastcolsum1_lo, tmp1);

/*
 * main loop: emit the output for the previous 8 columns while computing
 * the vertical sums for the next 8 (needed as the right-hand neighbour)
 */
#pragma pipeloop(0)
	for (i = 0; i < n - 8; i += 8) {
		d01 = *sp0;
		d11 = *sp1;
		d21 = *sp2;
		sp0++;
		sp1++;
		sp2++;

		thiscolsum0_hi = vis_fmul8x16au(vis_read_hi(d01), fone);
		thiscolsum0_lo = vis_fmul8x16au(vis_read_lo(d01), fone);
		thiscolsum1_hi = vis_fmul8x16au(vis_read_hi(d21), fone);
		thiscolsum1_lo = vis_fmul8x16au(vis_read_lo(d21), fone);

		tmp0 = vis_fmul8x16au(vis_read_hi(d11), fthree);
		tmp1 = vis_fmul8x16au(vis_read_lo(d11), fthree);

		thiscolsum0_hi = vis_fpadd16(thiscolsum0_hi, tmp0);
		thiscolsum0_lo = vis_fpadd16(thiscolsum0_lo, tmp1);
		thiscolsum1_hi = vis_fpadd16(thiscolsum1_hi, tmp0);
		thiscolsum1_lo = vis_fpadd16(thiscolsum1_lo, tmp1);

/* contribution of the column itself, with both weights */
		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

/* column sums shifted by one 16-bit element: the right-hand neighbour */
		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, thiscolsum0_hi);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, thiscolsum1_hi);

/* rounding terms */
		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

/* contribution of the neighbour, with the complementary weights */
		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

/* scale back to bytes */
		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

/* interleave the two results per column: 8 columns -> 16 output bytes */
		dp0[0] = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		dp0[1] = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0));
		dp1[0] = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));
		dp1[1] = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2));

		dp0 += 2;
		dp1 += 2;
		lastcolsum0_hi = thiscolsum0_hi;
		lastcolsum0_lo = thiscolsum0_lo;
		lastcolsum1_hi = thiscolsum1_hi;
		lastcolsum1_lo = thiscolsum1_lo;
	}

/*
 * final 1..8 columns: same arithmetic, but there is no following source
 * word, so lastcolsum*_lo is used in place of thiscolsum*_hi, and the
 * results are written with edge-masked stores
 */
	if (i < n) {

		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, lastcolsum0_lo);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, lastcolsum1_lo);

		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

/* first 4 columns of the tail */
		acc0 = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		acc1 = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));

		emask = vis_edge8(dp0, dend0);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
		i += 4;
		dp0++;
		dp1++;

/* remaining columns of the tail, if any */
		if (i < n) {
			acc0 = vis_fpmerge(vis_read_lo(data1),
				vis_read_lo(data0));
			acc1 = vis_fpmerge(vis_read_lo(data3),
				vis_read_lo(data2));

			emask = vis_edge8(dp0, dend0);
			vis_pst_8(acc0, dp0, emask);
			vis_pst_8(acc1, dp1, emask);
		}
	}

/*
 * second pass, in place: align offset 7 makes vis_faligndata(prev, cur)
 * return the last byte of prev followed by the first 7 bytes of cur
 */
	vis_write_gsr(7);

	dp0 = (mlib_d64 *)dst0;
	dp1 = (mlib_d64 *)dst1;

/* for the first word "prev" and "cur" are the same data */
	ac0 = *dp0;
	ac2 = *dp1;

#pragma pipeloop(0)
	for (i = 0; i < 2 * n - 8; i += 8) {
		ac1 = *dp0;
		ac3 = *dp1;
		*dp0 = vis_faligndata(ac0, ac1);
		*dp1 = vis_faligndata(ac2, ac3);
		dp0++;
		dp1++;
		ac0 = ac1;
		ac2 = ac3;
	}

/* last (possibly partial) word of each row */
	if (i < 2 * n) {
		ac1 = vis_ld_d64_nf(dp0);
		ac3 = vis_ld_d64_nf(dp1);
		emask = vis_edge8(dp0, dend0);
		acc0 = vis_faligndata(ac0, ac1);
		acc1 = vis_faligndata(ac2, ac3);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
	}

/* first and last output byte have no horizontal neighbour: scalar code */
	dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4;
	dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4;
	dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4;
	dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4;

	return (MLIB_SUCCESS);
}
Example #11
0
/*
 * Dot product of two vectors of n complex elements, each element being
 * two consecutive bytes, so 2 * n bytes per vector.
 *
 * The two accumulated sums are returned as doubles in z[0] (sum_r) and
 * z[1] (sum_i).
 *
 * x is walked in 8-byte aligned words; y is read either directly (when
 * it shares x's offset within an 8-byte word) or through
 * vis_faligndata().  Bytes of x outside [px, pxend] are forced to zero
 * by a masked store into the zero-filled edge[] buffer.
 *
 * NOTE(review): the per-word arithmetic lives in the macros DPROD_U8C,
 * SUM_U8C and SUM_U8C_TAIL, which are defined outside this view and
 * evidently use the locals dx, dy, ds_r, ds_i, ds1_r, ds1_i and the
 * d_* temporaries by name. MAX_LOOP is also defined elsewhere and is
 * #undef'd at the end of this function; it presumably bounds the number
 * of words summed in 32-bit lanes before they are flushed into the
 * double accumulators - confirm.
 */
static void
mlib_VectorDotProd_U8C_al_x(
	mlib_d64 *z,
	const void *x,
	const void *y,
	mlib_s32 n)
/* The case of even address of vector x */
{
	mlib_u8 *pxend, *px = (mlib_u8 *)x, *py = (mlib_u8 *)y;
	mlib_d64 sum_r = 0.0, sum_i = 0.0;
	mlib_d64 *dpx, *dpy, *dpxend;
	mlib_d64 dx, dy, dy0, dy1;
	mlib_d64 dx_r, dy_r, dy_i;
	mlib_d64 d_iih, d_iil, d_irh, d_irl, d_rih, d_ril, d_rrh, d_rrl;
	mlib_d64 d_ih, d_il, d_rh, d_rl;
	mlib_d64 ds_r, ds_i, ds1_r, ds1_i;
	mlib_d64 lb_mask = vis_to_double_dup(0x00FF00FF);
/* zero-filled scratch words used to mask out-of-range bytes of x */
	mlib_d64 edge[2];
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask, off;
	mlib_d64 done = vis_to_double_dup(0x1000100);

	edge[0] = edge[1] = 0;

/* round x down to an 8-byte boundary and shift y by the same amount */
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	off = (mlib_addr)dpx - (mlib_addr)px;
	dpy = vis_alignaddr((void *)py, off);
/* last byte of x and the aligned word that contains it */
	pxend = px + n + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
/* keep only the in-range bytes of the first x word; the rest stay 0 */
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];
	dy = vis_ld_d64_nf(dpy);

/* x and y share the same offset within an 8-byte word: no realignment */
	if (((((mlib_addr)px) ^ ((mlib_addr)py)) & 7) == 0) {
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
/* process at most MAX_LOOP words before flushing the partial sums */
			d_left = dpxend - dpx;

			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
			for (; d_left > 0; d_left--) {
				DPROD_U8C;
				SUM_U8C;
				dx = dpx[1];
				dy = dpy[1];
				dpx++;
				dpy++;
			}

/* fold the 32-bit lanes into the double accumulators */
			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}

	} else {
/* y has a different offset: build each y word from two aligned loads */
		dy1 = vis_ld_d64_nf(dpy+1);
		dy = vis_faligndata(dy, dy1);
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
			for (; d_left > 0; d_left--) {
				DPROD_U8C;
				SUM_U8C;
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy+2);
				dx = vis_ld_d64_nf(dpx+1);
				dy = vis_faligndata(dy0, dy1);
				dpx++;
				dpy++;
			}

/* fold the 32-bit lanes into the double accumulators */
			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	}

/* last word of x: zero the bytes past pxend before multiplying */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		DPROD_U8C;
		SUM_U8C_TAIL;
		fsum = vis_read_hi(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = sum_r;
	z[1] = sum_i;
#undef MAX_LOOP
}
/*
 * Convert a line of 3-byte-per-pixel source data into one byte per
 * pixel using three weights.
 *
 * The destination may be unaligned: output goes to 8-byte aligned words
 * with edge-masked stores at both ends.  Each 8 destination bytes
 * consume 24 source bytes, which are realigned with vis_faligndata()
 * from four consecutive aligned source words.
 *
 *   src    - source line, 3 bytes per output byte
 *   dst    - destination line
 *   dsize  - number of destination bytes
 *   weight - three weights; each is multiplied by 8192 before use
 *
 * NOTE(review): the per-pixel arithmetic is in the macros
 * CHANNELSEPARATE_U8 and CHANNELWEIGHT_U8, defined outside this view.
 * The locals alpha, beta, gamma, d32, e32, ddt and mask0..mask3 are not
 * referenced in the visible code and are presumably consumed by those
 * macros by name - confirm before renaming or removing any of them.
 *
 * NOTE(review): the first group always reads four aligned source words
 * with plain loads, whatever dsize is - confirm that callers guarantee
 * this much readable source.
 */
void
mlib_v_ImageColorRGB2Mono_U8_D1(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 dsize,
    const mlib_d64 *weight)
{
    mlib_u8 *dst_end;
    mlib_d64 dd, d0, d1, d2, d3;
    mlib_d64 rgdd0, bdd0, rgdd1, bdd1, ddt;
    mlib_d64 *src_all, *dp;
    mlib_f32 d32, e32, alpha, gamma, beta;
    mlib_d64 sd0, sd1, sd2;
    mlib_s32 i, emask;
    mlib_s32 off;
    mlib_s32 mask0 = 0x0369147a;
    mlib_s32 mask1 = 0x258b258b;
    mlib_s32 mask2 = 0x47ad58be;
    mlib_s32 mask3 = 0x69cf69cf;

    /* prepare the weight */
    alpha = vis_to_float(weight[0] * 8192);
    beta = vis_to_float(weight[1] * 8192);
    gamma = vis_to_float(weight[2] * 8192);
    /* GSR scale field = 2 */
    vis_write_gsr(2 << 3);

    /* round the destination down to an 8-byte boundary; off <= 0 */
    dp = (mlib_d64 *)((mlib_addr)dst & (~7));
    off = (mlib_addr)dp - (mlib_addr)dst;

    dst_end = dst + (dsize - 1);
    emask = vis_edge8(dst, dst_end);
    /* the source moves 3 bytes for every destination byte */
    src_all = vis_alignaddr((void *)src, (3 * off));

    /* first group: four aligned words yield 24 realigned source bytes */
    d0 = (*src_all++);
    d1 = (*src_all++);
    d2 = (*src_all++);
    d3 = (*src_all++);

    sd0 = vis_faligndata(d0, d1);
    sd1 = vis_faligndata(d1, d2);
    sd2 = vis_faligndata(d2, d3);

    CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
    CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
    /* leading (possibly partial) destination word */
    vis_pst_8(dd, dp, emask);
    dp++;

    /* full destination words; d3 carries over as the next d0 */
#pragma pipeloop(0)
    for (i = 8 + off; i <= (dsize - 8); i += 8) {
        d0 = d3;
        d1 = (*src_all++);
        d2 = (*src_all++);
        d3 = (*src_all++);

        sd0 = vis_faligndata(d0, d1);
        sd1 = vis_faligndata(d1, d2);
        sd2 = vis_faligndata(d2, d3);

        CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
        CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
        (*dp++) = dd;
    }

    /* trailing partial destination word */
    if ((mlib_addr)dp <= (mlib_addr)dst_end) {

        emask = vis_edge8(dp, dst_end);
        d0 = d3;
        d1 = (*src_all++);
        d2 = (*src_all++);
        d3 = (*src_all++);
        sd0 = vis_faligndata(d0, d1);
        sd1 = vis_faligndata(d1, d2);
        sd2 = vis_faligndata(d2, d3);

        CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
        CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
        vis_pst_8(dd, dp, emask);
    }
}
Example #13
0
mlib_status
__mlib_VectorSubS_U8_U8_Mod(
	mlib_u8 *z,
	const mlib_u8 *x,
	const mlib_u8 *c,
	mlib_s32 n)
{
/* edge masks */
	mlib_s32 emask;

/* offset of address alignment in destination */
	mlib_s32 off;
	mlib_s8 *pzend;
	mlib_d64 *dpx, *dpz, *dpzend;
	mlib_d64 dx, dx0, dx1, dr0, dr1, dr;
	mlib_u8 uc = *((mlib_s8 *)c);
	mlib_d64 uncontrol_mask = vis_to_double_dup(0xff00ff00);

/* prepare the scaling factors */
	mlib_d64 dcl = vis_to_double_dup(uc | (uc << 16));
	mlib_d64 dch = vis_to_double_dup((uc << 8) | (uc << 24));
	mlib_s32 scal = uc << 24 | uc << 16 | uc << 8 | uc;
	mlib_s32 sr1, sr2, sr3, sr3_;
	mlib_s32 mask = 0x7f7f7f7f;
	mlib_s32 x8, x12;
	mlib_s32 nrest, i;

	if (n <= 0)
		return (MLIB_FAILURE);

	pzend = (mlib_s8 *)z + n - 1;
	dpzend = (mlib_d64 *)((mlib_addr)pzend & (~7));

/*
 * check for 64-bit aligned special case
 */

	if ((((mlib_addr)x | (mlib_addr)z) & 7) == 0) {

/*
 * We can process source and destination vectors by 16 bytes.
 */

		dpx = (mlib_d64 *)x;
		dpz = (mlib_d64 *)z;
#pragma pipeloop(0)
		for (i = 0; i < n >> 4; i++) {
			mlib_u64 ld0;

			dx = dpx[0];
			SUBS_S8_MOD;
			(*dpz++) = dr;

			ld0 = *((mlib_u64 *)dpx + 1);
			x8 = ld0 >> 32;
			sr1 = x8 ^ ~scal;
			sr2 = (scal | ~mask) - (x8 & mask);
			sr3 = (sr1 & ~mask) ^ sr2;
			x12 = ld0 & 0xFFFFFFFF;
			sr1 = x12 ^ ~scal;
			sr2 = (scal | ~mask) - (x12 & mask);
			sr3_ = (sr1 & ~mask) ^ sr2;
			(*dpz++) = vis_to_double(sr3, sr3_);
			dpx += 2;
		}

		nrest = n & 0xf;

		if (nrest >= 8) {
			dx = (*dpx++);
			SUBS_S8_MOD;
			(*dpz++) = dr;
			nrest -= 8;
		}

		if (nrest > 0) {
			dx = (*dpx++);
			SUBS_S8_MOD;
			emask = vis_edge8(dpz, pzend);
			vis_pst_8(dr, dpz, emask);
		}

	} else {
Example #14
0
mlib_status
__mlib_VectorSub_S8_S8_Sat(
	mlib_s8 *z,
	const mlib_s8 *x,
	const mlib_s8 *y,
	mlib_s32 n)
{
	mlib_d64 *dpz, *dpx, *dpy;
	mlib_d64 dx, dy, dz, dx0, dx1, dy0, dy1;
	mlib_d64 dxh, dxl, dyh, dyl, dzh, dzl;
	mlib_d64 dh, dl;
	mlib_s8 *pz = z, *px, *py, *pzend;

/* offset of address alignment in destination */
	mlib_s32 off;
	mlib_s32 len = n, i;

/* rest and leng in terms of 8 bytes. */
	mlib_s32 rest_8, even_8;

/* edge masks */
	mlib_s32 emask;
	mlib_d64 displacement = vis_to_double_dup(0x8000800);
	mlib_d64 restore = vis_to_double_dup(0x80808080);
	mlib_f32 fmul = vis_to_float(0x1000);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s8 *)x;
	py = (mlib_s8 *)y;

/* initialize GSR scale factor */
	vis_write_gsr(3 << 3);

	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
	off = (mlib_addr)dpz - (mlib_addr)z;
	pzend = pz + n - 1;
/*
 * generate edge mask for the start point
 */
	emask = vis_edge8(pz, pzend);

/*
 * prepare the source address
 */

	if (off) {
		dpy = (mlib_d64 *)vis_alignaddr(py, off);
		dy0 = vis_ld_d64_nf(dpy);
		dy1 = vis_ld_d64_nf(dpy + 1);
		dy = vis_faligndata(dy0, dy1);
		dpx = (mlib_d64 *)vis_alignaddr(px, off);
		dx0 = vis_ld_d64_nf(dpx);
		dx1 = vis_ld_d64_nf(dpx + 1);
		dx = vis_faligndata(dx0, dx1);
		SUB_S8_SAT;

/*
 * store first bytes of result
 */

		vis_pst_8(dz, dpz, emask);

		px += (8 + off);
		py += (8 + off);
		len -= (8 + off);
		dpz++;

		if (len <= 0)
			return (MLIB_SUCCESS);
	}

	even_8 = len >> 3;
	rest_8 = len & 0x7;

/*
 * Now try to analyze source "x" and "y" addresses.
 */

	if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned. No  vis_alignaddr
 * and  vis_faligndata at all.
 */

		dpx = (mlib_d64 *)px;
		dpy = (mlib_d64 *)py;

		dx = vis_ld_d64_nf(dpx);
		dpx++;
		dy = vis_ld_d64_nf(dpy);
		dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx);
			dy1 = vis_ld_d64_nf(dpy);
			SUB_S8_SAT;
			dx = dx1;
			dy = dy1;
/*
 * store 8 bytes of result
 */
			dpz[0] = dz;
			dpx++;
			dpy++;
			dpz++;
		}

		dx1 = dx;
		dy1 = dy;
	} else if ((!((mlib_addr)px & 7))) {

/*
 * First ("x") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "y".
 */

		dpx = (mlib_d64 *)px;
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy);
		dpy++;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);
		dx = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			SUB_S8_SAT;
			dx = vis_ld_d64_nf(dpx);
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
			dpx++;
			dpy++;
		}

		dx1 = dx;
		dy1 = dy0;
	} else if ((!((mlib_addr)py & 7))) {

/*
 * Second ("y") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "x".
 */

		dpy = (mlib_d64 *)py;
		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy = (*dpy++);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_SAT;
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
		}

		dy1 = vis_ld_d64_nf(dpy);
		dpy++;
	} else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") address are identically aligned.
 * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop.
 */

		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
		dpy = vis_alignaddr(py, 0);
		dy1 = vis_ld_d64_nf(dpy);
		dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;
			dy = vis_faligndata(dy0, dy1);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_SAT;
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
		}
	} else {

/*
 * Both ("x" and "y") address are arbitrary aligned.
 * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
 */

		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx);
		dpx++;
		dx1 = vis_ld_d64_nf(dpx);
		dx = vis_faligndata(dx0, dx1);
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy);
		dpy++;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);

/* #pragma pipeloop(0) */
		for (i = 0; i < even_8; i++) {
			SUB_S8_SAT;
			vis_alignaddr(py, 0);
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
			vis_alignaddr(px, 0);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx + 1);
			dx = vis_faligndata(dx0, dx1);
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
			dpy++;
			dpx++;
		}

		dx1 = dx0;
		dy1 = dy0;
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

	vis_alignaddr(px, 0);
	dx0 = dx1;
	dx1 = vis_ld_d64_nf(dpx);
	dx = vis_faligndata(dx0, dx1);
	vis_alignaddr(py, 0);
	dy0 = dy1;
	dy1 = vis_ld_d64_nf(dpy);
	dy = vis_faligndata(dy0, dy1);
	SUB_S8_SAT;

/*
 * prepare edge mask for the last bytes
 */

	emask = vis_edge8((void *)(rest_8), pzend);

/* store last bytes of result */
	vis_pst_8(dz, dpz, ~emask);

	return (MLIB_SUCCESS);
}
/*
 * Turn one 8-byte source word into four 16-bit partial results:
 * shuffle the word with the byte mask programmed by the caller, widen
 * both halves of the shuffled word to 16-bit lanes, add the two halves
 * lane by lane and finally add the rounding term.
 *
 * The macro works on the enclosing function's locals word, shuf, sum16,
 * odd16, unit and rnd; "load" is evaluated exactly once.
 */
#define	DOWNSAMPLE422_FOLD(load, result)				\
	do {								\
		word = (load);						\
		shuf = vis_bshuffle(word, word);			\
		sum16 = vis_fmul8x16au(vis_read_hi(shuf), unit);	\
		odd16 = vis_fmul8x16au(vis_read_lo(shuf), unit);	\
		sum16 = vis_fpadd16(sum16, odd16);			\
		result = vis_fpadd16(sum16, rnd);			\
	} while (0)

/*
 * Horizontal 2:1 down-sampling of a row of n 8-bit samples.
 *
 *	dst	destination row, written through an mlib_d64 pointer;
 *		n / 2 bytes are produced
 *	src	source row, read through an mlib_d64 pointer
 *	n	number of source samples
 *
 * Returns MLIB_FAILURE when n <= 0 and MLIB_SUCCESS otherwise.
 *
 * NOTE(review): both pointers are dereferenced as mlib_d64 without any
 * realignment, so they are presumably required to be 8-byte aligned --
 * confirm against the callers.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out = (mlib_d64 *)dst;
	mlib_d64 word;
	mlib_d64 shuf, first, second;
	mlib_d64 sum16, odd16;
	mlib_d64 rnd = vis_to_double_dup(0x1);
	mlib_f32 unit = vis_to_float(0x1000000);
	mlib_s32 pos, tail_mask;

	if (n <= 0)
		return (MLIB_FAILURE);

/* pack scale factor and byte-shuffle pattern used by the fold macro */
	vis_write_gsr(6 << 3);
	vis_write_bmask(0x02461357, 0);

/* full groups: 16 source bytes in, one 8-byte word out */
#pragma pipeloop(0)
	for (pos = 0; pos <= n - 16; pos += 16) {
		DOWNSAMPLE422_FOLD((*in++), first);
		DOWNSAMPLE422_FOLD((*in++), second);
		(*out++) = vis_fpack16_pair(first, second);
	}

/* nothing left over */
	if (!(pos < n))
		return (MLIB_SUCCESS);

/*
 * Remainder: the first word is read with a plain load, the second with
 * vis_ld_d64_nf; only the bytes up to dst + n / 2 - 1 are stored.
 */
	DOWNSAMPLE422_FOLD((*in++), first);
	DOWNSAMPLE422_FOLD(vis_ld_d64_nf(in), second);

	tail_mask = vis_edge8(out, (dst + (n / 2) - 1));
	vis_pst_8(vis_fpack16_pair(first, second), out, tail_mask);

	return (MLIB_SUCCESS);
}
#undef DOWNSAMPLE422_FOLD
/*
 * Convert a row of interleaved ARGB pixels into three separate planes
 * (y, cb, cr) with the chroma planes at half the horizontal resolution.
 *
 *	y	luma output, n bytes (yend = y + n)
 *	cb, cr	chroma outputs, n >> 1 bytes each (cbend = cb + (n >> 1))
 *	argb	source pixels, read 8 bytes at a time
 *	n	number of pixels
 *
 * Returns MLIB_FAILURE when n <= 0 and MLIB_SUCCESS otherwise.
 *
 * The channel split and the weighted sums are done by the macros
 * CHANNELSEPARATE_U8_422 and CHANNELWEIGHT_U8_2p and by the K11..K33
 * constants, all defined outside this function.
 *
 * NOTE(review): argb and y are dereferenced as mlib_d64 and cb/cr as
 * mlib_f32 without realignment, so they are presumably expected to be
 * 8-byte and 4-byte aligned respectively -- confirm against callers.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
/* one-past-the-end of the luma and cb outputs, taken before n is rescaled */
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;

/*
 * Fixed-point weights: the luma row is scaled by 8192, the two chroma
 * rows by 4096.  The chroma results z0 and z1 are added together before
 * packing below, which presumably is why they use half the luma scale.
 */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
/* additive offsets handed to the weighting macro (chroma / luma) */
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(2 << 3);

/* from here on n counts complete groups of 8 pixels */
	n = n >> 3;

/*
 * Main loop: each pass reads four 8-byte words (8 pixels) and writes
 * 8 bytes of y, 4 bytes of cb and 4 bytes of cr.
 */
#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
/* luma: two 4-lane results, merged byte-wise into one 8-byte store */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

/* cb: the two 4-lane results are summed, then packed to 4 bytes */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

/* cr: same scheme as cb with the third weight row */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

		py++;
		pcb++;
		pcr++;
	}

/*
 * Remainder: chroma output is still short of cbend, so one more group
 * is computed and stored under edge masks.
 */
	if ((mlib_u8 *)pcb < cbend) {
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

/* first word with a plain load, the following three via vis_ld_d64_nf */
		sd01 = (*sp++);
		sd23 = vis_ld_d64_nf(sp); sp++;
		sd45 = vis_ld_d64_nf(sp); sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

/* partial store of the last luma bytes, up to yend - 1 */
		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);
		cmask = vis_edge8(pcb, cbend - 1);

/*
 * The 4 chroma bytes are placed in whichever half of an 8-byte word the
 * mask selects: upper half stored at pcb/pcr, otherwise lower half
 * stored one mlib_f32 earlier.  Presumably this distinguishes pcb being
 * 8-byte aligned from pcb being only 4-byte aligned -- confirm.
 * NOTE(review): the cb mask is reused for cr, which assumes cb and cr
 * share the same alignment modulo 8.
 */
		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1,
				cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1,
				cmask);
		}
	}
	return (MLIB_SUCCESS);
}
/*
 * Convert a row of interleaved 4-channel (cmyk) pixels into four
 * separate byte planes y, cb, cr and k, n pixels each.
 *
 * The row is processed in chunks of at most SIZE pixels: every chunk is
 * first de-interleaved into buff_arr by mlib_channel_separate() and then
 * weighted with CHANNELWEIGHT_U8 (both defined outside this function).
 * The fourth channel is copied to k unchanged.
 *
 * NOTE(review): in this copy of the file the definition ends right after
 * the tail block -- the final return statement and the closing brace of
 * the function are missing.
 */
mlib_status
__mlib_VideoColorCMYK2JFIFYCCK444(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	mlib_u8 *k,
	const mlib_u8 *cmyk,
	mlib_s32 n)
{
/* scratch area receiving one de-interleaved chunk */
	mlib_d64 buff_arr[(SIZE / 2) + 2];
	mlib_f32 *py, *pcb, *pcr, *pk;
	mlib_d64 *buff;
	mlib_d64 sdh, sdl, dr, dg, db, dd;
	mlib_s32 i, m, size, num;

/* fixed-point weights, all three rows scaled by 8192 */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 8192));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 8192));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 8192));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 8192));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 8192));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 8192));
/* additive offsets handed to the weighting macro (chroma / luma) */
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off255 = vis_to_double_dup(0x1ff01ff0);

	vis_write_gsr(2 << 3);

/*
 * 4-pixel loop
 * (outer loop: one chunk of up to SIZE pixels per pass;
 * inner loop: 4 pixels, i.e. one mlib_f32 per plane, per pass)
 */
	for (size = 0; size < n; size += num) {

		num = n - size;

		if (num > SIZE)
			num = SIZE;

/* de-interleave the chunk, rounding up to whole 4-pixel groups */
		m = (num + 3) / 4;
		mlib_channel_separate((mlib_d64 *)cmyk + size / 2, buff_arr, m);

/* only an even number of complete 4-pixel groups is handled here */
		m = (num / 4) & ~1;
		py = (mlib_f32 *)y + size / 4;
		pcb = (mlib_f32 *)cb + size / 4;
		pcr = (mlib_f32 *)cr + size / 4;
		pk = (mlib_f32 *)k + size / 4;
		buff = buff_arr;
#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			sdh = buff[0];
			sdl = buff[1];
			CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
				vis_read_hi(sdl), k11, k12, k13, off255, py[0]);
			CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
				vis_read_hi(sdl), k21, k22, k23, off128,
				pcb[0]);
			CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
				vis_read_hi(sdl), k31, k32, k33, off128,
				pcr[0]);
			py++;
			pcb++;
			pcr++;
/* fourth channel passes through untouched */
			(*pk++) = vis_read_lo(sdl);
			buff += 2;
		}
	}

/*
 * Tail: n is not a multiple of 8.  Two more 4-pixel groups are computed
 * into the local rbuff (prbuff[0..1] = y, [2..3] = cb, [4..5] = cr,
 * [6..7] = k) and stored under a mask whose top (n & 7) bits are set.
 * NOTE(review): buff, py, pcb, pcr and pk are whatever the last chunk
 * iteration left behind; they are never assigned if the chunk loop did
 * not run -- confirm callers never pass such an n.
 */
	if (n & 7) {
		mlib_s32 emask = (0xFF00 >> (n & 7)) & 0xFF;
		mlib_d64 rbuff[4];
		mlib_f32 *prbuff = (mlib_f32 *)rbuff;

		sdh = (*buff++);
		sdl = (*buff++);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k11, k12, k13, off255, prbuff[0]);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k21, k22, k23, off128, prbuff[2]);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k31, k32, k33, off128, prbuff[4]);
		prbuff[6] = vis_read_lo(sdl);
		sdh = (*buff++);
		sdl = (*buff++);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k11, k12, k13, off255, prbuff[1]);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k21, k22, k23, off128, prbuff[3]);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k31, k32, k33, off128, prbuff[5]);
		prbuff[7] = vis_read_lo(sdl);

		vis_pst_8(rbuff[0], py, emask);
		vis_pst_8(rbuff[1], pcb, emask);
		vis_pst_8(rbuff[2], pcr, emask);
		vis_pst_8(rbuff[3], pk, emask);
	}
Example #18
0
/*
 * The case of even address of vector x.
 *
 * Accumulates, over n complex elements stored as interleaved byte pairs
 * (2 * n bytes per vector), a real and an imaginary sum and writes them
 * to z[0] and z[1] respectively.
 *
 * The per-word products and partial sums are produced by the macros
 * DPROD_U8C0, DPROD_U8C, SUM_U8C, SUM_U8C_TAIL and SET_ALIGN_U8C, which
 * are defined outside this function and operate on the locals declared
 * below by name.  MAX_LOOP is likewise defined outside and is
 * #undef'ed at the end of the function.
 */
static void
mlib_VectorDotProd_U8C_al_x(
	mlib_d64 *z,
	const void *x,
	const void *y,
	mlib_s32 n)
{
	mlib_u8 *pxend, *px = (mlib_u8 *)x, *py = (mlib_u8 *)y;
/* running totals kept in double precision */
	mlib_d64 sum_r = 0.0, sum_i = 0.0;
	mlib_d64 *dpx, *dpy, *dpxend;
	mlib_d64 dx, dy, dy0, dy1;
	mlib_d64 dx_r, dy_r, dy_i;
	mlib_d64 d_iih, d_iil, d_irh, d_irl, d_rih, d_ril, d_rrh, d_rrl;
	mlib_d64 d_ih, d_il, d_rh, d_rl;
	mlib_d64 ds_r, ds_i, ds1_r, ds1_i;
	mlib_d64 lb_mask = vis_to_double_dup(0x00FF00FF);
/* zeroed scratch words used to blank out bytes outside the vector */
	mlib_d64 edge[2], fzero = vis_fzero();
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask, off;

	edge[0] = edge[1] = 0;

/* round x down to an 8-byte boundary; off is the (non-positive) shift */
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	off = (mlib_addr)dpx - (mlib_addr)px;
	dpy = vis_alignaddr((void *)py, off);
/* last byte of x and the aligned word that contains it */
	pxend = px + n + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
/*
 * First x word: copy only the bytes inside [px, pxend] into the zeroed
 * scratch word, so bytes in front of the vector contribute zero.
 */
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];
	dy = vis_ld_d64_nf(dpy);

	if (((((mlib_addr)px) ^ ((mlib_addr)py)) & 7) == 0) {
/* x and y share the same offset within an 8-byte word */
		vis_write_bmask(0x781A3C5E, 0);
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

/*
 * At most MAX_LOOP words per burst; the vector accumulators are then
 * folded into the double totals.  Presumably this bounds the 32-bit
 * partial sums against overflow -- confirm against MAX_LOOP's definition.
 */
			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
#pragma pipeloop(0)
			for (; d_left > 0; d_left--) {
				DPROD_U8C0;
				SUM_U8C;
				dx = dpx[1];
				dy = dpy[1];
				dpx++;
				dpy++;
			}

/* fold the four 32-bit lanes into the scalar totals */
			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	} else {
/* y is shifted relative to x: realign y with a byte shuffle */
		mlib_s32 mask = ((mlib_addr)(py + off)) & 7;

		vis_write_bmask(0x11111111 * mask, 0x01234567);
		dy1 = vis_ld_d64_nf(dpy+1);
		dy = vis_bshuffle(dy, dy1);
		SET_ALIGN_U8C;
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
#pragma pipeloop(0)
			for (; d_left > 0; d_left--) {
				DPROD_U8C;
				SUM_U8C;
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy+2);
				dx = vis_ld_d64_nf(dpx+1);
				dy = vis_bshuffle(dy0, dy1);
				dpx++;
				dpy++;
			}

/* fold the four 32-bit lanes into the scalar totals */
			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	}

/*
 * Last (possibly partial) x word: again routed through the zeroed
 * scratch word so bytes past pxend contribute zero.
 */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		SET_ALIGN_U8C;
		DPROD_U8C;
		SUM_U8C_TAIL;
		fsum = vis_read_hi(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = sum_r;
	z[1] = sum_i;
#undef MAX_LOOP
}
Example #19
0
/*
 * Element-wise subtraction of two vectors of n 8-bit elements,
 * z = x - y, with wrap-around (modulo 256) arithmetic.
 *
 * Returns MLIB_FAILURE when n <= 0 and MLIB_SUCCESS otherwise.
 *
 * The 8-byte vector step is the SUB_S8_MOD macro, defined outside this
 * function; it works on the locals declared below by name (inputs dx and
 * dy, result dz).
 *
 * Two strategies are used:
 *  - x, y and z all 8-byte aligned: 16 bytes per pass, half of them by
 *    SUB_S8_MOD and half by 32-bit integer arithmetic;
 *  - otherwise: the destination is aligned first, then one of five
 *    loops is chosen according to the alignment of x and y.
 */
mlib_status
__mlib_VectorSub_U8_U8_Mod(
	mlib_u8 *z,
	const mlib_u8 *x,
	const mlib_u8 *y,
	mlib_s32 n)
{
/* 8-byte aligned start point in destination */
	mlib_d64 *dpz;

/* 8-byte aligned start point in source */
	mlib_d64 *dpx, *dpy;

/* source  data */
	mlib_d64 dx, dy, dx0;
	mlib_d64 dx1, dy0, dy1;

/* destination data */
	mlib_d64 dz;

/* intermediate result */
	mlib_d64 dh, dl;
	mlib_d64 dxl, dyl;

/* end point of a line in destination */
	mlib_u8 *pzend;

/* start point of a line in source */
	mlib_u8 *px, *py;

/* offset of address alignment in destination */
	mlib_s32 off;

/* edge mask */
	mlib_s32 emask;
/* end of the part of z that is processed 16 bytes at a time */
	mlib_u8 *pzend16;
/* temporaries of the 32-bit integer subtraction */
	mlib_s32 sr1, sr2, sr3;
	mlib_s32 x8, x12, y8, y12;
/* low 7 bits of each of the four bytes in a 32-bit word */
	mlib_s32 mask = 0x7f7f7f7f;
	mlib_u8 *pz;
	mlib_s32 n16;
	mlib_s32 nrest;
	mlib_s32 len = n, i;

/* remainder and length in terms of 8 bytes. */
	mlib_s32 rest_8, even_8;
	mlib_d64 mask_control = vis_to_double_dup(0xff00ff00);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_u8 *)x;
	py = (mlib_u8 *)y;
	pz = (mlib_u8 *)z;

/*
 * prepare the destination address
 */
	pzend = pz + n - 1;

/*
 * check for 64-bit aligned special case
 */

	if ((((mlib_addr)x | (mlib_addr)y | (mlib_addr)z) & 7) == 0) {

/*
 * We can process source and destination vectors by 16 bytes.
 */

		dpx = (mlib_d64 *)x;
		dx = vis_ld_d64_nf(dpx);
		dpy = (mlib_d64 *)y;
		dy = vis_ld_d64_nf(dpy);
		dpz = (mlib_d64 *)z;
		n16 = n & (~0xf);
		pzend16 = pz + n16;
/*
 * Bytes 8..15 of every 16-byte group are subtracted with 32-bit integer
 * arithmetic, four bytes at a time: the top bit of each byte of x is
 * forced to 1 and that of y to 0 so that the word-wide subtraction
 * cannot borrow across byte boundaries; the correct top bit of each
 * byte is then restored by the XOR with (x ^ ~y) & 0x80808080.
 * Bytes 0..7 go through SUB_S8_MOD on the preloaded dx / dy.
 */
#pragma pipeloop(0)
		while ((mlib_addr)pz < (mlib_addr)pzend16) {
			x8 = *((mlib_s32 *)(px + 8));
			y8 = *((mlib_s32 *)(py + 8));
			sr1 = x8 ^ ~y8;
			sr2 = (x8 | ~mask) - (y8 & mask);
			sr3 = (sr1 & ~mask) ^ sr2;
			*((mlib_s32 *)(pz + 8)) = sr3;
			x12 = *((mlib_s32 *)(px + 12));
			y12 = *((mlib_s32 *)(py + 12));
			sr1 = x12 ^ ~y12;
			sr2 = (x12 | ~mask) - (y12 & mask);
			sr3 = (sr1 & ~mask) ^ sr2;
			*((mlib_s32 *)(pz + 12)) = sr3;
			SUB_S8_MOD;
/* store 8 bytes of result */
			*((mlib_d64 *)pz) = dz;
/* preload the first 8 bytes of the next group */
			dx = vis_ld_d64_nf(px + 16);
			dy = vis_ld_d64_nf(py + 16);
			px += 16;
			py += 16;
			pz += 16;
		}

		dpz = (mlib_d64 *)pzend16;
		nrest = n - n16;

/* one more full 8-byte word, using the dx / dy already loaded */
		if (nrest >= 8) {
			SUB_S8_MOD;
			dpz[0] = dz;
			px += 8;
			py += 8;
			dpz++;
			nrest -= 8;
		}

/* final 1..7 bytes, stored under an edge mask */
		if (nrest > 0) {
			dx = *((mlib_d64 *)px);
			dy = *((mlib_d64 *)py);
			SUB_S8_MOD;
			emask = vis_edge8(dpz, pzend);
			vis_pst_8(dz, dpz, emask);
		}
	} else {

/*
 * General case.
 */

/* round z down to an 8-byte boundary; off is the (non-positive) shift */
		dpz = (mlib_d64 *)((mlib_addr)z & (~7));
		off = (mlib_addr)dpz - (mlib_addr)z;
/*
 * generate edge mask for the start point
 */
		emask = vis_edge8(pz, pzend);

/*
 * prepare the source address
 */

		if (off) {
/*
 * Destination is unaligned: produce the first partial word so that all
 * following stores are 8-byte aligned.
 */
			dpy = (mlib_d64 *)vis_alignaddr(py, off);
			dy0 = vis_ld_d64_nf(dpy);
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
			dpx = (mlib_d64 *)vis_alignaddr(px, off);
			dx0 = vis_ld_d64_nf(dpx);
			dx1 = vis_ld_d64_nf(dpx + 1);
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_MOD;

/*
 * store first bytes of result
 */

			vis_pst_8(dz, dpz, emask);

/* advance by the 8 + off bytes just produced (off is negative) */
			px += (8 + off);
			py += (8 + off);
			len -= (8 + off);
			dpz++;

			if (len <= 0)
				return (MLIB_SUCCESS);
		}

/* number of whole 8-byte words left, and the trailing byte count */
		even_8 = len >> 3;
		rest_8 = len & 0x7;

/*
 * Now try to analyze source "x" and "y" addresses.
 * Every branch below leaves dpx / dpy, dx1 and dy1 in the state the
 * common tail code after the branches expects.
 */

		if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned. No  vis_alignaddr
 * and  vis_faligndata at all.
 */

			dpx = (mlib_d64 *)px;
			dpy = (mlib_d64 *)py;

			dx = vis_ld_d64_nf(dpx);
			dpx++;
			dy = vis_ld_d64_nf(dpy);
			dpy++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				dx1 = vis_ld_d64_nf(dpx);
				dy1 = vis_ld_d64_nf(dpy);
				SUB_S8_MOD;
				dx = dx1;
				dy = dy1;
/*
 * store 8 bytes of result
 */
				dpz[0] = dz;
				dpx++;
				dpy++;
				dpz++;
			}

			dx1 = dx;
			dy1 = dy;
		} else if ((!((mlib_addr)px & 7))) {

/*
 * First ("x") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "y".
 */

			dpx = (mlib_d64 *)px;
			dpy = vis_alignaddr(py, 0);
			dy0 = vis_ld_d64_nf(dpy);
			dpy++;
			dy1 = vis_ld_d64_nf(dpy);
			dy = vis_faligndata(dy0, dy1);
			dx = vis_ld_d64_nf(dpx);
			dpx++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				SUB_S8_MOD;
				dx = vis_ld_d64_nf(dpx);
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy + 1);
				dy = vis_faligndata(dy0, dy1);
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
				dpx++;
				dpy++;
			}

			dx1 = dx;
			dy1 = dy0;
		} else if ((!((mlib_addr)py & 7))) {

/*
 * Second ("y") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "x".
 */

			dpy = (mlib_d64 *)py;
			dpx = vis_alignaddr(px, 0);
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				dy = (*dpy++);
				dx0 = dx1;
				dx1 = vis_ld_d64_nf(dpx);
				dpx++;
				dx = vis_faligndata(dx0, dx1);
				SUB_S8_MOD;
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
			}

			dy1 = vis_ld_d64_nf(dpy);
			dpy++;
		} else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") address are identically aligned.
 * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop.
 */

			dpx = vis_alignaddr(px, 0);
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dpy = vis_alignaddr(py, 0);
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy);
				dpy++;
				dy = vis_faligndata(dy0, dy1);
				dx0 = dx1;
				dx1 = vis_ld_d64_nf(dpx);
				dpx++;
				dx = vis_faligndata(dx0, dx1);
				SUB_S8_MOD;
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
			}
		} else {

/*
 * Both ("x" and "y") address are arbitrary aligned.
 * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
 */

			dpx = vis_alignaddr(px, 0);
			dx0 = vis_ld_d64_nf(dpx);
			dpx++;
			dx1 = vis_ld_d64_nf(dpx);
			dx = vis_faligndata(dx0, dx1);
			dpy = vis_alignaddr(py, 0);
			dy0 = vis_ld_d64_nf(dpy);
			dpy++;
			dy1 = vis_ld_d64_nf(dpy);
			dy = vis_faligndata(dy0, dy1);

/*
 * The align offset is switched between the y and the x offset before
 * each vis_faligndata, since the two sources are shifted differently.
 */
#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				SUB_S8_MOD;
				vis_alignaddr(py, 0);
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy + 1);
				dy = vis_faligndata(dy0, dy1);
				vis_alignaddr(px, 0);
				dx0 = dx1;
				dx1 = vis_ld_d64_nf(dpx + 1);
				dx = vis_faligndata(dx0, dx1);
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
				dpy++;
				dpx++;
			}

			dx1 = dx0;
			dy1 = dy0;
		}

		if (!rest_8)
			return (MLIB_SUCCESS);

/* common tail: build the last x and y words for the remaining bytes */
		vis_alignaddr(px, 0);
		dx0 = dx1;
		dx1 = vis_ld_d64_nf(dpx);
		dx = vis_faligndata(dx0, dx1);
		vis_alignaddr(py, 0);
		dy0 = dy1;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);
		SUB_S8_MOD;

/*
 * prepare edge mask for the last bytes
 * (rest_8, a value in 1..7, is passed in place of an address and the
 * complement of the returned mask is used for the store; presumably this
 * selects exactly the first rest_8 bytes of the word -- confirm against
 * the vis_edge8 definition)
 */

		emask = vis_edge8((void *)(rest_8), pzend);

/* store last bytes of result */
		vis_pst_8(dz, dpz, ~emask);
	}

	return (MLIB_SUCCESS);
}
/*
 * Convert planar y / u / v data with half-width chroma into interleaved
 * 4-byte ABGR pixels, for sources and destination of arbitrary alignment.
 *
 *	abgr		destination, width * 4 bytes per row
 *	y, u, v		source planes
 *	width, height	size of the image in pixels
 *	abgr_stride	byte distance between destination rows
 *	y_stride	byte distance between rows of y
 *	uv_stride	byte distance between rows of u and of v
 *
 * Always returns MLIB_SUCCESS.
 *
 * All stores go through vis_pst_8 with a mask derived from 0x7777, so
 * one byte of every pixel (the alpha channel, per the comment on emask)
 * is left as it was in the destination.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR422_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to u and v src address (current position / row start) */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* pointers to y src address (current position / row start) */
	mlib_u8 *sp1, *sl1;

/* pointers to dst address */
	mlib_u8 *dp, *dl, *dend;

/* aligned pointer to y */
	mlib_d64 *spy;

/* aligned pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;

/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
/* constant terms subtracted from / added to the r, g and b sums */
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
	mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo;
	mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo;
/* dd0 / dd1: previous and current 8 bytes of output, dd: realigned word */
	mlib_d64 dd, dd0, dd1;

/* loop variable */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
/* mask for the next store to dpp (edge mask combined with emask) */
	mlib_s32 emask1;
/* align offset used when shifting output words to the dst alignment */
	mlib_s32 off;
	mlib_f32 *dfu, *dfv;
	mlib_d64 du0, du1, dv0, dv1;
/* align offsets of the (byte-doubled) u and v streams */
	mlib_s32 off2, off3;
/* 0 or 1: whether the first store of a row advances dpp */
	mlib_s32 inc;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/*
 * u and v are read as 4-byte words from a 4-byte aligned address; every
 * byte is duplicated with vis_fpmerge below, hence the byte offset into
 * the doubled stream is twice the pointer misalignment.
 */
		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;

/* last byte of this destination row and the mask for its first word */
		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);
		i = dp - (mlib_u8 *)dpp;
		emask >>= i;
		inc = (emask1 != 0xff);
		emask1 &= emask;
		off = 8 - i;

/* prime the u pipeline */
		vis_alignaddr((void *)off2, 0);
		fu0 = vis_ld_f32_nf(dfu); dfu++;
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* prime the v pipeline */
		vis_alignaddr((void *)off3, 0);
		fv0 = vis_ld_f32_nf(dfv); dfv++;
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* prime the y pipeline */
		vis_alignaddr(sp1, 0);
		dy0 = vis_ld_d64_nf(spy); spy++;
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/*
 * 8-pixel column loop: each pass consumes 8 bytes of y and issues four
 * 8-byte masked stores (32 bytes of ABGR).  Loads for the next pass are
 * interleaved with the arithmetic of the current one.
 * NOTE(review): on the first pass of a row dd0 has not been assigned in
 * this row; the bytes taken from it are presumably the ones excluded by
 * emask1 -- confirm.
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

/* next 4 bytes of u */
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu); dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);
			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

/* next 4 bytes of v */
			vis_alignaddr((void *)off3, 0);

			fv1 = vis_ld_f32_nf(dfv); dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* switch the align offset to the destination shift for the stores */
			vis_alignaddr((void *)off, 0);
/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_hi),
				vis_read_lo(blue_red_hi));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_lo),
				vis_read_hi(blue_red_lo));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_lo),
				vis_read_lo(blue_red_lo));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

/* next 8 bytes of y */
			vis_alignaddr(sp1, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;
			emask1 = emask;
		}

/*
 * Fewer than 8 pixels left in the row: compute one more group and store
 * it two pixels (8 bytes) at a time while i < width.
 */
		if (i < width) {

			vis_alignaddr((void *)off, 0);
/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);

			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dd0 = dd1;
			dpp += inc;

			i += 2;

			if (i < width) {

				dd1 = vis_fpmerge(vis_read_lo(x_green_hi),
					vis_read_lo(blue_red_hi));
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 2;

				if (i < width) {
					dd1 = vis_fpmerge(vis_read_hi
						(x_green_lo),
						vis_read_hi(blue_red_lo));
					dd = vis_faligndata(dd0, dd1);
					vis_pst_8(dd, dpp++, emask);
					dd0 = dd1;
				}
			}
		}

/* flush the bytes still held in dd0 / dd1, limited to the end of row */
		vis_alignaddr((void *)off, 0);
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

/* step all planes to the next row */
		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
/* emask was shifted for this row's alignment; restore it */
		emask = 0x7777;
	}

	return (MLIB_SUCCESS);
}
/*
 * Table look-up from one 16-bit source channel to two 8-bit destination
 * channels: every source value s produces the two bytes table[0][s] and
 * table[1][s], so 2 * xsize bytes are written to dst.
 *
 * The first output byte is written with a scalar store; after that the
 * output is assembled 8 bytes at a time in acc and stored through dp.
 *
 * NOTE(review): dp = dst + 1 is stored through as mlib_d64 without any
 * realignment, so this variant presumably expects dst + 1 to be 8-byte
 * aligned -- confirm against the dispatching caller.
 */
void mlib_v_ImageLookUpSI_U16_U8_2_D1(const mlib_u16 *src,
                                      mlib_u8        *dst,
                                      mlib_s32       xsize,
                                      const mlib_u8  **table)
{
  mlib_u16 *sp;                        /* pointer to source data */
  mlib_s32 s0, s1, s2, s3, s4;         /* source data */
  mlib_u8 *dl;                         /* pointer to start of destination */
  mlib_u8 *dend;                       /* pointer to end of destination */
  mlib_d64 *dp;                        /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;                 /* destination data */
  mlib_d64 t3, t4, t5;                 /* destination data */
  mlib_d64 t6, t7, acc;                /* destination data */
  mlib_s32 emask;                      /* edge mask */
  mlib_s32 i, num;                     /* loop variable */
  const mlib_u8 *tab0 = &table[0][0];
  const mlib_u8 *tab1 = &table[1][0];

  sp = (void *)src;
  dl = dst;

  dend = dl + 2 * xsize - 1;

  /*
   * Align offset 7: each vis_faligndata(t, acc) below is used to push
   * one looked-up byte into the front of acc, which is why the bytes of
   * an output word are fed in last-to-first order.
   */
  vis_alignaddr((void *)0, 7);

  /* first output byte (channel 0 of the first pixel) is stored directly */
  s0 = *sp++;
  *dl++ = tab0[s0];
  dp = (mlib_d64 *) dl;
  xsize--;

  if (xsize >= 4) {

    s1 = sp[0];
    s2 = sp[1];
    s3 = sp[2];
    s4 = sp[3];
    sp += 4;

    /*
     * Each pass emits 8 bytes: channel 1 of s0 (whose channel 0 is
     * already out), both channels of s1..s3 and channel 0 of s4; s4 then
     * becomes the s0 of the next pass.  acc needs no initial value
     * because all 8 of its bytes are replaced before the store.
     */
#pragma pipeloop(0)
    for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
      t7 = VIS_LD_U8_I(tab0, s4);
      t6 = VIS_LD_U8_I(tab1, s3);
      t5 = VIS_LD_U8_I(tab0, s3);
      t4 = VIS_LD_U8_I(tab1, s2);
      t3 = VIS_LD_U8_I(tab0, s2);
      t2 = VIS_LD_U8_I(tab1, s1);
      t1 = VIS_LD_U8_I(tab0, s1);
      t0 = VIS_LD_U8_I(tab1, s0);
      acc = vis_faligndata(t7, acc);
      acc = vis_faligndata(t6, acc);
      acc = vis_faligndata(t5, acc);
      acc = vis_faligndata(t4, acc);
      acc = vis_faligndata(t3, acc);
      acc = vis_faligndata(t2, acc);
      acc = vis_faligndata(t1, acc);
      acc = vis_faligndata(t0, acc);
      s0 = s4;
      s1 = sp[0];
      s2 = sp[1];
      s3 = sp[2];
      s4 = sp[3];
      *dp++ = acc;
    }

    /* drain: the values loaded by the last pass (or the prologue) */
    t7 = VIS_LD_U8_I(tab0, s4);
    t6 = VIS_LD_U8_I(tab1, s3);
    t5 = VIS_LD_U8_I(tab0, s3);
    t4 = VIS_LD_U8_I(tab1, s2);
    t3 = VIS_LD_U8_I(tab0, s2);
    t2 = VIS_LD_U8_I(tab1, s1);
    t1 = VIS_LD_U8_I(tab0, s1);
    t0 = VIS_LD_U8_I(tab1, s0);
    acc = vis_faligndata(t7, acc);
    acc = vis_faligndata(t6, acc);
    acc = vis_faligndata(t5, acc);
    acc = vis_faligndata(t4, acc);
    acc = vis_faligndata(t3, acc);
    acc = vis_faligndata(t2, acc);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    s0 = s4;
    *dp++ = acc;
  }

  /*
   * Tail: the remaining source values are walked backwards (sp--) so
   * that, with the front-insertion scheme above, their bytes end up in
   * forward order at the start of acc.
   */
  num = ((mlib_u8 *) dend - (mlib_u8 *) dp) >> 1;
  sp += num;
  num++;

#pragma pipeloop(0)
  for (i = 0; i < num; i++) {
    s1 = (mlib_s32) * sp;
    sp--;

    t0 = VIS_LD_U8_I(tab1, s1);
    acc = vis_faligndata(t0, acc);

    t0 = VIS_LD_U8_I(tab0, s1);
    acc = vis_faligndata(t0, acc);
  }

  /* channel 1 of the pending s0 goes in front; store only up to dend */
  t0 = VIS_LD_U8_I(tab1, s0);
  acc = vis_faligndata(t0, acc);
  emask = vis_edge8(dp, dend);
  vis_pst_8(acc, dp, emask);
}
/*
 * Look up one row of 16-bit source samples through three 8-bit tables.
 *
 * Sample k of the row is translated through table0, table1 or table2
 * according to k modulo 3, and the resulting bytes are written to dst.
 *
 *   src     source samples, used directly as table indices
 *   dst     destination bytes; it is written through an mlib_d64 pointer,
 *           so it is presumably 8-byte aligned by the caller - TODO confirm
 *   xsize   number of samples / destination bytes to produce
 *   table0, table1, table2
 *           look-up tables for positions 0, 1 and 2 (mod 3)
 *
 * Result bytes are collected in one 8-byte register: with the GSR align
 * offset set to 7, each vis_faligndata(b, packed) call pushes the byte
 * loaded into b in front of the bytes already held in packed, which is why
 * every group is assembled from its last element down to its first.
 */
void mlib_v_ImageLookUp_S16_U8_3_D1(const mlib_s16 *src,
                                    mlib_u8        *dst,
                                    mlib_s32       xsize,
                                    const mlib_u8  *table0,
                                    const mlib_u8  *table1,
                                    const mlib_u8  *table2)
{
  const mlib_s16 *in = src;            /* read cursor in the source row */
  mlib_d64 *out = (mlib_d64 *) dst;    /* write cursor, 8 bytes per store */
  mlib_u8 *last = dst + xsize - 1;     /* last valid destination byte */
  mlib_s32 v0, v1, v2, v3;             /* source samples (table indices) */
  mlib_s32 v4, v5, v6, v7;             /* source samples (table indices) */
  mlib_d64 b0, b1, b2, b3;             /* looked-up bytes */
  mlib_d64 b4, b5, b6, b7;             /* looked-up bytes */
  mlib_d64 packed;                     /* result bytes being assembled */
  const mlib_u8 *rot;                  /* scratch for the table rotation */
  mlib_s32 emask;                      /* edge mask for the final store */
  mlib_s32 k, left;                    /* loop counter, trailing byte count */

  vis_alignaddr((void *)0, 7);

  if (xsize >= 8) {

    /* preload the first group of eight samples */
    v0 = in[0];
    v1 = in[1];
    v2 = in[2];
    v3 = in[3];
    v4 = in[4];
    v5 = in[5];
    v6 = in[6];
    v7 = in[7];
    in += 8;

    /* each pass converts the preloaded group and fetches the next one */
#pragma pipeloop(0)
    for (k = 0; k <= xsize - 16; k += 8, in += 8) {
      b7 = VIS_LD_U8_I(table1, v7);
      b6 = VIS_LD_U8_I(table0, v6);
      b5 = VIS_LD_U8_I(table2, v5);
      b4 = VIS_LD_U8_I(table1, v4);
      b3 = VIS_LD_U8_I(table0, v3);
      b2 = VIS_LD_U8_I(table2, v2);
      b1 = VIS_LD_U8_I(table1, v1);
      b0 = VIS_LD_U8_I(table0, v0);
      packed = vis_faligndata(b7, packed);
      packed = vis_faligndata(b6, packed);
      packed = vis_faligndata(b5, packed);
      packed = vis_faligndata(b4, packed);
      packed = vis_faligndata(b3, packed);
      packed = vis_faligndata(b2, packed);
      packed = vis_faligndata(b1, packed);
      packed = vis_faligndata(b0, packed);

      /* 8 mod 3 == 2: the next group starts on what is currently table2 */
      rot = table0;
      table0 = table2;
      table2 = table1;
      table1 = rot;

      v0 = in[0];
      v1 = in[1];
      v2 = in[2];
      v3 = in[3];
      v4 = in[4];
      v5 = in[5];
      v6 = in[6];
      v7 = in[7];
      *out++ = packed;
    }

    /* convert the group that is still pending after the loop */
    b7 = VIS_LD_U8_I(table1, v7);
    b6 = VIS_LD_U8_I(table0, v6);
    b5 = VIS_LD_U8_I(table2, v5);
    b4 = VIS_LD_U8_I(table1, v4);
    b3 = VIS_LD_U8_I(table0, v3);
    b2 = VIS_LD_U8_I(table2, v2);
    b1 = VIS_LD_U8_I(table1, v1);
    b0 = VIS_LD_U8_I(table0, v0);
    packed = vis_faligndata(b7, packed);
    packed = vis_faligndata(b6, packed);
    packed = vis_faligndata(b5, packed);
    packed = vis_faligndata(b4, packed);
    packed = vis_faligndata(b3, packed);
    packed = vis_faligndata(b2, packed);
    packed = vis_faligndata(b1, packed);
    packed = vis_faligndata(b0, packed);
    rot = table0;
    table0 = table2;
    table2 = table1;
    table1 = rot;
    *out++ = packed;
  }

  /* nothing left once the write cursor has passed the last byte */
  if ((mlib_addr) out > (mlib_addr) last)
    return;

  /* walk the remaining samples backwards, starting from the last one */
  left = (mlib_addr) last - (mlib_addr) out;
  in += left;
  left++;

  /* peel the samples that do not form a complete table0..table2 triple */
  switch (left % 3) {
    case 2:
      v0 = *in--;
      b0 = VIS_LD_U8_I(table1, v0);
      packed = vis_faligndata(b0, packed);

      v0 = *in--;
      b0 = VIS_LD_U8_I(table0, v0);
      packed = vis_faligndata(b0, packed);
      left -= 2;
      break;

    case 1:
      v0 = *in--;
      b0 = VIS_LD_U8_I(table0, v0);
      packed = vis_faligndata(b0, packed);
      left--;
      break;

    default:
      break;
  }

  /* remaining samples come in whole triples: table2, table1, table0 */
#pragma pipeloop(0)
  for (k = 0; k < left; k += 3) {
    v0 = *in--;
    b0 = VIS_LD_U8_I(table2, v0);
    packed = vis_faligndata(b0, packed);

    v0 = *in--;
    b0 = VIS_LD_U8_I(table1, v0);
    packed = vis_faligndata(b0, packed);

    v0 = *in--;
    b0 = VIS_LD_U8_I(table0, v0);
    packed = vis_faligndata(b0, packed);
  }

  /* partial store: only bytes up to 'last' are written */
  emask = vis_edge8(out, last);
  vis_pst_8(packed, out, emask);
}
Example #23
0
/*
 * Reversed complex conjugate of a vector of n interleaved (re, im) mlib_s8
 * pairs, with saturation.
 *
 * The output is filled from its end (zz + 2 * n) towards its start while the
 * input is read forwards, so the element order is reversed.  The imaginary
 * part is negated; MLIB_S8_MIN, which has no positive counterpart, is
 * replaced by MLIB_S8_MAX.
 *
 *   zz   destination, 2 * n bytes
 *   xx   source, 2 * n bytes
 *   n    number of complex elements
 *
 * Returns MLIB_SUCCESS on the paths visible here.  CHECK and CONJREVC are
 * macros defined outside this block and may return on their own -
 * NOTE(review): confirm their exact behavior.
 */
mlib_status
__mlib_VectorConjRev_S8C_S8C_Sat(
	mlib_s8 *zz,
	const mlib_s8 *xx,
	mlib_s32 n)
{
	const mlib_s8 *x = xx;
	mlib_s8 *z = zz;
/* dst starts one past the end of the output and is always pre-decremented */
	mlib_s8 *src = (mlib_s8 *)x, *dst = z + 2 * (n);
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
/* constants below are not used by name in this body; presumably CONJ8 does */
	mlib_d64 dcntr0 = vis_to_double_dup(0x00800080);
	mlib_d64 dxor0 = vis_to_double_dup(0x007f007f);
	mlib_d64 done = vis_to_double_dup(1);
	mlib_s8 c;
/* length is the number of bytes (not complex elements) still to process */
	mlib_s32 i, rest_64, len_64, even_length, odd = 0, length =
		(mlib_s32)n * 2;
	mlib_s32 re_part;
	mlib_f32 f_null = vis_to_float(0);

/* argument validation macro, defined elsewhere */
	CHECK(x, z);

/* short vectors go through the CONJREVC macro (presumably scalar code) */
	if (n < 8) {
		CONJREVC(mlib_s8,
			MLIB_S8_MAX,
			MLIB_S8_MIN);
	}

/*
 * Scalar prologue: emit elements one byte at a time until dst is 8-byte
 * aligned.  Each element writes the negated imaginary byte first (higher
 * address) and then the real byte.
 */
	while (((mlib_addr)dst) & 7) {

		if ((c = src[1]) == MLIB_S8_MIN)
			*--dst = MLIB_S8_MAX;
		else
			*--dst = -c;
		length -= 2;
		src += 2;

		if (((mlib_addr)dst) & 7) {
			*--dst = src[-2];
		} else {
/*
 * dst became aligned between the imaginary and the real byte: keep the
 * real byte in re_part and remember that the element was split.
 */
			re_part = src[-2];
			odd = 1;
			break;
		}
	}

/* GSR value 7 << 3; presumably the scale factor used by CONJ8 - TODO confirm */
	vis_write_gsr(7 << 3);
	ddst = (mlib_d64 *)dst;
/* bytes left over after the 8-byte loop, and the number of 8-byte words */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;

	if (!odd) {

/*
 * Aligning loop finished with imaginary part. The following processing
 * starts with real part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

			dsrc = (mlib_d64 *)src;

/* CONJ8 is used here as turning d3 into d4 (macro defined elsewhere) */
#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ8;
				*--ddst = d4;
			}
		} else {

/* unaligned source: rebuild each 8-byte word from two aligned loads */
			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ8;
				*--ddst = d4;
			}
		}
	} else {

/*
 * Aligning loop finished with real part. The following processing
 * starts with imaginary part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

/*
 * Every output word is the CONJ8 result shifted by one byte, with the byte
 * carried in d_rest (initially the saved real part) filling the gap.
 */
			dsrc = (mlib_d64 *)vis_alignaddr(src, 1);
			d_rest = vis_to_double((re_part << 24), 0);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ8;
				*--ddst = vis_faligndata(d4, d_rest);
				d_rest = d4;
			}

/* the byte still carried over goes into the last byte of the next lower word */
			ddst--;
			d_rest = vis_faligndata(d_rest, d_rest);
			vis_pst_8(d_rest, ddst, 0x1);
		} else {

/* unaligned source: first produce the words as in the non-odd case ... */
			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ8;
				*--ddst = d4;
			}

/*
 * ... then make a second pass over the words just written, moving them by
 * one byte (GSR is rewritten with 1 for this), and finally store the saved
 * real part.
 */
			vis_write_gsr(1);
			d2 = *ddst;
			d3 = vis_faligndata(d1, d2);
			vis_pst_8(d3, (ddst - 1), 0x1);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = *(ddst + 1);
				(*ddst++) = vis_faligndata(d1, d2);
			}

			dst[-1] = re_part;
		}

/* account for the real byte of the split element in the tail indexing */
		dst--;
	}

	if (!rest_64)
		return (MLIB_SUCCESS);

/* scalar epilogue for the bytes that did not fill a whole 8-byte word */
	for (i = 0; i < rest_64; i += 2) {
		dst[-even_length - 2 - i] = src[even_length + i];

		if ((c = src[even_length + i + 1]) == MLIB_S8_MIN)
			dst[-even_length - 2 - i + 1] = MLIB_S8_MAX;
		else
			dst[-even_length - 2 - i + 1] = -c;
	}

	return (MLIB_SUCCESS);
}
Example #24
0
/*
 * Adds two mlib_u8 images into dst, row by row, 8 bytes at a time.
 *
 * Most of the variables used below (sp1, sp2, dp, sl1, sl2, dl, width,
 * height, channels, stride1, stride2, strided) are not declared in this
 * body; they presumably come from the VALIDATE macro, which is defined
 * elsewhere and may also return early on invalid images - TODO confirm.
 * MLIB_V_ADDIMAGE_U8(a, b, r) is likewise defined elsewhere; it is used here
 * as "r = a + b" on 8 bytes (presumably saturating, using the sd1h/sd2h/...
 * temporaries and the nul/fone constants).
 *
 * Five code paths are selected by the relative 8-byte alignment of the
 * three row pointers and of their strides:
 *   1. dst, src1 and src2 all share the same alignment;
 *   2. dst and src1 share alignment, src2 is realigned with faligndata;
 *   3. dst and src2 share alignment, src1 is realigned with faligndata;
 *   4. src1 and src2 share alignment, the sum is realigned to dst;
 *   5. general case.
 *
 * Returns MLIB_SUCCESS on the path visible here.
 */
mlib_status
mlib_v_ImageAdd_U8(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
	mlib_s32 i, j, k;
	mlib_s32 offdst, offsrc1, offsrc2, emask;
/* number of bytes to process in each row */
	mlib_s32 amount;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
	mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
	mlib_d64 sd1h, sd2h, sd1l, sd2l, rdh, rdl;
	mlib_u8 *dend;
	mlib_f32 nul = vis_to_float(0), fone = vis_to_float(0x100);

	VALIDATE(mlib_u8);

/* initialize GSR scale factor */
	vis_write_gsr(7 << 3);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

	amount = width * channels;

/* offsets of the first row of each image inside an 8-byte word */
	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

/*
 * Path 1: all three images have the same alignment on every row, so
 * aligned words can be combined directly.
 */
	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 7) == 0) &&
	    (((strided ^ stride2) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/* i is <= 0: offset of the aligned word relative to the row start */
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

/* partial first word: only the bytes selected by emask are stored */
			if (emask != 0xff) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				i += 8;
			}
/* full 8-byte words */
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
			}

/* partial last word */
			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

/* advance to the next row */
			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/* Path 2: dst and src1 are aligned alike; src2 goes through faligndata */
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
/* the last vis_alignaddr call sets the alignment used by faligndata below */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

			sd20 = vis_ld_d64_nf(spp2);

			if (emask != 0xff) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				sd20 = sd21;
				spp2++;
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd10 = (*spp1++);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/* Path 3: mirror image of path 2 - dst and src2 are aligned alike */
	} else if ((offdst == offsrc2) && (((strided ^ stride2) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

			sd10 = vis_ld_d64_nf(spp1);

			if (emask != 0xff) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				sd10 = sd11;
				spp1++;
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
				sd10 = sd11;
				spp1++;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd20 = (*spp2++);
				sd10 = vis_faligndata(sd10,
					vis_ld_d64_nf(spp1 + 1));
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * Path 4: the two sources are aligned alike, so aligned source words are
 * added first and the sums (dd0, dd1) are realigned to dst afterwards.
 */
	} else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

/* dd0 always holds the sum of the previous pair of source words */
			sd10 = vis_ld_d64_nf(spp1); spp1++;
			sd20 = vis_ld_d64_nf(spp2); spp2++;
			MLIB_V_ADDIMAGE_U8(sd10, sd20, dd0);

			if (emask != 0xff) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
				(*dpp++) = vis_faligndata(dd0, dd1);
				dd0 = dd1;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else {
/* common case */

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

/*
 * Partial first word: each source needs its own vis_alignaddr call right
 * before its faligndata, because both calls share the one GSR alignment.
 */
			if (emask != 0xff) {
				spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
				sd10 = vis_faligndata(vis_ld_d64_nf(spp1),
					vis_ld_d64_nf(spp1 + 1));
				spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);
				sd20 = vis_faligndata(vis_ld_d64_nf(spp2),
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				i += 8;
			}

/* copy src1 to dst */
/* first pass: dst temporarily holds the realigned src1 words */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
			sd11 = vis_ld_d64_nf(spp1);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (k = i; k <= (amount - 8); k += 8) {
				sd10 = sd11;
				sd11 = vis_ld_d64_nf(spp1 + 1);
				(*tmp_ptr++) = vis_faligndata(sd10, sd11);
				spp1++;
			}

/* realigned src1 word for the partial last store, kept in a register */
			sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1));

/* second pass: switch the alignment to src2 and add it to the copy in dst */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);
			sd20 = vis_ld_d64_nf(spp2);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = (*tmp_ptr++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_U8(sd11, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	}

	return (MLIB_SUCCESS);
}
/*
 * Converts size pixels from three separate planes (y, cb, cr) into an
 * interleaved 3-bytes-per-pixel rgb buffer, four pixels per iteration.
 *
 *   rgb    destination, 3 * size bytes
 *   y, cb, cr
 *          source planes, size bytes each.  They are read through
 *          mlib_f32 pointers, four bytes at a time, so they are presumably
 *          4-byte aligned and readable up to the next multiple of four
 *          bytes - TODO confirm against callers.
 *   size   number of pixels
 *
 * Returns MLIB_FAILURE when size <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
    mlib_u8 *dend;
    mlib_f32 *sf0, *sf1, *sf2, *pfd;
    mlib_f32 fzero = vis_fzeros();
    mlib_s32 i, n, m, emask;
    /* staging area for the last (partial) group of output bytes */
    mlib_d64 tmp_arr64[2];
    /*
     * Packed 16-bit multipliers (k01..k22, k_0) and additive offsets
     * (c_0..c_2) for vis_fmul8x16 / vis_fpadd16.  k0x/k1x/k2x feed the
     * s0x/s1x/s2x accumulators; x1 (cb) uses the kx1 set and x2 (cr) the
     * kx2 set.  NOTE(review): the numeric derivation of these constants is
     * not visible here.
     */
    mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
    mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
    mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
    mlib_d64 k12 = vis_to_double_dup(0xe9260000);
    mlib_d64 k21 = vis_to_double_dup(0x38b40000);
    mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
    mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
    mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
    mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
    mlib_d64 k_0 = vis_to_double_dup(0x20002000);

    if (size <= 0)
        return (MLIB_FAILURE);

    /* GSR: scale field 2 (used by the fpack operations), align offset 2 */
    vis_write_gsr((2 << 3) + 2);
    /* byte selection pattern for the vis_bshuffle calls below */
    vis_write_bmask(0x0489AB37, 0);

    do {
        /* loop on buffer size */

        if (size > 2 * BUFF_SIZE) {
            n = 2 * BUFF_SIZE;
        } else {
            n = size;
        }

        /*
         * Full groups of four pixels handled by the main loop; the final
         * 1..4 pixels of the chunk are always left to the tail code, which
         * writes with edge masks.
         */
        m = (n - 1) >> 2;
        sf0 = (mlib_f32 *)y;
        sf1 = (mlib_f32 *)cb;
        sf2 = (mlib_f32 *)cr;
        dend = rgb + 3 * n - 1;
        pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < m; i++) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_0145;
            mlib_f32 x0, x1, x2;

            /* four samples from each plane */
            x0 = (*sf0++);
            x1 = (*sf1++);
            x2 = (*sf2++);

            /* per-plane products */
            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            /* sum the three contributions and the constant offsets */
            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            /* pack to bytes and rearrange into 12 interleaved output bytes */
            d_0235 = vis_fpack16_pair(s00, s10);
            s20 = vis_freg_pair(vis_fpack16(s20), fzero);

            d_0145 = vis_bshuffle(d_0235, s20);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            /* three 4-byte stores = four RGB pixels */
            pfd[0] = vis_read_hi(d_0145);
            pfd[1] = vis_read_hi(d_0235);
            pfd[2] = vis_read_lo(d_0145);

            pfd += 3;
        }

        /*
         * last pixels
         */

        if ((mlib_u8 *)pfd <= dend) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_xx14, d_0145;
            mlib_f32 x0, x1, x2;
            mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

            /* same computation as in the loop, for one last group */
            x0 = *sf0;
            x1 = *sf1;
            x2 = *sf2;

            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            d_0235 = vis_fpack16_pair(s00, s10);
            d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero);

            d_0145 = vis_bshuffle(d_0235, d_xx14);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            /* mask computed from the real write position, before realigning */
            emask = vis_edge8(pfd, dend);

            /*
             * Partial stores work on 8-byte words: if pfd is only 4-byte
             * aligned, step back to the containing word and shift the
             * staged data by one float so the bytes stay in place.
             */
            if ((mlib_addr)pfd & 7) {
                pfd--;
                tmp_arr32++;
            }

            tmp_arr32[0] = vis_read_hi(d_0145);
            tmp_arr32[1] = vis_read_hi(d_0235);
            tmp_arr32[2] = vis_read_lo(d_0145);

            vis_pst_8(tmp_arr64[0], pfd, emask);

            /* second 8-byte word, only if it still lies inside the output */
            pfd += 2;
            emask = vis_edge8(pfd, dend);

            if ((mlib_u8 *)pfd <= dend)
                vis_pst_8(tmp_arr64[1], pfd, emask);
        }

        /* advance to the next chunk */
        y += n;
        cb += n;
        cr += n;
        rgb += 3 * n;
        size -= n;

    } while (size);

    return (MLIB_SUCCESS);
}
/*
 * Variant of __mlib_VideoDCT8x8Quantize_S16_S16_B12 for block / coeffs
 * pointers that are not both 8-byte aligned ("NA").
 *
 *   coeffs  output, 64 mlib_s16 values
 *   block   input block of mlib_s16 values
 *   qtable  table passed element by element to Quant_ST_NA
 *
 * Returns MLIB_FAILURE if block or coeffs is NULL.  When both pointers are
 * 8-byte aligned the call is forwarded to the aligned routine and its result
 * is returned.  Otherwise returns MLIB_SUCCESS.
 *
 * The transform itself is expressed through macros defined outside this
 * block (LOAD_DATA_GE_INTER1/2, TRANSPOSE, LOADCONSTS4_12,
 * PREPARE_DATA_INTER, COMPUTING_DATA, COMPUTING_DATA_12, ENDSCALE_12,
 * Quant_ST_NA).  They operate on the d.., t.., r.. locals and on
 * FCOS/c17/c26/c35/c_4/w_const by name - NOTE(review): their exact
 * semantics are not visible here.
 */
mlib_status
__mlib_VideoDCT8x8Quantize_S16_S16_B12_NA(
    mlib_s16 coeffs[64],
    const mlib_s16 *block,
    const mlib_d64 qtable[64])
{
    mlib_d64 *sp = (mlib_d64 *)block;
    mlib_d64 *dp = (mlib_d64 *)coeffs;

    mlib_d64 d00, d10, d20, d30, d40, d50, d60, d70;
    mlib_d64 d01, d11, d21, d31, d41, d51, d61, d71;
    mlib_d64 t00, t10, t20, t30, t40, t50, t60, t70, t80, t90;
    mlib_d64 t01, t11, t21, t31, t41, t51, t61, t71, t81, t91;
    mlib_d64 r00, r10, r20, r30, r40, r50, r60, r70;
    mlib_d64 r01, r11, r21, r31, r41, r51, r61, r71;
    mlib_f32 FCOS, c17, c26, c35, c_4;
    mlib_s32 mask;
    mlib_d64 w_const = vis_to_double_dup(0x4000);

    if (block == NULL || coeffs == NULL)
        return (MLIB_FAILURE);

    /* both pointers 8-byte aligned: use the aligned implementation */
    if (!(((mlib_addr)block | (mlib_addr)coeffs) & 7)) {
        return (__mlib_VideoDCT8x8Quantize_S16_S16_B12(coeffs,
                block, qtable));
    }

    /* GSR scale field = 1 */
    vis_write_gsr(1 << 3);
    /*
     * first stage
     */

    LOAD_DATA_GE_INTER1;

    TRANSPOSE(d00, d20, d40, d60, r00, r10, r20, r30);
    TRANSPOSE(d10, d30, d50, d70, r40, r50, r60, r70);
    LOADCONSTS4_12;

    PREPARE_DATA_INTER(0);

    LOAD_DATA_GE_INTER2;
    TRANSPOSE(d01, d21, d41, d61, r01, r11, r21, r31);

    COMPUTING_DATA(0);

    TRANSPOSE(d11, d31, d51, d71, r41, r51, r61, r71);
    PREPARE_DATA_INTER(1);
    COMPUTING_DATA(1);

    /*
     * second stage
     */


    TRANSPOSE(d01, d11, d21, d31, r40, r50, r60, r70);
    TRANSPOSE(d00, d10, d20, d30, r00, r10, r20, r30);
    PREPARE_DATA_INTER(0);
    TRANSPOSE(d40, d50, d60, d70, r01, r11, r21, r31);
    COMPUTING_DATA_12(0);

    TRANSPOSE(d41, d51, d61, d71, r41, r51, r61, r71);
    ENDSCALE_12(0);


    /*
     * Prepare the unaligned store: dp becomes the 8-byte aligned address of
     * coeffs - 1 byte, so dp[1]..dp[15] lie entirely inside the output and
     * dp[0] / dp[16] are the partially covered edge words.  mask has one bit
     * per byte of dp[0] that belongs to coeffs (0 when coeffs itself is
     * aligned).  vis_alignaddrl then sets the alignment that the
     * vis_faligndata calls below use to shift the results into place.
     */
    dp = (mlib_d64 *)vis_alignaddr(coeffs, -1);
    mask = 0xFF >> ((mlib_addr)coeffs - (mlib_addr)dp);
    vis_alignaddrl((void *)coeffs, 0);

    PREPARE_DATA_INTER(1);
    COMPUTING_DATA_12(1);

    ENDSCALE_12(1);

    /* quantize the sixteen result words in place, one qtable entry each */
    Quant_ST_NA(d00, d00, qtable[0]);
    Quant_ST_NA(d01, d01, qtable[1]);
    Quant_ST_NA(d10, d10, qtable[2]);
    Quant_ST_NA(d11, d11, qtable[3]);
    Quant_ST_NA(d20, d20, qtable[4]);
    Quant_ST_NA(d21, d21, qtable[5]);
    Quant_ST_NA(d30, d30, qtable[6]);
    Quant_ST_NA(d31, d31, qtable[7]);
    Quant_ST_NA(d40, d40, qtable[8]);
    Quant_ST_NA(d41, d41, qtable[9]);
    Quant_ST_NA(d50, d50, qtable[10]);
    Quant_ST_NA(d51, d51, qtable[11]);
    Quant_ST_NA(d60, d60, qtable[12]);
    Quant_ST_NA(d61, d61, qtable[13]);
    Quant_ST_NA(d70, d70, qtable[14]);
    Quant_ST_NA(d71, d71, qtable[15]);

    /* interior words: each aligned store combines two neighbouring results */
    dp[1] = vis_faligndata(d00, d01);
    dp[2] = vis_faligndata(d01, d10);
    dp[3] = vis_faligndata(d10, d11);
    dp[4] = vis_faligndata(d11, d20);
    dp[5] = vis_faligndata(d20, d21);
    dp[6] = vis_faligndata(d21, d30);
    dp[7] = vis_faligndata(d30, d31);
    dp[8] = vis_faligndata(d31, d40);
    dp[9] = vis_faligndata(d40, d41);
    dp[10] = vis_faligndata(d41, d50);
    dp[11] = vis_faligndata(d50, d51);
    dp[12] = vis_faligndata(d51, d60);
    dp[13] = vis_faligndata(d60, d61);
    dp[14] = vis_faligndata(d61, d70);
    dp[15] = vis_faligndata(d70, d71);
    /* trailing edge word: only the bytes that belong to coeffs */
    vis_pst_8(vis_faligndata(d71, d71), dp + 16, ~mask);

    /* leading edge word exists only when coeffs is not 8-byte aligned */
    if ((mlib_addr)coeffs & 7)
        vis_pst_8(vis_faligndata(d00, d00), dp, mask);

    return (MLIB_SUCCESS);
}
/*
 * Converts size pixels from three separate planes (y, u, v) into an
 * interleaved 3-bytes-per-pixel rgb buffer, four pixels per iteration.
 *
 *   rgb    destination, 3 * size bytes
 *   y, u, v
 *          source planes, size bytes each.  They are read through mlib_f32
 *          pointers, four bytes at a time; as the "_all_align" suffix
 *          suggests, the caller is presumably expected to pass suitably
 *          aligned pointers - TODO confirm.
 *   size   number of pixels; nothing is written when size <= 0
 *
 * This routine uses vis_fpack16, vis_faligndata and vis_bshuffle but does
 * not program the GSR or the bmask itself, so that state has to be set up
 * by the caller.
 *
 * The input is processed in chunks of at most 2 * BUFF_SIZE pixels.
 */
void
mlib_v_VideoColorYUV2RGB444_all_align(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;

/* staging area for the last (partial) group of output bytes */
	mlib_d64 tmp_arr64[2];

/*
 * Packed 16-bit multipliers (k01..k22, k_0) and additive offsets
 * (c_0..c_2) for vis_fmul8x16 / vis_fpadd16; x1 (u) uses the kx1 set and
 * x2 (v) the kx2 set.
 */
	mlib_d64 k01 = vis_to_double_dup(0x0000f375);
	mlib_d64 k02 = vis_to_double_dup(0x3317e5fa);
	mlib_d64 k11 = vis_to_double_dup(0xf3754097);
	mlib_d64 k12 = vis_to_double_dup(0xe5fa0000);
	mlib_d64 k21 = vis_to_double_dup(0x40970000);
	mlib_d64 k22 = vis_to_double_dup(0x00003317);
	mlib_d64 c_0 = vis_to_double_dup(0xe42010f4);
	mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60);
	mlib_d64 c_2 = vis_to_double_dup(0xdd60e420);
	mlib_d64 k_0 = vis_to_double_dup(0x25432543);

/*
 * Nothing to convert.  Returning here also avoids forming the pointer
 * rgb + 3 * n - 1 below rgb for a non-positive n.
 */
	if (size <= 0)
		return;

	do {
/* loop on buffer size */

		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

/* number of complete four-pixel groups in this chunk */
		m = n >> 2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)u;
		sf2 = (mlib_f32 *)v;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;

/* four samples from each plane */
			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

/* per-plane products */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

/* sum the three contributions and the constant offsets */
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

/*
 * merge buff values to 3-channel array
 */

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

/* three 4-byte stores = four RGB pixels */
			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_23xx);
			pfd[2] = vis_read_lo(d_0145);

			pfd += 3;
		}

/* remaining 1..3 pixels of the chunk, written with edge masks */
		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;

/* the source cursors are reset at the top of the next chunk */
			x0 = *sf0;
			x1 = *sf1;
			x2 = *sf2;

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

/* mask computed from the real write position, before realigning */
			emask = vis_edge8(pfd, dend);

/*
 * Partial stores work on 8-byte words: if pfd is only 4-byte aligned, step
 * back to the containing word and shift the staged data by one float.
 */
			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_23xx);
			tmp_arr32[2] = vis_read_lo(d_0145);

			vis_pst_8(tmp_arr64[0], pfd, emask);

/* second 8-byte word, only if it still lies inside the output */
			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

/* advance to the next chunk */
		y += n;
		u += n;
		v += n;
		rgb += 3 * n;
		size -= n;
	} while (size);
}
Example #28
0
/*
 * Applies the MLIB_V_IMAGESQRSHIFT_U8 operation to every byte of a
 * ysize x xsize region, 8 bytes at a time.
 *
 *   src, slb   source data and source line stride in bytes
 *   dst, dlb   destination data and destination line stride in bytes
 *   xsize      number of bytes per row
 *   ysize      number of rows
 *
 * MLIB_V_IMAGESQRSHIFT_U8(sd, dd) is defined outside this block; going by
 * the function name it presumably squares and shifts each byte, using the
 * sdh/sdl/rdh/rdl temporaries declared here - TODO confirm.  This function
 * takes no shift argument and does not write the GSR scale factor, so any
 * such setup is presumably done by the caller.
 */
void
mlib_v_ImageSqrShift_U8(
    mlib_u8 *src,
    mlib_s32 slb,
    mlib_u8 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize)
{
/* pointer to a line in source image */
	mlib_u8 *sl;

/* 8-byte aligned pointer to source image */
	mlib_d64 *sp;

/* pointer to a line in destination image */
	mlib_u8 *dl;

/* pointer to end of a line in destination image */
	mlib_u8 *dend;

/* 8-byte aligned pointer to destination image */
	mlib_d64 *dp;

/* offset of address alignment in destination */
	mlib_s32 off;

/* edge masks */
	mlib_s32 emask;

/* source data */
	mlib_d64 s0, s1;

/* source data */
	mlib_d64 sd;

/* destination data */
	mlib_d64 dd;

/* temporaries used in macro */
	mlib_d64 sdh, sdl;

/* temporaries used in macro */
	mlib_d64 rdh, rdl;

/* loop variable */
	mlib_s32 i, j, n;

	sl = src;
	dl = dst;

/* row loop */
	for (j = 0; j < ysize; j++) {

/* prepare the destination address */
		dp = (mlib_d64 *)((mlib_addr)dl & (~7));
/* off is <= 0: distance from the row start back to the aligned word */
		off = (mlib_addr)dp - (mlib_addr)dl;
		dend = dl + xsize - 1;

/* prepare the source address */
/* the source is shifted by the same amount so faligndata lines it up with dp */
		sp = (mlib_d64 *)vis_alignaddr(sl, off);

/* generate edge mask for the start point */
		emask = vis_edge8(dl, dend);

/* first 8 pixels */
/* stored through the edge mask, so bytes before dl are left untouched */
		s0 = vis_ld_d64_nf(sp); sp++;
		s1 = vis_ld_d64_nf(sp); sp++;
		sd = vis_faligndata(s0, s1);
		MLIB_V_IMAGESQRSHIFT_U8(sd, dd);
		vis_pst_8(dd, dp++, emask);

/* number of complete 8-byte words left in this row */
		n = ((mlib_u8 *)(dend + 1) - (mlib_u8 *)dp) / 8;

/* 8-pixel column loop */
#pragma pipeloop(0)
		for (i = 0; i < n; i++) {
			s0 = s1;
			s1 = vis_ld_d64_nf(sp); sp++;
			sd = vis_faligndata(s0, s1);
			MLIB_V_IMAGESQRSHIFT_U8(sd, dd);
			(*dp++) = dd;
		}

/* end point handling */

		if ((mlib_addr)dp <= (mlib_addr)dend) {
			emask = vis_edge8(dp, dend);
			s0 = s1;
			s1 = vis_ld_d64_nf(sp); sp++;
			sd = vis_faligndata(s0, s1);
			MLIB_V_IMAGESQRSHIFT_U8(sd, dd);
			vis_pst_8(dd, dp++, emask);
		}

/* advance to the next row */
		sl += slb;
		dl += dlb;
	}
}
Example #29
0
/*
 * Computes a norm of the n-element mlib_s8 vector x and stores it in z[0].
 *
 *   z   receives the result: mlib_sqrt(sum / 256.0)
 *   x   input vector
 *   n   number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The vector is read as aligned 8-byte words.  The first and last words are
 * masked with edge masks so that bytes outside [x, x + n) contribute zeros.
 * NORM_S8 and SUM_S8 are macros defined outside this block; as used here,
 * NORM_S8 consumes dx and leaves partial results in dr5..dr8, and SUM_S8
 * accumulates them into ds1/ds2.  NOTE(review): the division by 256
 * presumably undoes a scaling applied inside NORM_S8 - confirm.
 */
mlib_status
__mlib_VectorNorm_S8_Sat(
	mlib_d64 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *pxend, *px = (mlib_s8 *)x;
	mlib_d64 *dpx, *dpxend;
	mlib_d64 sum = 0.0;
	mlib_d64 dx, dr1, dr2, dr3, dr4, dr5, dr6, dr7, dr8;
	mlib_d64 ds1, ds2;
/* zero-filled scratch words that receive the masked first / last input word */
	mlib_d64 edge[2];
	mlib_d64 fzero = vis_fzero();
	mlib_f32 f4ones = vis_to_float(0x01010101);
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask;

	if (n <= 0)
		return (MLIB_FAILURE);

	edge[0] = edge[1] = 0;
/* aligned words containing the first and the last element */
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	pxend = px + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
/* first word: keep only the bytes from px onwards, the rest stay zero */
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];

/*
 * All words before the last one.  The integer accumulators ds1/ds2 are
 * flushed into the double-precision sum after at most MAX_LOOP words,
 * presumably to keep them from overflowing.
 */
	while ((mlib_addr)dpx < (mlib_addr)dpxend) {
		d_left = dpxend - dpx;

		if (d_left > MAX_LOOP)
			d_left = MAX_LOOP;
		ds1 = ds2 = 0.0;
		for (; d_left > 0; d_left--) {
			NORM_S8;
			SUM_S8;
			dpx++;
/* preload the next word; after the final pass this is the last word */
			dx = dpx[0];
		}

/* add the four 32-bit lanes of ds1/ds2, reinterpreted as integers */
		fsum = vis_read_hi(ds1);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds1);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds2);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds2);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
	}

/* last word: mask off the bytes beyond pxend before processing it */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		NORM_S8;
		ds1 = vis_fpadd32(dr5, dr6);
		ds2 = vis_fpadd32(dr7, dr8);
		fsum = vis_read_hi(ds1);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds1);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds2);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds2);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = mlib_sqrt(sum / 256.0);
	return (MLIB_SUCCESS);
/* MAX_LOOP is presumably #defined just before this function, outside this view */
#undef MAX_LOOP
}
Example #30
0
/*
 * Combines every element of the mlib_u8 vector x with the scalar *c and
 * stores the results in z, 8 bytes at a time.
 *
 *   z   destination vector, n bytes
 *   x   source vector, n bytes
 *   c   pointer to the scalar operand
 *   n   number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.  Note that *c
 * is read before n is checked.
 *
 * The arithmetic is done by the SUBS_U8_SAT macro, defined outside this
 * block.  As used here it consumes dx and produces dr, presumably via dc,
 * dr0 and dr1.  NOTE(review): going by the function name this is a
 * saturating subtraction - confirm the operand order in the macro.
 */
mlib_status
__mlib_VectorSubS_U8_U8_Sat(
	mlib_u8 *z,
	const mlib_u8 *x,
	const mlib_u8 *c,
	mlib_s32 n)
{
/* edge masks */
	mlib_s32 emask;

/* offset of address alignment in destination */
	mlib_s32 off;
	mlib_u8 *pzend;
	mlib_d64 *dpx, *dpz, *dpzend;
	mlib_d64 dx, dx0, dx1, dr0, dr1, dr;
	mlib_u16 cc = *((mlib_u8 *)c);

/* prepare the scaling factors */
/* cc << 4 is placed in both 16-bit halves of the 32-bit pattern */
	mlib_d64 dc = vis_to_double_dup((cc << 4) | (cc << 20));

	if (n <= 0)
		return (MLIB_FAILURE);

/* initialize GSR scale factor */
	vis_write_gsr(3 << 3);

/* last destination byte and the aligned words holding the first / last byte */
	pzend = (mlib_u8 *)z + n - 1;
	dpzend = (mlib_d64 *)((mlib_addr)pzend & (~7));
	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
/* off is <= 0: distance from z back to its aligned word */
	off = (mlib_addr)dpz - (mlib_addr)z;

/*
 * prepare the source address
 */
/* the source is shifted by the same amount so faligndata lines it up with dpz */
	dpx = (mlib_d64 *)vis_alignaddr((void *)x, off);
	dx0 = vis_ld_d64_nf(dpx);
	dpx++;
	dx1 = vis_ld_d64_nf(dpx);
	dpx++;

/*
 * generate edge mask for the start bytes
 */
	emask = vis_edge8(z, pzend);
	dx = vis_faligndata(dx0, dx1);
	SUBS_U8_SAT;
/* store first bytes of result */
	vis_pst_8(dr, dpz, emask);
	dpz++;
	dx0 = dx1;

/* full 8-byte words strictly before the word that holds the last byte */
#pragma pipeloop(0)
	for (; (mlib_addr)dpz < (mlib_addr)dpzend; ) {
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
		dx = vis_faligndata(dx0, dx1);
		SUBS_U8_SAT;
		(*dpz++) = dr;
		dx0 = dx1;
	}

/* final word, if the first store did not already cover the whole vector */
	if ((mlib_addr)dpz <= (mlib_addr)pzend) {
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
		dx = vis_faligndata(dx0, dx1);
		SUBS_U8_SAT;
/* prepare edge mask for the last bytes */
		emask = vis_edge8(dpz, pzend);
/* store last bytes of result */
		vis_pst_8(dr, dpz, emask);
	}

	return (MLIB_SUCCESS);
}