Esempio n. 1
0
void
mlib_v_ImageNot_na(
    mlib_u8 *sa,
    mlib_u8 *da,
    mlib_s32 size)
{
/* end points in dst */
	mlib_u8 *dend;

/* 8-byte aligned start points in dst */
	mlib_d64 *dp;

/* 8-byte aligned start point in src */
	mlib_d64 *sp;

/* 8-byte source data */
	mlib_d64 s0, s1;

/* offset of address in dst */
	mlib_s32 j;

/* edge mask */
	mlib_s32 emask;

/* prepare the destination addresses */
	dp = (mlib_d64 *)((mlib_addr)da & (~7));
	j = (mlib_addr)dp - (mlib_addr)da;
	dend = da + size - 1;

/* prepare the source address */
	sp = (mlib_d64 *)vis_alignaddr(sa, j);
/* generate edge mask for the start point */
	emask = vis_edge8(da, dend);

	s1 = vis_ld_d64_nf(sp);

	if (emask != 0xff) {
		s0 = s1;
		s1 = vis_ld_d64_nf(sp + 1);
		s0 = vis_faligndata(s0, s1);
		vis_pst_8(vis_fnot(s0), dp++, emask);
		sp++;
		j += 8;
	}
#pragma pipeloop(0)
	for (; j <= (size - 8); j += 8) {
		s0 = s1;
		s1 = vis_ld_d64_nf(sp + 1);
		(*dp++) = vis_fnot(vis_faligndata(s0, s1));
		sp++;
	}

	if (j < size) {
		s0 = vis_faligndata(s1, vis_ld_d64_nf(sp + 1));
		emask = vis_edge8(dp, dend);
		vis_pst_8(vis_fnot(s0), dp, emask);
	}
}
Esempio n. 2
0
void
mlib_ImageLineXor8000(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_d64 *dptr;
	mlib_d64 *sptr;
	mlib_d64 s0, s1;
	mlib_d64 mask8000 = vis_to_double_dup(0x80008000);
	mlib_s32 j;
	mlib_s32 emask;

/* prepare the destination addresses */
	dptr = (mlib_d64 *)((mlib_addr)dst & (~7));
	j = (mlib_addr)dptr - (mlib_addr)dst;
	dend = (mlib_u8 *)dst + size - 1;

/* prepare the source address */
	sptr = (mlib_d64 *)VIS_ALIGNADDR(src, j);
/* generate edge mask for the start point */
	emask = vis_edge8(dst, dend);

	s1 = vis_ld_d64_nf(sptr);

	if (emask != 0xff) {
		s0 = s1;
		s1 = vis_ld_d64_nf(sptr + 1);
		s0 = vis_fxor(vis_faligndata(s0, s1), mask8000);
		vis_pst_8(s0, dptr++, emask);
		sptr++;
		j += 8;
	}

#pragma pipeloop(0)
	for (; j <= (size - 16); j += 8) {
		s0 = s1;
		s1 = sptr[1];
		(*dptr++) = vis_fxor(vis_faligndata(s0, s1), mask8000);
		sptr++;
	}

	if (j <= (size - 8)) {
		s0 = s1;
		s1 = vis_ld_d64_nf(sptr + 1);
		(*dptr++) = vis_fxor(vis_faligndata(s0, s1), mask8000);
		sptr++;
		j += 8;
	}

	if (j < size) {
		s0 = vis_fxor(vis_faligndata(s1, vis_ld_d64_nf(sptr + 1)),
		    mask8000);
		emask = vis_edge8(dptr, dend);
		vis_pst_8(s0, dptr, emask);
	}
}
Esempio n. 3
0
mlib_status
__mlib_VideoUpSample420_Nearest(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)src;
	mlib_d64 *dp0 = (mlib_d64 *)dst0;
	mlib_d64 *dp1 = (mlib_d64 *)dst1;
	mlib_u8 *dend0 = dst0 + 2 * n - 1;
	mlib_d64 sa, da;
	mlib_s32 emask, i;

	if (n <= 0)
		return (MLIB_FAILURE);

#pragma pipeloop(0)
	for (i = 0; i <= (n - 8); i += 8) {
		sa = *sp;
		*dp0 = *dp1 = vis_fpmerge(vis_read_hi(sa), vis_read_hi(sa));
		*(dp0 + 1) = *(dp1 + 1) =
			vis_fpmerge(vis_read_lo(sa), vis_read_lo(sa));
		sp++;
		dp0 += 2;
		dp1 += 2;
	}

	if (i < n) {
		sa = vis_ld_d64_nf(sp);
		da = vis_fpmerge(vis_read_hi(sa), vis_read_hi(sa));
		emask = vis_edge8(dp0, dend0);
		vis_pst_8(da, dp0, emask);
		vis_pst_8(da, dp1, emask);
		i += 4;
		dp0++;
		dp1++;

		if (i < n) {
			da = vis_fpmerge(vis_read_lo(sa), vis_read_lo(sa));
			emask = vis_edge8(dp0, dend0);
			vis_pst_8(da, dp0, emask);
			vis_pst_8(da, dp1, emask);
		}
	}

	return (MLIB_SUCCESS);
}
Esempio n. 4
0
mlib_status
mlib_v_conv3x3_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data */
	mlib_d64 d1, d2, d_1, d_2, d21, d22;

/* data */
	mlib_d64 d3, d_3, d23;
	mlib_f32 k1k2, k3k4, k5k6, k7k8, k9k9;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1, sd00;
	mlib_s32 emask, cmask1;
	mlib_s32 rval, gsr_scale, i, j;

	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	cmask = ((cmask & 0xf) << 4) + (cmask & 0xf);
	cmask = (cmask << 8) + (cmask);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

	dw -= 2;
	dw *= 4;
	dh -= 2;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	d_a = adr_dst + dlb + 4;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

		PREPARE_TO_LOAD_LINE(sbuf3, sa2);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(8);

		vis_alignaddr(s1, 4);
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k3k4);
			CONV_AU(d3, k7k8);
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			d23 = vis_faligndata(d3, d_3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k5k6);
			CONV_AL(d23, k7k8);
			CONV_AU(d_1, k3k4);
			CONV_AL(d_2, k5k6);
			CONV_AU(d_3, k9k9);
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

		ddst = dbuf;
/* prepare the destination addresses */
		dp = (mlib_d64 *)((mlib_addr)da & (~7));
		i = (mlib_addr)dp - (mlib_addr)da;
		cmask1 = cmask >> (-i);
		ddst = vis_alignaddr(ddst, i);
/* generate edge mask for the start point */
		emask = vis_edge8(da, dend);
		sd1 = ddst[0];

		if (emask != 0xff) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd0 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd0, dp++, emask & cmask1);
			ddst++;
			i += 8;
		}
#pragma pipeloop(0)
		for (; i <= (dw - 8); i += 8) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd00 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd00, dp++, cmask1);
			ddst++;
		}

		if (i < dw) {
			sd0 = vis_faligndata(sd1, ddst[1]);
			emask = vis_edge8(dp, dend);
			vis_pst_8(sd0, dp, emask & cmask1);
		}

		sa2 = sa2 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
Esempio n. 5
0
static void
mlib_VectorDotProd_U8C_al_x(
	mlib_d64 *z,
	const void *x,
	const void *y,
	mlib_s32 n)
/* The case of even address of vector x */
{
	mlib_u8 *pxend, *px = (mlib_u8 *)x, *py = (mlib_u8 *)y;
	mlib_d64 sum_r = 0.0, sum_i = 0.0;
	mlib_d64 *dpx, *dpy, *dpxend;
	mlib_d64 dx, dy, dy0, dy1;
	mlib_d64 dx_r, dy_r, dy_i;
	mlib_d64 d_iih, d_iil, d_irh, d_irl, d_rih, d_ril, d_rrh, d_rrl;
	mlib_d64 d_ih, d_il, d_rh, d_rl;
	mlib_d64 ds_r, ds_i, ds1_r, ds1_i;
	mlib_d64 lb_mask = vis_to_double_dup(0x00FF00FF);
	mlib_d64 edge[2];
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask, off;
	mlib_d64 done = vis_to_double_dup(0x1000100);

	edge[0] = edge[1] = 0;

	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	off = (mlib_addr)dpx - (mlib_addr)px;
	dpy = vis_alignaddr((void *)py, off);
	pxend = px + n + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];
	dy = vis_ld_d64_nf(dpy);

	if (((((mlib_addr)px) ^ ((mlib_addr)py)) & 7) == 0) {
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
			for (; d_left > 0; d_left--) {
				DPROD_U8C;
				SUM_U8C;
				dx = dpx[1];
				dy = dpy[1];
				dpx++;
				dpy++;
			}

			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}

	} else {
		dy1 = vis_ld_d64_nf(dpy+1);
		dy = vis_faligndata(dy, dy1);
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
			for (; d_left > 0; d_left--) {
				DPROD_U8C;
				SUM_U8C;
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy+2);
				dx = vis_ld_d64_nf(dpx+1);
				dy = vis_faligndata(dy0, dy1);
				dpx++;
				dpy++;
			}

			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	}

	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		DPROD_U8C;
		SUM_U8C_TAIL;
		fsum = vis_read_hi(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = sum_r;
	z[1] = sum_i;
#undef MAX_LOOP
}
Esempio n. 6
0
  src = da[0];
  da[0] = (src & (~mask)) | (sa[0] & mask);
  da++;
  sa++;
  size = size - 8 + offset;
  b_size = size >> 3;                       /* size in bytes */

  /* prepare the destination addresses */
  dp = (mlib_d64 *) ((mlib_addr) da & (~7));
  j = (mlib_addr) dp - (mlib_addr) da;
  dend = da + b_size - 1;

  /* prepare the source address */
  sp = (mlib_d64 *) VIS_ALIGNADDR(sa, j);
  /* generate edge mask for the start point */
  emask = vis_edge8(da, dend);

  s1 = vis_ld_d64_nf(sp);
  if (emask != 0xff) {
    s0 = s1;
    s1 = vis_ld_d64_nf(sp+1);
    s0 = vis_faligndata(s0, s1);
    vis_pst_8(s0, dp++, emask);
    sp++;
    j += 8;
  }

#pragma pipeloop(0)
  for (; j <= (b_size - 8); j += 8) {
    s0 = s1;
    s1 = vis_ld_d64_nf(sp+1);
Esempio n. 7
0
mlib_status
__mlib_VectorNorm_S8_Sat(
	mlib_d64 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *pxend, *px = (mlib_s8 *)x;
	mlib_d64 *dpx, *dpxend;
	mlib_d64 sum = 0.0;
	mlib_d64 dx, dr1, dr2, dr3, dr4, dr5, dr6, dr7, dr8;
	mlib_d64 ds1, ds2;
	mlib_d64 edge[2];
	mlib_d64 fzero = vis_fzero();
	mlib_f32 f4ones = vis_to_float(0x01010101);
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask;

	if (n <= 0)
		return (MLIB_FAILURE);

	edge[0] = edge[1] = 0;
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	pxend = px + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];

	while ((mlib_addr)dpx < (mlib_addr)dpxend) {
		d_left = dpxend - dpx;

		if (d_left > MAX_LOOP)
			d_left = MAX_LOOP;
		ds1 = ds2 = 0.0;
		for (; d_left > 0; d_left--) {
			NORM_S8;
			SUM_S8;
			dpx++;
			dx = dpx[0];
		}

		fsum = vis_read_hi(ds1);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds1);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds2);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds2);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		NORM_S8;
		ds1 = vis_fpadd32(dr5, dr6);
		ds2 = vis_fpadd32(dr7, dr8);
		fsum = vis_read_hi(ds1);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds1);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds2);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds2);
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = mlib_sqrt(sum / 256.0);
	return (MLIB_SUCCESS);
#undef MAX_LOOP
}
Esempio n. 8
0
mlib_status
__mlib_VectorSubS_U8_U8_Sat(
	mlib_u8 *z,
	const mlib_u8 *x,
	const mlib_u8 *c,
	mlib_s32 n)
{
/* edge masks */
	mlib_s32 emask;

/* offset of address alignment in destination */
	mlib_s32 off;
	mlib_u8 *pzend;
	mlib_d64 *dpx, *dpz, *dpzend;
	mlib_d64 dx, dx0, dx1, dr0, dr1, dr;
	mlib_u16 cc = *((mlib_u8 *)c);

/* prepare the scaling factors */
	mlib_d64 dc = vis_to_double_dup((cc << 4) | (cc << 20));

	if (n <= 0)
		return (MLIB_FAILURE);

/* initialize GSR scale factor */
	vis_write_gsr(3 << 3);

	pzend = (mlib_u8 *)z + n - 1;
	dpzend = (mlib_d64 *)((mlib_addr)pzend & (~7));
	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
	off = (mlib_addr)dpz - (mlib_addr)z;

/*
 * prepare the source address
 */
	dpx = (mlib_d64 *)vis_alignaddr((void *)x, off);
	dx0 = vis_ld_d64_nf(dpx);
	dpx++;
	dx1 = vis_ld_d64_nf(dpx);
	dpx++;

/*
 * generate edge mask for the start bytes
 */
	emask = vis_edge8(z, pzend);
	dx = vis_faligndata(dx0, dx1);
	SUBS_U8_SAT;
/* store first bytes of result */
	vis_pst_8(dr, dpz, emask);
	dpz++;
	dx0 = dx1;

#pragma pipeloop(0)
	for (; (mlib_addr)dpz < (mlib_addr)dpzend; ) {
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
		dx = vis_faligndata(dx0, dx1);
		SUBS_U8_SAT;
		(*dpz++) = dr;
		dx0 = dx1;
	}

	if ((mlib_addr)dpz <= (mlib_addr)pzend) {
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
		dx = vis_faligndata(dx0, dx1);
		SUBS_U8_SAT;
/* prepare edge mask for the last bytes */
		emask = vis_edge8(dpz, pzend);
/* store last bytes of result */
		vis_pst_8(dr, dpz, emask);
	}

	return (MLIB_SUCCESS);
}
void
mlib_v_VideoColorYUV2RGB444_all_align(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;
	mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4];
	mlib_d64 tmp_arr64[2];
	mlib_d64 k01 = vis_to_double_dup(0x0000f375);
	mlib_d64 k02 = vis_to_double_dup(0x3317e5fa);
	mlib_d64 k11 = vis_to_double_dup(0xf3754097);
	mlib_d64 k12 = vis_to_double_dup(0xe5fa0000);
	mlib_d64 k21 = vis_to_double_dup(0x40970000);
	mlib_d64 k22 = vis_to_double_dup(0x00003317);
	mlib_d64 c_0 = vis_to_double_dup(0xe42010f4);
	mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60);
	mlib_d64 c_2 = vis_to_double_dup(0xdd60e420);
	mlib_d64 k_0 = vis_to_double_dup(0x25432543);

	do {
/* loop on buffer size */

		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

		m = n >> 2;
		buff2 = pbuff_arr2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)u;
		sf2 = (mlib_f32 *)v;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

/*
 * merge buff values to 3-channel array
 */

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_23xx);
			pfd[2] = vis_read_lo(d_0145);

			buff2 += 2;
			pfd += 3;
		}

		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			emask = vis_edge8(pfd, dend);

			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_23xx);
			tmp_arr32[2] = vis_read_lo(d_0145);

			vis_pst_8(tmp_arr64[0], pfd, emask);

			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

		y += n;
		u += n;
		v += n;
		rgb += 3 * n;
		size -= n;
	} while (size);
}
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
    mlib_u8 *dend;
    mlib_f32 *sf0, *sf1, *sf2, *pfd;
    mlib_f32 fzero = vis_fzeros();
    mlib_s32 i, n, m, emask;
    mlib_d64 tmp_arr64[2];
    mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
    mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
    mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
    mlib_d64 k12 = vis_to_double_dup(0xe9260000);
    mlib_d64 k21 = vis_to_double_dup(0x38b40000);
    mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
    mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
    mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
    mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
    mlib_d64 k_0 = vis_to_double_dup(0x20002000);

    if (size <= 0)
        return (MLIB_FAILURE);

    vis_write_gsr((2 << 3) + 2);
    vis_write_bmask(0x0489AB37, 0);

    do {
        /* loop on buffer size */

        if (size > 2 * BUFF_SIZE) {
            n = 2 * BUFF_SIZE;
        } else {
            n = size;
        }

        m = (n - 1) >> 2;
        sf0 = (mlib_f32 *)y;
        sf1 = (mlib_f32 *)cb;
        sf2 = (mlib_f32 *)cr;
        dend = rgb + 3 * n - 1;
        pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < m; i++) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_0145;
            mlib_f32 x0, x1, x2;

            x0 = (*sf0++);
            x1 = (*sf1++);
            x2 = (*sf2++);

            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            d_0235 = vis_fpack16_pair(s00, s10);
            s20 = vis_freg_pair(vis_fpack16(s20), fzero);

            d_0145 = vis_bshuffle(d_0235, s20);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            pfd[0] = vis_read_hi(d_0145);
            pfd[1] = vis_read_hi(d_0235);
            pfd[2] = vis_read_lo(d_0145);

            pfd += 3;
        }

        /*
         * last pixels
         */

        if ((mlib_u8 *)pfd <= dend) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_xx14, d_0145;
            mlib_f32 x0, x1, x2;
            mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

            x0 = *sf0;
            x1 = *sf1;
            x2 = *sf2;

            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            d_0235 = vis_fpack16_pair(s00, s10);
            d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero);

            d_0145 = vis_bshuffle(d_0235, d_xx14);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            emask = vis_edge8(pfd, dend);

            if ((mlib_addr)pfd & 7) {
                pfd--;
                tmp_arr32++;
            }

            tmp_arr32[0] = vis_read_hi(d_0145);
            tmp_arr32[1] = vis_read_hi(d_0235);
            tmp_arr32[2] = vis_read_lo(d_0145);

            vis_pst_8(tmp_arr64[0], pfd, emask);

            pfd += 2;
            emask = vis_edge8(pfd, dend);

            if ((mlib_u8 *)pfd <= dend)
                vis_pst_8(tmp_arr64[1], pfd, emask);
        }

        y += n;
        cb += n;
        cr += n;
        rgb += 3 * n;
        size -= n;

    } while (size);

    return (MLIB_SUCCESS);
}
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;

	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(2 << 3);

	n = n >> 3;

#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

		py++;
		pcb++;
		pcr++;
	}

	if ((mlib_u8 *)pcb < cbend) {
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

		sd01 = (*sp++);
		sd23 = vis_ld_d64_nf(sp); sp++;
		sd45 = vis_ld_d64_nf(sp); sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);
		cmask = vis_edge8(pcb, cbend - 1);

		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1,
				cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1,
				cmask);
		}
	}
	return (MLIB_SUCCESS);
}
void
mlib_v_ImageLookUp_S16_U8_124_D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 *table0,
    const mlib_u8 *table1,
    const mlib_u8 *table2,
    const mlib_u8 *table3)
{
/* pointer to source data */
	mlib_s16 *sp;

/* source data */
	mlib_s32 s0, s1, s2, s3;

/* source data */
	mlib_s32 s4, s5, s6, s7;

/* pointer to start of destination */
	mlib_u8 *dl;

/* pointer to end of destination */
	mlib_u8 *dend;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2;

/* destination data */
	mlib_d64 t3, t4, t5;

/* destination data */
	mlib_d64 t6, t7, acc0;

/* edge mask */
	mlib_s32 emask;

/* loop variable */
	mlib_s32 i, num;

/* destination data */
	mlib_d64 acc1;

	dl = dst;
	dp = (mlib_d64 *)dl;
	dend = dl + xsize - 1;
	sp = (void *)src;

	vis_alignaddr((void *)0, 7);

	if (xsize >= 8) {

		s0 = sp[0];
		s1 = sp[1];
		s2 = sp[2];
		s3 = sp[3];
		s4 = sp[4];
		s5 = sp[5];
		s6 = sp[6];
		s7 = sp[7];
		sp += 8;

		vis_write_bmask(0x012389ab, 0);

#pragma pipeloop(0)
		for (i = 0; i <= xsize - 16; i += 8, sp += 8) {
			t7 = VIS_LD_U8_I(table3, s7);
			t6 = VIS_LD_U8_I(table2, s6);
			t5 = VIS_LD_U8_I(table1, s5);
			t4 = VIS_LD_U8_I(table0, s4);
			t3 = VIS_LD_U8_I(table3, s3);
			t2 = VIS_LD_U8_I(table2, s2);
			t1 = VIS_LD_U8_I(table1, s1);
			t0 = VIS_LD_U8_I(table0, s0);
			acc1 = vis_faligndata(t7, acc1);
			acc1 = vis_faligndata(t6, acc1);
			acc1 = vis_faligndata(t5, acc1);
			acc1 = vis_faligndata(t4, acc1);
			acc0 = vis_faligndata(t3, acc0);
			acc0 = vis_faligndata(t2, acc0);
			acc0 = vis_faligndata(t1, acc0);
			acc0 = vis_faligndata(t0, acc0);
			s0 = sp[0];
			s1 = sp[1];
			s2 = sp[2];
			s3 = sp[3];
			s4 = sp[4];
			s5 = sp[5];
			s6 = sp[6];
			s7 = sp[7];
			(*dp++) = vis_bshuffle(acc0, acc1);
		}

		t7 = VIS_LD_U8_I(table3, s7);
		t6 = VIS_LD_U8_I(table2, s6);
		t5 = VIS_LD_U8_I(table1, s5);
		t4 = VIS_LD_U8_I(table0, s4);
		t3 = VIS_LD_U8_I(table3, s3);
		t2 = VIS_LD_U8_I(table2, s2);
		t1 = VIS_LD_U8_I(table1, s1);
		t0 = VIS_LD_U8_I(table0, s0);
		acc1 = vis_faligndata(t7, acc1);
		acc1 = vis_faligndata(t6, acc1);
		acc1 = vis_faligndata(t5, acc1);
		acc1 = vis_faligndata(t4, acc1);
		acc0 = vis_faligndata(t3, acc0);
		acc0 = vis_faligndata(t2, acc0);
		acc0 = vis_faligndata(t1, acc0);
		acc0 = vis_faligndata(t0, acc0);
		(*dp++) = vis_bshuffle(acc0, acc1);
	}

	if ((mlib_addr)dp <= (mlib_addr)dend) {

		num = (mlib_addr)dend - (mlib_addr)dp;
		sp += num;
		num++;

		if ((num & 3) == 1) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table0, s0);
			acc0 = vis_faligndata(t0, acc0);
			num--;
		} else if ((num & 3) == 2) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table1, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table0, s0);
			acc0 = vis_faligndata(t0, acc0);
			num -= 2;
		} else if ((num & 3) == 3) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table2, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table1, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table0, s0);
			acc0 = vis_faligndata(t0, acc0);
			num -= 3;
		}

		if (num != 0) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table3, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table2, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table1, s0);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(table0, s0);
			acc0 = vis_faligndata(t0, acc0);
		}

		emask = vis_edge8(dp, dend);
		vis_pst_8(acc0, dp, emask);
	}
}
static mlib_status
mlib_v_VideoColorYUV2ABGR422_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to src address */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* pointers to src address */
	mlib_u8 *sp1, *sl1;

/* pointers to dst address */
	mlib_u8 *dp, *dl, *dend;

/* all. pointer to y */
	mlib_d64 *spy;

/* all. pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;

/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
	mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo;
	mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo;
	mlib_d64 dd, dd0, dd1;

/* loop variable */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_s32 off;
	mlib_f32 *dfu, *dfv;
	mlib_d64 du0, du1, dv0, dv1;
	mlib_s32 off2, off3;
	mlib_s32 inc;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;

		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);
		i = dp - (mlib_u8 *)dpp;
		emask >>= i;
		inc = (emask1 != 0xff);
		emask1 &= emask;
		off = 8 - i;

		vis_alignaddr((void *)off2, 0);
		fu0 = vis_ld_f32_nf(dfu); dfu++;
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

		vis_alignaddr((void *)off3, 0);
		fv0 = vis_ld_f32_nf(dfv); dfv++;
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		vis_alignaddr(sp1, 0);
		dy0 = vis_ld_d64_nf(spy); spy++;
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/*
 * 16-pixel column loop
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu); dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);
			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

			vis_alignaddr((void *)off3, 0);

			fv1 = vis_ld_f32_nf(dfv); dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

			vis_alignaddr((void *)off, 0);
/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_hi),
				vis_read_lo(blue_red_hi));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_lo),
				vis_read_hi(blue_red_lo));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_lo),
				vis_read_lo(blue_red_lo));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			vis_alignaddr(sp1, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;
			emask1 = emask;
		}

		if (i < width) {

			vis_alignaddr((void *)off, 0);
/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);

			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dd0 = dd1;
			dpp += inc;

			i += 2;

			if (i < width) {

				dd1 = vis_fpmerge(vis_read_lo(x_green_hi),
					vis_read_lo(blue_red_hi));
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 2;

				if (i < width) {
					dd1 = vis_fpmerge(vis_read_hi
						(x_green_lo),
						vis_read_hi(blue_red_lo));
					dd = vis_faligndata(dd0, dd1);
					vis_pst_8(dd, dpp++, emask);
					dd0 = dd1;
				}
			}
		}

		vis_alignaddr((void *)off, 0);
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
		emask = 0x7777;
	}

	return (MLIB_SUCCESS);
}
Esempio n. 14
0
mlib_status
__mlib_VectorSub_U8_U8_Mod(
	mlib_u8 *z,
	const mlib_u8 *x,
	const mlib_u8 *y,
	mlib_s32 n)
{
/* 8-byte aligned start point in destination */
	mlib_d64 *dpz;

/* 8-byte aligned start point in source */
	mlib_d64 *dpx, *dpy;

/* source  data */
	mlib_d64 dx, dy, dx0;
	mlib_d64 dx1, dy0, dy1;

/* destination data */
	mlib_d64 dz;

/* intermediate result */
	mlib_d64 dh, dl;
	mlib_d64 dxl, dyl;

/* end point of a line in destination */
	mlib_u8 *pzend;

/* start point of a line in source */
	mlib_u8 *px, *py;

/* offset of address alignment in destination */
	mlib_s32 off;

/* edge mask */
	mlib_s32 emask;
	mlib_u8 *pzend16;
	mlib_s32 sr1, sr2, sr3;
	mlib_s32 x8, x12, y8, y12;
	mlib_s32 mask = 0x7f7f7f7f;
	mlib_u8 *pz;
	mlib_s32 n16;
	mlib_s32 nrest;
	mlib_s32 len = n, i;

/* rest and leng in terms of 8 bytes. */
	mlib_s32 rest_8, even_8;
	mlib_d64 mask_control = vis_to_double_dup(0xff00ff00);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_u8 *)x;
	py = (mlib_u8 *)y;
	pz = (mlib_u8 *)z;

/*
 * prepare the destination address
 */
	pzend = pz + n - 1;

/*
 * check for 64-bit aligned special case
 */

	if ((((mlib_addr)x | (mlib_addr)y | (mlib_addr)z) & 7) == 0) {

/*
 * We can process source and destination vectors by 16 bytes.
 */

		dpx = (mlib_d64 *)x;
		dx = vis_ld_d64_nf(dpx);
		dpy = (mlib_d64 *)y;
		dy = vis_ld_d64_nf(dpy);
		dpz = (mlib_d64 *)z;
		n16 = n & (~0xf);
		pzend16 = pz + n16;
#pragma pipeloop(0)
		while ((mlib_addr)pz < (mlib_addr)pzend16) {
			x8 = *((mlib_s32 *)(px + 8));
			y8 = *((mlib_s32 *)(py + 8));
			sr1 = x8 ^ ~y8;
			sr2 = (x8 | ~mask) - (y8 & mask);
			sr3 = (sr1 & ~mask) ^ sr2;
			*((mlib_s32 *)(pz + 8)) = sr3;
			x12 = *((mlib_s32 *)(px + 12));
			y12 = *((mlib_s32 *)(py + 12));
			sr1 = x12 ^ ~y12;
			sr2 = (x12 | ~mask) - (y12 & mask);
			sr3 = (sr1 & ~mask) ^ sr2;
			*((mlib_s32 *)(pz + 12)) = sr3;
			SUB_S8_MOD;
/* store 8 bytes of result */
			*((mlib_d64 *)pz) = dz;
			dx = vis_ld_d64_nf(px + 16);
			dy = vis_ld_d64_nf(py + 16);
			px += 16;
			py += 16;
			pz += 16;
		}

		dpz = (mlib_d64 *)pzend16;
		nrest = n - n16;

		if (nrest >= 8) {
			SUB_S8_MOD;
			dpz[0] = dz;
			px += 8;
			py += 8;
			dpz++;
			nrest -= 8;
		}

		if (nrest > 0) {
			dx = *((mlib_d64 *)px);
			dy = *((mlib_d64 *)py);
			SUB_S8_MOD;
			emask = vis_edge8(dpz, pzend);
			vis_pst_8(dz, dpz, emask);
		}
	} else {

/*
 * General case.
 */

		dpz = (mlib_d64 *)((mlib_addr)z & (~7));
		off = (mlib_addr)dpz - (mlib_addr)z;
/*
 * generate edge mask for the start point
 */
		emask = vis_edge8(pz, pzend);

/*
 * prepare the source address
 */

		if (off) {
			dpy = (mlib_d64 *)vis_alignaddr(py, off);
			dy0 = vis_ld_d64_nf(dpy);
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
			dpx = (mlib_d64 *)vis_alignaddr(px, off);
			dx0 = vis_ld_d64_nf(dpx);
			dx1 = vis_ld_d64_nf(dpx + 1);
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_MOD;

/*
 * store first bytes of result
 */

			vis_pst_8(dz, dpz, emask);

			px += (8 + off);
			py += (8 + off);
			len -= (8 + off);
			dpz++;

			if (len <= 0)
				return (MLIB_SUCCESS);
		}

		even_8 = len >> 3;
		rest_8 = len & 0x7;

/*
 * Now try to analyze source "x" and "y" addresses.
 */

		if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned. No  vis_alignaddr
 * and  vis_faligndata at all.
 */

			dpx = (mlib_d64 *)px;
			dpy = (mlib_d64 *)py;

			dx = vis_ld_d64_nf(dpx);
			dpx++;
			dy = vis_ld_d64_nf(dpy);
			dpy++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				dx1 = vis_ld_d64_nf(dpx);
				dy1 = vis_ld_d64_nf(dpy);
				SUB_S8_MOD;
				dx = dx1;
				dy = dy1;
/*
 * store 8 bytes of result
 */
				dpz[0] = dz;
				dpx++;
				dpy++;
				dpz++;
			}

			dx1 = dx;
			dy1 = dy;
		} else if ((!((mlib_addr)px & 7))) {

/*
 * First ("x") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "y".
 */

			dpx = (mlib_d64 *)px;
			dpy = vis_alignaddr(py, 0);
			dy0 = vis_ld_d64_nf(dpy);
			dpy++;
			dy1 = vis_ld_d64_nf(dpy);
			dy = vis_faligndata(dy0, dy1);
			dx = vis_ld_d64_nf(dpx);
			dpx++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				SUB_S8_MOD;
				dx = vis_ld_d64_nf(dpx);
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy + 1);
				dy = vis_faligndata(dy0, dy1);
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
				dpx++;
				dpy++;
			}

			dx1 = dx;
			dy1 = dy0;
		} else if ((!((mlib_addr)py & 7))) {

/*
 * Second ("y") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "x".
 */

			dpy = (mlib_d64 *)py;
			dpx = vis_alignaddr(px, 0);
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				dy = (*dpy++);
				dx0 = dx1;
				dx1 = vis_ld_d64_nf(dpx);
				dpx++;
				dx = vis_faligndata(dx0, dx1);
				SUB_S8_MOD;
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
			}

			dy1 = vis_ld_d64_nf(dpy);
			dpy++;
		} else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") address are identically aligned.
 * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop.
 */

			dpx = vis_alignaddr(px, 0);
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dpy = vis_alignaddr(py, 0);
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy);
				dpy++;
				dy = vis_faligndata(dy0, dy1);
				dx0 = dx1;
				dx1 = vis_ld_d64_nf(dpx);
				dpx++;
				dx = vis_faligndata(dx0, dx1);
				SUB_S8_MOD;
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
			}
		} else {

/*
 * Both ("x" and "y") address are arbitrary aligned.
 * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
 */

			dpx = vis_alignaddr(px, 0);
			dx0 = vis_ld_d64_nf(dpx);
			dpx++;
			dx1 = vis_ld_d64_nf(dpx);
			dx = vis_faligndata(dx0, dx1);
			dpy = vis_alignaddr(py, 0);
			dy0 = vis_ld_d64_nf(dpy);
			dpy++;
			dy1 = vis_ld_d64_nf(dpy);
			dy = vis_faligndata(dy0, dy1);

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				SUB_S8_MOD;
				vis_alignaddr(py, 0);
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy + 1);
				dy = vis_faligndata(dy0, dy1);
				vis_alignaddr(px, 0);
				dx0 = dx1;
				dx1 = vis_ld_d64_nf(dpx + 1);
				dx = vis_faligndata(dx0, dx1);
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
				dpy++;
				dpx++;
			}

			dx1 = dx0;
			dy1 = dy0;
		}

		if (!rest_8)
			return (MLIB_SUCCESS);

		vis_alignaddr(px, 0);
		dx0 = dx1;
		dx1 = vis_ld_d64_nf(dpx);
		dx = vis_faligndata(dx0, dx1);
		vis_alignaddr(py, 0);
		dy0 = dy1;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);
		SUB_S8_MOD;

/*
 * prepare edge mask for the last bytes
 */

		emask = vis_edge8((void *)(rest_8), pzend);

/* store last bytes of result */
		vis_pst_8(dz, dpz, ~emask);
	}

	return (MLIB_SUCCESS);
}
Esempio n. 15
0
mlib_status
__mlib_VectorSub_S8_S8_Sat(
	mlib_s8 *z,
	const mlib_s8 *x,
	const mlib_s8 *y,
	mlib_s32 n)
{
	mlib_d64 *dpz, *dpx, *dpy;
	mlib_d64 dx, dy, dz, dx0, dx1, dy0, dy1;
	mlib_d64 dxh, dxl, dyh, dyl, dzh, dzl;
	mlib_d64 dh, dl;
	mlib_s8 *pz = z, *px, *py, *pzend;

/* offset of address alignment in destination */
	mlib_s32 off;
	mlib_s32 len = n, i;

/* rest and leng in terms of 8 bytes. */
	mlib_s32 rest_8, even_8;

/* edge masks */
	mlib_s32 emask;
	mlib_d64 displacement = vis_to_double_dup(0x8000800);
	mlib_d64 restore = vis_to_double_dup(0x80808080);
	mlib_f32 fmul = vis_to_float(0x1000);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s8 *)x;
	py = (mlib_s8 *)y;

/* initialize GSR scale factor */
	vis_write_gsr(3 << 3);

	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
	off = (mlib_addr)dpz - (mlib_addr)z;
	pzend = pz + n - 1;
/*
 * generate edge mask for the start point
 */
	emask = vis_edge8(pz, pzend);

/*
 * prepare the source address
 */

	if (off) {
		dpy = (mlib_d64 *)vis_alignaddr(py, off);
		dy0 = vis_ld_d64_nf(dpy);
		dy1 = vis_ld_d64_nf(dpy + 1);
		dy = vis_faligndata(dy0, dy1);
		dpx = (mlib_d64 *)vis_alignaddr(px, off);
		dx0 = vis_ld_d64_nf(dpx);
		dx1 = vis_ld_d64_nf(dpx + 1);
		dx = vis_faligndata(dx0, dx1);
		SUB_S8_SAT;

/*
 * store first bytes of result
 */

		vis_pst_8(dz, dpz, emask);

		px += (8 + off);
		py += (8 + off);
		len -= (8 + off);
		dpz++;

		if (len <= 0)
			return (MLIB_SUCCESS);
	}

	even_8 = len >> 3;
	rest_8 = len & 0x7;

/*
 * Now try to analyze source "x" and "y" addresses.
 */

	if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned. No  vis_alignaddr
 * and  vis_faligndata at all.
 */

		dpx = (mlib_d64 *)px;
		dpy = (mlib_d64 *)py;

		dx = vis_ld_d64_nf(dpx);
		dpx++;
		dy = vis_ld_d64_nf(dpy);
		dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx);
			dy1 = vis_ld_d64_nf(dpy);
			SUB_S8_SAT;
			dx = dx1;
			dy = dy1;
/*
 * store 8 bytes of result
 */
			dpz[0] = dz;
			dpx++;
			dpy++;
			dpz++;
		}

		dx1 = dx;
		dy1 = dy;
	} else if ((!((mlib_addr)px & 7))) {

/*
 * First ("x") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "y".
 */

		dpx = (mlib_d64 *)px;
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy);
		dpy++;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);
		dx = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			SUB_S8_SAT;
			dx = vis_ld_d64_nf(dpx);
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
			dpx++;
			dpy++;
		}

		dx1 = dx;
		dy1 = dy0;
	} else if ((!((mlib_addr)py & 7))) {

/*
 * Second ("y") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "x".
 */

		dpy = (mlib_d64 *)py;
		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy = (*dpy++);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_SAT;
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
		}

		dy1 = vis_ld_d64_nf(dpy);
		dpy++;
	} else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") address are identically aligned.
 * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop.
 */

		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
		dpy = vis_alignaddr(py, 0);
		dy1 = vis_ld_d64_nf(dpy);
		dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;
			dy = vis_faligndata(dy0, dy1);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_SAT;
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
		}
	} else {

/*
 * Both ("x" and "y") address are arbitrary aligned.
 * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
 */

		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx);
		dpx++;
		dx1 = vis_ld_d64_nf(dpx);
		dx = vis_faligndata(dx0, dx1);
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy);
		dpy++;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);

/* #pragma pipeloop(0) */
		for (i = 0; i < even_8; i++) {
			SUB_S8_SAT;
			vis_alignaddr(py, 0);
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
			vis_alignaddr(px, 0);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx + 1);
			dx = vis_faligndata(dx0, dx1);
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
			dpy++;
			dpx++;
		}

		dx1 = dx0;
		dy1 = dy0;
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

	vis_alignaddr(px, 0);
	dx0 = dx1;
	dx1 = vis_ld_d64_nf(dpx);
	dx = vis_faligndata(dx0, dx1);
	vis_alignaddr(py, 0);
	dy0 = dy1;
	dy1 = vis_ld_d64_nf(dpy);
	dy = vis_faligndata(dy0, dy1);
	SUB_S8_SAT;

/*
 * prepare edge mask for the last bytes
 */

	emask = vis_edge8((void *)(rest_8), pzend);

/* store last bytes of result */
	vis_pst_8(dz, dpz, ~emask);

	return (MLIB_SUCCESS);
}
Esempio n. 16
0
mlib_status
__mlib_VideoUpSample420(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src0,
	const mlib_u8 *src1,
	const mlib_u8 *src2,
	mlib_s32 n)
{
	mlib_u8 *dend0 = dst0 + 2 * n - 1;
	mlib_d64 *dp0 = (mlib_d64 *)dst0;
	mlib_d64 *dp1 = (mlib_d64 *)dst1;
	mlib_d64 *sp0 = (mlib_d64 *)src0;
	mlib_d64 *sp1 = (mlib_d64 *)src1;
	mlib_d64 *sp2 = (mlib_d64 *)src2;
	mlib_d64 d00, d01, d10, d11, d20, d21;
	mlib_d64 thiscolsum0_hi, thiscolsum0_lo, lastcolsum0_hi, lastcolsum0_lo;
	mlib_d64 shiftcolsum0_hi, shiftcolsum0_lo;
	mlib_d64 thiscolsum1_hi, thiscolsum1_lo, lastcolsum1_hi, lastcolsum1_lo;
	mlib_d64 shiftcolsum1_hi, shiftcolsum1_lo;
	mlib_d64 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
	mlib_d64 ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7;
	mlib_d64 data0, data1, data2, data3, tmp0, tmp1;
	mlib_f32 fone = vis_to_float(0x4000000);
	mlib_f32 fthree = vis_to_float(0xC000000);
	mlib_f32 fone1 = vis_to_float(0x40404040);
	mlib_f32 fthree1 = vis_to_float(0xC0C0C0C0);
	mlib_d64 dseven = vis_to_double_dup(0x70007);
	mlib_d64 deight = vis_to_double_dup(0x80008);
	mlib_s32 i, emask;

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr((3 << 3) + 2);

	d00 = vis_ld_d64_nf(sp0);
	d10 = vis_ld_d64_nf(sp1);
	d20 = vis_ld_d64_nf(sp2);
	sp0++;
	sp1++;
	sp2++;
	lastcolsum0_hi = vis_fmul8x16au(vis_read_hi(d00), fone);
	lastcolsum0_lo = vis_fmul8x16au(vis_read_lo(d00), fone);
	lastcolsum1_hi = vis_fmul8x16au(vis_read_hi(d20), fone);
	lastcolsum1_lo = vis_fmul8x16au(vis_read_lo(d20), fone);
	tmp0 = vis_fmul8x16au(vis_read_hi(d10), fthree);
	tmp1 = vis_fmul8x16au(vis_read_lo(d10), fthree);
	lastcolsum0_hi = vis_fpadd16(lastcolsum0_hi, tmp0);
	lastcolsum0_lo = vis_fpadd16(lastcolsum0_lo, tmp1);
	lastcolsum1_hi = vis_fpadd16(lastcolsum1_hi, tmp0);
	lastcolsum1_lo = vis_fpadd16(lastcolsum1_lo, tmp1);

#pragma pipeloop(0)
	for (i = 0; i < n - 8; i += 8) {
		d01 = *sp0;
		d11 = *sp1;
		d21 = *sp2;
		sp0++;
		sp1++;
		sp2++;

		thiscolsum0_hi = vis_fmul8x16au(vis_read_hi(d01), fone);
		thiscolsum0_lo = vis_fmul8x16au(vis_read_lo(d01), fone);
		thiscolsum1_hi = vis_fmul8x16au(vis_read_hi(d21), fone);
		thiscolsum1_lo = vis_fmul8x16au(vis_read_lo(d21), fone);

		tmp0 = vis_fmul8x16au(vis_read_hi(d11), fthree);
		tmp1 = vis_fmul8x16au(vis_read_lo(d11), fthree);

		thiscolsum0_hi = vis_fpadd16(thiscolsum0_hi, tmp0);
		thiscolsum0_lo = vis_fpadd16(thiscolsum0_lo, tmp1);
		thiscolsum1_hi = vis_fpadd16(thiscolsum1_hi, tmp0);
		thiscolsum1_lo = vis_fpadd16(thiscolsum1_lo, tmp1);

		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, thiscolsum0_hi);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, thiscolsum1_hi);

		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

		dp0[0] = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		dp0[1] = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0));
		dp1[0] = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));
		dp1[1] = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2));

		dp0 += 2;
		dp1 += 2;
		lastcolsum0_hi = thiscolsum0_hi;
		lastcolsum0_lo = thiscolsum0_lo;
		lastcolsum1_hi = thiscolsum1_hi;
		lastcolsum1_lo = thiscolsum1_lo;
	}

	if (i < n) {

		acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
		acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
		acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
		acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
		acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
		acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
		acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
		acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);

		shiftcolsum0_hi =
			vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
		shiftcolsum0_lo =
			vis_faligndata(lastcolsum0_lo, lastcolsum0_lo);
		shiftcolsum1_hi =
			vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
		shiftcolsum1_lo =
			vis_faligndata(lastcolsum1_lo, lastcolsum1_lo);

		acc0 = vis_fpadd16(acc0, deight);
		acc1 = vis_fpadd16(acc1, deight);
		acc2 = vis_fpadd16(acc2, dseven);
		acc3 = vis_fpadd16(acc3, dseven);
		acc4 = vis_fpadd16(acc4, deight);
		acc5 = vis_fpadd16(acc5, deight);
		acc6 = vis_fpadd16(acc6, dseven);
		acc7 = vis_fpadd16(acc7, dseven);

		ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
		ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
		ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
		ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
		ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
		ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
		ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
		ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);

		acc0 = vis_fpadd16(acc0, ac0);
		acc1 = vis_fpadd16(acc1, ac1);
		acc2 = vis_fpadd16(acc2, ac2);
		acc3 = vis_fpadd16(acc3, ac3);
		acc4 = vis_fpadd16(acc4, ac4);
		acc5 = vis_fpadd16(acc5, ac5);
		acc6 = vis_fpadd16(acc6, ac6);
		acc7 = vis_fpadd16(acc7, ac7);

		data0 = vis_fpack16_pair(acc0, acc1);
		data1 = vis_fpack16_pair(acc2, acc3);
		data2 = vis_fpack16_pair(acc4, acc5);
		data3 = vis_fpack16_pair(acc6, acc7);

		acc0 = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
		acc1 = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));

		emask = vis_edge8(dp0, dend0);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
		i += 4;
		dp0++;
		dp1++;

		if (i < n) {
			acc0 = vis_fpmerge(vis_read_lo(data1),
				vis_read_lo(data0));
			acc1 = vis_fpmerge(vis_read_lo(data3),
				vis_read_lo(data2));

			emask = vis_edge8(dp0, dend0);
			vis_pst_8(acc0, dp0, emask);
			vis_pst_8(acc1, dp1, emask);
		}
	}

	vis_write_gsr(7);

	dp0 = (mlib_d64 *)dst0;
	dp1 = (mlib_d64 *)dst1;

	ac0 = *dp0;
	ac2 = *dp1;

#pragma pipeloop(0)
	for (i = 0; i < 2 * n - 8; i += 8) {
		ac1 = *dp0;
		ac3 = *dp1;
		*dp0 = vis_faligndata(ac0, ac1);
		*dp1 = vis_faligndata(ac2, ac3);
		dp0++;
		dp1++;
		ac0 = ac1;
		ac2 = ac3;
	}

	if (i < 2 * n) {
		ac1 = vis_ld_d64_nf(dp0);
		ac3 = vis_ld_d64_nf(dp1);
		emask = vis_edge8(dp0, dend0);
		acc0 = vis_faligndata(ac0, ac1);
		acc1 = vis_faligndata(ac2, ac3);
		vis_pst_8(acc0, dp0, emask);
		vis_pst_8(acc1, dp1, emask);
	}

	dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4;
	dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4;
	dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4;
	dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4;

	return (MLIB_SUCCESS);
}
Esempio n. 17
0
mlib_status
__mlib_VectorSubS_U8_U8_Mod(
	mlib_u8 *z,
	const mlib_u8 *x,
	const mlib_u8 *c,
	mlib_s32 n)
{
/* edge masks */
	mlib_s32 emask;

/* offset of address alignment in destination */
	mlib_s32 off;
	mlib_s8 *pzend;
	mlib_d64 *dpx, *dpz, *dpzend;
	mlib_d64 dx, dx0, dx1, dr0, dr1, dr;
	mlib_u8 uc = *((mlib_s8 *)c);
	mlib_d64 uncontrol_mask = vis_to_double_dup(0xff00ff00);

/* prepare the scaling factors */
	mlib_d64 dcl = vis_to_double_dup(uc | (uc << 16));
	mlib_d64 dch = vis_to_double_dup((uc << 8) | (uc << 24));
	mlib_s32 scal = uc << 24 | uc << 16 | uc << 8 | uc;
	mlib_s32 sr1, sr2, sr3, sr3_;
	mlib_s32 mask = 0x7f7f7f7f;
	mlib_s32 x8, x12;
	mlib_s32 nrest, i;

	if (n <= 0)
		return (MLIB_FAILURE);

	pzend = (mlib_s8 *)z + n - 1;
	dpzend = (mlib_d64 *)((mlib_addr)pzend & (~7));

/*
 * check for 64-bit aligned special case
 */

	if ((((mlib_addr)x | (mlib_addr)z) & 7) == 0) {

/*
 * We can process source and destination vectors by 16 bytes.
 */

		dpx = (mlib_d64 *)x;
		dpz = (mlib_d64 *)z;
#pragma pipeloop(0)
		for (i = 0; i < n >> 4; i++) {
			mlib_u64 ld0;

			dx = dpx[0];
			SUBS_S8_MOD;
			(*dpz++) = dr;

			ld0 = *((mlib_u64 *)dpx + 1);
			x8 = ld0 >> 32;
			sr1 = x8 ^ ~scal;
			sr2 = (scal | ~mask) - (x8 & mask);
			sr3 = (sr1 & ~mask) ^ sr2;
			x12 = ld0 & 0xFFFFFFFF;
			sr1 = x12 ^ ~scal;
			sr2 = (scal | ~mask) - (x12 & mask);
			sr3_ = (sr1 & ~mask) ^ sr2;
			(*dpz++) = vis_to_double(sr3, sr3_);
			dpx += 2;
		}

		nrest = n & 0xf;

		if (nrest >= 8) {
			dx = (*dpx++);
			SUBS_S8_MOD;
			(*dpz++) = dr;
			nrest -= 8;
		}

		if (nrest > 0) {
			dx = (*dpx++);
			SUBS_S8_MOD;
			emask = vis_edge8(dpz, pzend);
			vis_pst_8(dr, dpz, emask);
		}

	} else {
void mlib_v_ImageLookUpSI_U16_U8_2_D1(const mlib_u16 *src,
                                      mlib_u8        *dst,
                                      mlib_s32       xsize,
                                      const mlib_u8  **table)
{
  mlib_u16 *sp;                        /* pointer to source data */
  mlib_s32 s0, s1, s2, s3, s4;         /* source data */
  mlib_u8 *dl;                         /* pointer to start of destination */
  mlib_u8 *dend;                       /* pointer to end of destination */
  mlib_d64 *dp;                        /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;                 /* destination data */
  mlib_d64 t3, t4, t5;                 /* destination data */
  mlib_d64 t6, t7, acc;                /* destination data */
  mlib_s32 emask;                      /* edge mask */
  mlib_s32 i, num;                     /* loop variable */
  const mlib_u8 *tab0 = &table[0][0];
  const mlib_u8 *tab1 = &table[1][0];

  sp = (void *)src;
  dl = dst;

  dend = dl + 2 * xsize - 1;

  vis_alignaddr((void *)0, 7);

  s0 = *sp++;
  *dl++ = tab0[s0];
  dp = (mlib_d64 *) dl;
  xsize--;

  if (xsize >= 4) {

    s1 = sp[0];
    s2 = sp[1];
    s3 = sp[2];
    s4 = sp[3];
    sp += 4;

#pragma pipeloop(0)
    for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
      t7 = VIS_LD_U8_I(tab0, s4);
      t6 = VIS_LD_U8_I(tab1, s3);
      t5 = VIS_LD_U8_I(tab0, s3);
      t4 = VIS_LD_U8_I(tab1, s2);
      t3 = VIS_LD_U8_I(tab0, s2);
      t2 = VIS_LD_U8_I(tab1, s1);
      t1 = VIS_LD_U8_I(tab0, s1);
      t0 = VIS_LD_U8_I(tab1, s0);
      acc = vis_faligndata(t7, acc);
      acc = vis_faligndata(t6, acc);
      acc = vis_faligndata(t5, acc);
      acc = vis_faligndata(t4, acc);
      acc = vis_faligndata(t3, acc);
      acc = vis_faligndata(t2, acc);
      acc = vis_faligndata(t1, acc);
      acc = vis_faligndata(t0, acc);
      s0 = s4;
      s1 = sp[0];
      s2 = sp[1];
      s3 = sp[2];
      s4 = sp[3];
      *dp++ = acc;
    }

    t7 = VIS_LD_U8_I(tab0, s4);
    t6 = VIS_LD_U8_I(tab1, s3);
    t5 = VIS_LD_U8_I(tab0, s3);
    t4 = VIS_LD_U8_I(tab1, s2);
    t3 = VIS_LD_U8_I(tab0, s2);
    t2 = VIS_LD_U8_I(tab1, s1);
    t1 = VIS_LD_U8_I(tab0, s1);
    t0 = VIS_LD_U8_I(tab1, s0);
    acc = vis_faligndata(t7, acc);
    acc = vis_faligndata(t6, acc);
    acc = vis_faligndata(t5, acc);
    acc = vis_faligndata(t4, acc);
    acc = vis_faligndata(t3, acc);
    acc = vis_faligndata(t2, acc);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    s0 = s4;
    *dp++ = acc;
  }

  num = ((mlib_u8 *) dend - (mlib_u8 *) dp) >> 1;
  sp += num;
  num++;

#pragma pipeloop(0)
  for (i = 0; i < num; i++) {
    s1 = (mlib_s32) * sp;
    sp--;

    t0 = VIS_LD_U8_I(tab1, s1);
    acc = vis_faligndata(t0, acc);

    t0 = VIS_LD_U8_I(tab0, s1);
    acc = vis_faligndata(t0, acc);
  }

  t0 = VIS_LD_U8_I(tab1, s0);
  acc = vis_faligndata(t0, acc);
  emask = vis_edge8(dp, dend);
  vis_pst_8(acc, dp, emask);
}
Esempio n. 19
0
/* The case of even address of vector x */
static void
mlib_VectorDotProd_U8C_al_x(
	mlib_d64 *z,
	const void *x,
	const void *y,
	mlib_s32 n)
{
	mlib_u8 *pxend, *px = (mlib_u8 *)x, *py = (mlib_u8 *)y;
	mlib_d64 sum_r = 0.0, sum_i = 0.0;
	mlib_d64 *dpx, *dpy, *dpxend;
	mlib_d64 dx, dy, dy0, dy1;
	mlib_d64 dx_r, dy_r, dy_i;
	mlib_d64 d_iih, d_iil, d_irh, d_irl, d_rih, d_ril, d_rrh, d_rrl;
	mlib_d64 d_ih, d_il, d_rh, d_rl;
	mlib_d64 ds_r, ds_i, ds1_r, ds1_i;
	mlib_d64 lb_mask = vis_to_double_dup(0x00FF00FF);
	mlib_d64 edge[2], fzero = vis_fzero();
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask, off;

	edge[0] = edge[1] = 0;

	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	off = (mlib_addr)dpx - (mlib_addr)px;
	dpy = vis_alignaddr((void *)py, off);
	pxend = px + n + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];
	dy = vis_ld_d64_nf(dpy);

	if (((((mlib_addr)px) ^ ((mlib_addr)py)) & 7) == 0) {
		vis_write_bmask(0x781A3C5E, 0);
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
#pragma pipeloop(0)
			for (; d_left > 0; d_left--) {
				DPROD_U8C0;
				SUM_U8C;
				dx = dpx[1];
				dy = dpy[1];
				dpx++;
				dpy++;
			}

			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	} else {
		mlib_s32 mask = ((mlib_addr)(py + off)) & 7;

		vis_write_bmask(0x11111111 * mask, 0x01234567);
		dy1 = vis_ld_d64_nf(dpy+1);
		dy = vis_bshuffle(dy, dy1);
		SET_ALIGN_U8C;
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
#pragma pipeloop(0)
			for (; d_left > 0; d_left--) {
				DPROD_U8C;
				SUM_U8C;
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy+2);
				dx = vis_ld_d64_nf(dpx+1);
				dy = vis_bshuffle(dy0, dy1);
				dpx++;
				dpy++;
			}

			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	}

	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		SET_ALIGN_U8C;
		DPROD_U8C;
		SUM_U8C_TAIL;
		fsum = vis_read_hi(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = sum_r;
	z[1] = sum_i;
#undef MAX_LOOP
}
void mlib_v_ImageLookUp_S16_U8_3_D1(const mlib_s16 *src,
                                    mlib_u8        *dst,
                                    mlib_s32       xsize,
                                    const mlib_u8  *table0,
                                    const mlib_u8  *table1,
                                    const mlib_u8  *table2)
{
  mlib_s16 *sp;                        /* pointer to source data */
  mlib_s32 s0, s1, s2, s3;             /* source data */
  mlib_s32 s4, s5, s6, s7;             /* source data */
  mlib_u8 *dl;                         /* pointer to start of destination */
  mlib_u8 *dend;                       /* pointer to end of destination */
  mlib_d64 *dp;                        /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;                 /* destination data */
  mlib_d64 t3, t4, t5;                 /* destination data */
  mlib_d64 t6, t7, acc;                /* destination data */
  mlib_s32 emask;                      /* edge mask */
  mlib_s32 i, num;                     /* loop variable */
  const mlib_u8 *table;

  dl = dst;
  sp = (void *)src;
  dp = (mlib_d64 *) dl;
  dend = dl + xsize - 1;

  vis_alignaddr((void *)0, 7);

  if (xsize >= 8) {

    s0 = sp[0];
    s1 = sp[1];
    s2 = sp[2];
    s3 = sp[3];
    s4 = sp[4];
    s5 = sp[5];
    s6 = sp[6];
    s7 = sp[7];
    sp += 8;

#pragma pipeloop(0)
    for (i = 0; i <= xsize - 16; i += 8, sp += 8) {
      t7 = VIS_LD_U8_I(table1, s7);
      t6 = VIS_LD_U8_I(table0, s6);
      t5 = VIS_LD_U8_I(table2, s5);
      t4 = VIS_LD_U8_I(table1, s4);
      t3 = VIS_LD_U8_I(table0, s3);
      t2 = VIS_LD_U8_I(table2, s2);
      t1 = VIS_LD_U8_I(table1, s1);
      t0 = VIS_LD_U8_I(table0, s0);
      acc = vis_faligndata(t7, acc);
      acc = vis_faligndata(t6, acc);
      acc = vis_faligndata(t5, acc);
      acc = vis_faligndata(t4, acc);
      acc = vis_faligndata(t3, acc);
      acc = vis_faligndata(t2, acc);
      acc = vis_faligndata(t1, acc);
      acc = vis_faligndata(t0, acc);
      table = table0;
      table0 = table2;
      table2 = table1;
      table1 = table;
      s0 = sp[0];
      s1 = sp[1];
      s2 = sp[2];
      s3 = sp[3];
      s4 = sp[4];
      s5 = sp[5];
      s6 = sp[6];
      s7 = sp[7];
      *dp++ = acc;
    }

    t7 = VIS_LD_U8_I(table1, s7);
    t6 = VIS_LD_U8_I(table0, s6);
    t5 = VIS_LD_U8_I(table2, s5);
    t4 = VIS_LD_U8_I(table1, s4);
    t3 = VIS_LD_U8_I(table0, s3);
    t2 = VIS_LD_U8_I(table2, s2);
    t1 = VIS_LD_U8_I(table1, s1);
    t0 = VIS_LD_U8_I(table0, s0);
    acc = vis_faligndata(t7, acc);
    acc = vis_faligndata(t6, acc);
    acc = vis_faligndata(t5, acc);
    acc = vis_faligndata(t4, acc);
    acc = vis_faligndata(t3, acc);
    acc = vis_faligndata(t2, acc);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    table = table0;
    table0 = table2;
    table2 = table1;
    table1 = table;
    *dp++ = acc;
  }

  if ((mlib_addr) dp <= (mlib_addr) dend) {

    num = (mlib_addr) dend - (mlib_addr) dp;
    sp += num;
    num++;
    i = num - 3 * (num / 3);

    if (i == 2) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table1, s0);
      acc = vis_faligndata(t0, acc);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table0, s0);
      acc = vis_faligndata(t0, acc);
      num -= 2;
    }
    else if (i == 1) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table0, s0);
      acc = vis_faligndata(t0, acc);
      num--;
    }

#pragma pipeloop(0)
    for (i = 0; i < num; i += 3) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table2, s0);
      acc = vis_faligndata(t0, acc);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table1, s0);
      acc = vis_faligndata(t0, acc);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table0, s0);
      acc = vis_faligndata(t0, acc);
    }

    emask = vis_edge8(dp, dend);
    vis_pst_8(acc, dp, emask);
  }
}
Esempio n. 21
0
mlib_status
mlib_v_ImageAdd_U8(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
	mlib_s32 i, j, k;
	mlib_s32 offdst, offsrc1, offsrc2, emask;
	mlib_s32 amount;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
	mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
	mlib_d64 sd1h, sd2h, sd1l, sd2l, rdh, rdl;
	mlib_u8 *dend;
	mlib_f32 nul = vis_to_float(0), fone = vis_to_float(0x100);

	VALIDATE(mlib_u8);

/* initialize GSR scale factor */
	vis_write_gsr(7 << 3);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

	amount = width * channels;

	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 7) == 0) &&
	    (((strided ^ stride2) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

			if (emask != 0xff) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

			sd20 = vis_ld_d64_nf(spp2);

			if (emask != 0xff) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				sd20 = sd21;
				spp2++;
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd10 = (*spp1++);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offdst == offsrc2) && (((strided ^ stride2) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

			sd10 = vis_ld_d64_nf(spp1);

			if (emask != 0xff) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				sd10 = sd11;
				spp1++;
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
				sd10 = sd11;
				spp1++;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd20 = (*spp2++);
				sd10 = vis_faligndata(sd10,
					vis_ld_d64_nf(spp1 + 1));
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the source addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the destination addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

			sd10 = vis_ld_d64_nf(spp1); spp1++;
			sd20 = vis_ld_d64_nf(spp2); spp2++;
			MLIB_V_ADDIMAGE_U8(sd10, sd20, dd0);

			if (emask != 0xff) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
				(*dpp++) = vis_faligndata(dd0, dd1);
				dd0 = dd1;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else {
/* common case */

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

			if (emask != 0xff) {
				spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
				sd10 = vis_faligndata(vis_ld_d64_nf(spp1),
					vis_ld_d64_nf(spp1 + 1));
				spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);
				sd20 = vis_faligndata(vis_ld_d64_nf(spp2),
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				i += 8;
			}

/* copy src1 to dst */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
			sd11 = vis_ld_d64_nf(spp1);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (k = i; k <= (amount - 8); k += 8) {
				sd10 = sd11;
				sd11 = vis_ld_d64_nf(spp1 + 1);
				(*tmp_ptr++) = vis_faligndata(sd10, sd11);
				spp1++;
			}

			sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1));

			spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);
			sd20 = vis_ld_d64_nf(spp2);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = (*tmp_ptr++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_U8(sd11, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	}

	return (MLIB_SUCCESS);
}
Esempio n. 22
0
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *sp0 = (mlib_d64 *)src;
	mlib_d64 *pd = (mlib_d64 *)dst;
	mlib_d64 d0;
	mlib_d64 tmp, data0, data1;
	mlib_d64 acc0_hi, acc0_lo;
	mlib_d64 round = vis_to_double_dup(0x1);
	mlib_f32 fone = vis_to_float(0x1000000);
	mlib_s32 i, edge;

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(6 << 3);
	vis_write_bmask(0x02461357, 0);

#pragma pipeloop(0)
	for (i = 0; i <= n - 16; i += 16) {
		d0 = (*sp0++);
		tmp = vis_bshuffle(d0, d0);

		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);

		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data0 = vis_fpadd16(acc0_hi, round);

		d0 = (*sp0++);
		tmp = vis_bshuffle(d0, d0);
		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);

		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data1 = vis_fpadd16(acc0_hi, round);

		(*pd++) = vis_fpack16_pair(data0, data1);
	}

	if (i < n) {
		d0 = (*sp0++);
		tmp = vis_bshuffle(d0, d0);

		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);

		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data0 = vis_fpadd16(acc0_hi, round);

		d0 = vis_ld_d64_nf(sp0);
		tmp = vis_bshuffle(d0, d0);
		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);

		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data1 = vis_fpadd16(acc0_hi, round);

		edge = vis_edge8(pd, (dst + (n / 2) - 1));
		vis_pst_8(vis_fpack16_pair(data0, data1), pd, edge);
	}
	return (MLIB_SUCCESS);
}
Esempio n. 23
0
void
mlib_v_ImageSqrShift_U8(
    mlib_u8 *src,
    mlib_s32 slb,
    mlib_u8 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize)
{
/* pointer to a line in source image */
	mlib_u8 *sl;

/* 8-byte aligned pointer to source image */
	mlib_d64 *sp;

/* pointer to a line in destination image */
	mlib_u8 *dl;

/* pointer to end of a line in destination image */
	mlib_u8 *dend;

/* 8-byte aligned pointer to destination image */
	mlib_d64 *dp;

/* offset of address alignment in destination */
	mlib_s32 off;

/* edge masks */
	mlib_s32 emask;

/* source data */
	mlib_d64 s0, s1;

/* source data */
	mlib_d64 sd;

/* destination data */
	mlib_d64 dd;

/* temporaries used in macro */
	mlib_d64 sdh, sdl;

/* temporaries used in macro */
	mlib_d64 rdh, rdl;

/* loop variable */
	mlib_s32 i, j, n;

	sl = src;
	dl = dst;

/* row loop */
	for (j = 0; j < ysize; j++) {

/* prepare the destination address */
		dp = (mlib_d64 *)((mlib_addr)dl & (~7));
		off = (mlib_addr)dp - (mlib_addr)dl;
		dend = dl + xsize - 1;

/* prepare the source address */
		sp = (mlib_d64 *)vis_alignaddr(sl, off);

/* generate edge mask for the start point */
		emask = vis_edge8(dl, dend);

/* first 8 pixels */
		s0 = vis_ld_d64_nf(sp); sp++;
		s1 = vis_ld_d64_nf(sp); sp++;
		sd = vis_faligndata(s0, s1);
		MLIB_V_IMAGESQRSHIFT_U8(sd, dd);
		vis_pst_8(dd, dp++, emask);

		n = ((mlib_u8 *)(dend + 1) - (mlib_u8 *)dp) / 8;

/* 8-pixel column loop */
#pragma pipeloop(0)
		for (i = 0; i < n; i++) {
			s0 = s1;
			s1 = vis_ld_d64_nf(sp); sp++;
			sd = vis_faligndata(s0, s1);
			MLIB_V_IMAGESQRSHIFT_U8(sd, dd);
			(*dp++) = dd;
		}

/* end point handling */

		if ((mlib_addr)dp <= (mlib_addr)dend) {
			emask = vis_edge8(dp, dend);
			s0 = s1;
			s1 = vis_ld_d64_nf(sp); sp++;
			sd = vis_faligndata(s0, s1);
			MLIB_V_IMAGESQRSHIFT_U8(sd, dd);
			vis_pst_8(dd, dp++, emask);
		}

		sl += slb;
		dl += dlb;
	}
}
void
mlib_v_ImageColorRGB2Mono_U8_D1(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 dsize,
    const mlib_d64 *weight)
{
    mlib_u8 *dst_end;
    mlib_d64 dd, d0, d1, d2, d3;
    mlib_d64 rgdd0, bdd0, rgdd1, bdd1, ddt;
    mlib_d64 *src_all, *dp;
    mlib_f32 d32, e32, alpha, gamma, beta;
    mlib_d64 sd0, sd1, sd2;
    mlib_s32 i, emask;
    mlib_s32 off;
    mlib_s32 mask0 = 0x0369147a;
    mlib_s32 mask1 = 0x258b258b;
    mlib_s32 mask2 = 0x47ad58be;
    mlib_s32 mask3 = 0x69cf69cf;

    /* prepare the weight */
    alpha = vis_to_float(weight[0] * 8192);
    beta = vis_to_float(weight[1] * 8192);
    gamma = vis_to_float(weight[2] * 8192);
    vis_write_gsr(2 << 3);

    dp = (mlib_d64 *)((mlib_addr)dst & (~7));
    off = (mlib_addr)dp - (mlib_addr)dst;

    dst_end = dst + (dsize - 1);
    emask = vis_edge8(dst, dst_end);
    src_all = vis_alignaddr((void *)src, (3 * off));

    d0 = (*src_all++);
    d1 = (*src_all++);
    d2 = (*src_all++);
    d3 = (*src_all++);

    sd0 = vis_faligndata(d0, d1);
    sd1 = vis_faligndata(d1, d2);
    sd2 = vis_faligndata(d2, d3);

    CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
    CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
    vis_pst_8(dd, dp, emask);
    dp++;

#pragma pipeloop(0)
    for (i = 8 + off; i <= (dsize - 8); i += 8) {
        d0 = d3;
        d1 = (*src_all++);
        d2 = (*src_all++);
        d3 = (*src_all++);

        sd0 = vis_faligndata(d0, d1);
        sd1 = vis_faligndata(d1, d2);
        sd2 = vis_faligndata(d2, d3);

        CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
        CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
        (*dp++) = dd;
    }

    if ((mlib_addr)dp <= (mlib_addr)dst_end) {

        emask = vis_edge8(dp, dst_end);
        d0 = d3;
        d1 = (*src_all++);
        d2 = (*src_all++);
        d3 = (*src_all++);
        sd0 = vis_faligndata(d0, d1);
        sd1 = vis_faligndata(d1, d2);
        sd2 = vis_faligndata(d2, d3);

        CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
        CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
        vis_pst_8(dd, dp, emask);
    }
}
static mlib_status
mlib_v_VideoColorYUV2ABGR411_dst_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to src address */
	mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;

/* pointers to dst address */
	mlib_u8 *dp, *dl, *dend;

/* all. pointer to y */
	mlib_d64 *spy;

/* all. pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy0, dy1, dy2;
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
	mlib_d64 dd, dd0, dd1, dtmp;

/* used to load u, v into mlib_f32 */
	mlib_f32 ffu[1], ffv[1];

/* used to load u, v into mlib_f32 */
	mlib_u8 *ufu, *vfu;

/* 1.1644  * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);

/* 2.0184  * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);

/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);

/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);

/* 1.5966  * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);

/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

/* 135.6352  * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
	mlib_f32 fscale = vis_to_float(0x80808080);

/* loop variables */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_d64 *buf;
	mlib_s32 inc;

	ufu = (mlib_u8 *)ffu;
	vfu = (mlib_u8 *)ffv;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(3 << 3);

	buf = (mlib_d64 *)__mlib_malloc((width / 8 + 1) * sizeof (mlib_d64));

	if (buf == NULL)
		return (MLIB_FAILURE);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = buf;
		dy0 = vis_ld_d64_nf(spy); spy++;

#pragma pipeloop(0)
		for (i = 0; i < width; i += 8) {
			dy1 = vis_ld_d64_nf(spy); spy++;
			(*dpp++) = vis_faligndata(dy0, dy1);
			dy0 = dy1;
		}

		spy = buf;

		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);

		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
		i = dp - (mlib_u8 *)dpp;
		emask >>= i;
		vis_alignaddr((void *)(8 - i), 0);
		inc = (emask1 != 0xff);
		emask1 &= emask;

		ufu[0] = vis_ld_u8_nf(sp2);
		ufu[1] = vis_ld_u8_nf(sp2 + 1);
		ufu[2] = vis_ld_u8_nf(sp2 + 2);
		ufu[3] = vis_ld_u8_nf(sp2 + 3);
		vfu[0] = vis_ld_u8_nf(sp3);
		vfu[1] = vis_ld_u8_nf(sp3 + 1);
		vfu[2] = vis_ld_u8_nf(sp3 + 2);
		vfu[3] = vis_ld_u8_nf(sp3 + 3);
		sp2 += 4;
		sp3 += 4;

		fu = ffu[0];
		fv = ffv[0];

/*
 * 16-pixel column loop
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 16; i += 16) {

			dy1 = (*spy++);
			dy2 = (*spy++);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ufu[0] = vis_ld_u8_nf(sp2);

			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);
			ufu[1] = vis_ld_u8_nf(sp2 + 1);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			ufu[2] = vis_ld_u8_nf(sp2 + 2);

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);
			ufu[3] = vis_ld_u8_nf(sp2 + 3);

			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);
			vfu[0] = vis_ld_u8_nf(sp3);

			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);
			vfu[1] = vis_ld_u8_nf(sp3 + 1);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);
			vfu[2] = vis_ld_u8_nf(sp3 + 2);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);
			vfu[3] = vis_ld_u8_nf(sp3 + 3);

			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);

			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);

			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);

			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);

			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
			fu = ffu[0];
			fv = ffv[0];
			sp2 += 4;
			sp3 += 4;
			emask1 = emask;
		}

		if (i <= width - 8) {

			dy1 = (*spy++);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ufu[0] = ufu[2];

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			vfu[0] = vfu[2];

			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr = vis_fpack16_pair(dr1, dr2);
			dg = vis_fpack16_pair(dg1, dg2);
			db = vis_fpack16_pair(db1, db2);

			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			fu = ffu[0];
			fv = ffv[0];

			i += 8;
			emask1 = emask;
		}

		if (i < width) {

			dy1 = vis_ld_d64_nf(spy);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			fu = vis_fpack16(db1);

			dg2 = vis_fpmerge(fu, vis_fpack16(dg1));
			dg3 = vis_fpmerge(fu, vis_fpack16(dr1));

			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;

			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
		}

		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
		emask = 0x7777;
	}
	__mlib_free(buf);
	return (MLIB_SUCCESS);
}