/*
 * Look up one row of 3-channel S16 samples through three per-channel
 * tables and store the S16 results.
 *
 * Sample k of the row is translated with table (k % 3): the code starts
 * with table0/table1/table2 and rotates the three pointers after every
 * group of four samples.  Four results are gathered into one 8-byte word
 * and written with a full store; the last 1..3 samples go out through a
 * masked partial store.
 *
 * dst is used directly as an mlib_d64 pointer, so it is presumably
 * expected to be 8-byte aligned -- confirm with the caller.  Each sample
 * is turned into a byte offset (sample << 1) that is handed to
 * VIS_LD_U16_I together with the table pointer.
 */
void
mlib_v_ImageLookUp_S16_S16_3_D1(
    const mlib_s16 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 *table0,
    const mlib_s16 *table1,
    const mlib_s16 *table2)
{
/* read cursor over the source samples */
	mlib_s16 *in = (void *)src;

/* last destination element that belongs to the row */
	mlib_s16 *last = dst + xsize - 1;

/* 8-byte store cursor over the destination */
	mlib_d64 *out = (mlib_d64 *)dst;

/* byte offsets of the samples currently in flight */
	mlib_s32 off0, off1, off2, off3;

/* table entries fetched for those samples */
	mlib_d64 e0, e1, e2, e3;

/* accumulators: front collects results 0 and 1, back results 2 and 3 */
	mlib_d64 front, back;

/* mask for the final partial store */
	mlib_s32 edge;

/* group counter and number of trailing samples */
	mlib_s32 done, left;

/* scratch pointer used while rotating the tables */
	const mlib_s16 *rot;

/* every vis_faligndata below shifts by 6 bytes */
	vis_alignaddr((void *)0, 6);

	if (xsize >= 4) {

/* prime the pipeline with the first group of four samples */
		off0 = in[0] << 1;
		off1 = in[1] << 1;
		off2 = in[2] << 1;
		off3 = in[3] << 1;
		in += 4;

/* select the upper 4 bytes of each accumulator for vis_bshuffle */
		vis_write_bmask(0x012389ab, 0);

#pragma pipeloop(0)
		for (done = 0; done <= xsize - 8; done += 4, in += 4) {
			e0 = VIS_LD_U16_I(table0, off0);
			e1 = VIS_LD_U16_I(table1, off1);
			e2 = VIS_LD_U16_I(table2, off2);
			e3 = VIS_LD_U16_I(table0, off3);

			front = vis_faligndata(e1, front);
			front = vis_faligndata(e0, front);
			back = vis_faligndata(e3, back);
			back = vis_faligndata(e2, back);

/* fetch the next group while the current one is being stored */
			off0 = in[0] << 1;
			off1 = in[1] << 1;
			off2 = in[2] << 1;
			off3 = in[3] << 1;

			(*out++) = vis_bshuffle(front, back);

/* four samples consumed: channel phase advances by one */
			rot = table0;
			table0 = table1;
			table1 = table2;
			table2 = rot;
		}

/* last full group: same work, but no further source reads */
		e0 = VIS_LD_U16_I(table0, off0);
		e1 = VIS_LD_U16_I(table1, off1);
		e2 = VIS_LD_U16_I(table2, off2);
		e3 = VIS_LD_U16_I(table0, off3);

		front = vis_faligndata(e1, front);
		front = vis_faligndata(e0, front);
		back = vis_faligndata(e3, back);
		back = vis_faligndata(e2, back);

		(*out++) = vis_bshuffle(front, back);

		rot = table0;
		table0 = table1;
		table1 = table2;
		table2 = rot;
	}

	if ((mlib_addr)out <= (mlib_addr)last) {

/* position the read cursor on the very last source sample */
		left = last - (mlib_s16 *)out;
		in += left;
		left++;

/*
 * Walk the trailing samples backwards so that, after the shifts, the
 * first of them ends up at the front of the accumulator.
 */
		switch (left) {
		case 3:
			off0 = (mlib_s32)*in--;
			e0 = VIS_LD_U16_I(table2, off0 << 1);
			front = vis_faligndata(e0, front);
			/* FALLTHROUGH */
		case 2:
			off0 = (mlib_s32)*in--;
			e0 = VIS_LD_U16_I(table1, off0 << 1);
			front = vis_faligndata(e0, front);
			/* FALLTHROUGH */
		case 1:
			off0 = (mlib_s32)*in--;
			e0 = VIS_LD_U16_I(table0, off0 << 1);
			front = vis_faligndata(e0, front);
			break;
		default:
			break;
		}

/* write only the elements up to and including 'last' */
		edge = vis_edge16(out, last);
		vis_pst_16(front, out, edge);
	}
}
Ejemplo n.º 2
0
/*
 * Convert n signed 32-bit elements of x into unsigned 8-bit elements of z,
 * clamping every value to [0, MLIB_U8_MAX] (the scalar head and tail
 * loops below spell the clamp out explicitly).
 *
 * Work is split into:
 *   1. n < 8: handed to the PACK_S_S macro, which is defined outside this
 *      block (presumably a scalar loop that returns -- TODO confirm);
 *   2. a scalar loop that runs until dst is 8-byte aligned;
 *   3. a VIS loop that turns four mlib_d64 loads (8 source elements) into
 *      one 8-byte store per iteration;
 *   4. a scalar loop for the remaining (length & 7) elements.
 *
 * Returns MLIB_SUCCESS on every path visible in this block.
 */
mlib_status
__mlib_VectorConvert_U8_S32_Sat(
	mlib_u8 *z,
	const mlib_s32 *x,
	mlib_s32 n)
{
	mlib_s32 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d0, d_tmp, d1, d2, d3, d4;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_s32 c;

	if (n < 8) {
		PACK_S_S(mlib_s32, mlib_u8, MLIB_U8_MAX, 0);
	}

/*
 * First try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c =
			(*src++)) < 0 ? 0 : (c > MLIB_U8_MAX ? MLIB_U8_MAX : c);
		length--;
	}

/*
 * len_64 counts full 8-element groups, rest_64 the leftover elements;
 * even_length is the element index at which the scalar tail starts
 * (relative to the already advanced src/dst).
 */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;

/*
 * NOTE(review): 23 << 3 presumably programs the GSR scale field used by
 * vis_fpack32 -- confirm against the VIS manual.
 */
	vis_write_gsr(23 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
#pragma unroll(4)
		for (i = 0; i < len_64; i++) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = (*dsrc++);
			d4 = (*dsrc++);
/* each vis_fpack32 feeds the previous result in as its first operand */
			d1 = vis_fpack32(d1, d1);
			d2 = vis_fpack32(d1, d2);
			d3 = vis_fpack32(d2, d3);
			d4 = vis_fpack32(d3, d4);
/* interleave the two halves of d4 to form the 8 output bytes */
			(*ddst++) =
				vis_fpmerge(vis_read_hi(d4), vis_read_lo(d4));
		}
	} else {

/*
 * Source address is arbitrary aligned. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d0 = (*dsrc++);

#pragma pipeloop(0)
#pragma unroll(4)
		for (i = 0; i < len_64; i++) {
/* d0 and d_tmp alternate as the "previous" word for vis_faligndata */
			d_tmp = (*dsrc++);
			d1 = vis_faligndata(d0, d_tmp);
			d0 = (*dsrc++);
			d2 = vis_faligndata(d_tmp, d0);
			d_tmp = (*dsrc++);
			d3 = vis_faligndata(d0, d_tmp);
/* the last word of the group is fetched with vis_ld_d64_nf */
			d0 = vis_ld_d64_nf(dsrc); dsrc++;
			d4 = vis_faligndata(d_tmp, d0);
			d1 = vis_fpack32(d1, d1);
			d2 = vis_fpack32(d1, d2);
			d3 = vis_fpack32(d2, d3);
			d4 = vis_fpack32(d3, d4);
			(*ddst++) =
				vis_fpmerge(vis_read_hi(d4), vis_read_lo(d4));
		}
	}

/* scalar tail: the (length & 7) elements the VIS loop did not cover */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];
		dst[even_length + i] = c < MLIB_U8_MIN ? MLIB_U8_MIN
			: (c > MLIB_U8_MAX ? MLIB_U8_MAX : c);
	}

	return (MLIB_SUCCESS);
}
Ejemplo n.º 3
0
/*
 * Subtract every element of the signed 16-bit vector x from the scalar
 * *c and store the signed 32-bit results in z:
 *
 *	z[i] = c[0] - x[i]
 *
 * (see the scalar head and tail loops below).
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.  Note that *c
 * is read in the declarations, i.e. before n is validated.
 *
 * Vectors of at most 4 elements are handed to SUBS_S32_S16_IN_C, and the
 * VIS loops rely on SUBS_S32_S16_MOD; both macros are defined outside
 * this block.  SUBS_S32_S16_MOD presumably reads dx (plus dc, fone, dr1,
 * dr2) and produces dzh/dzl -- TODO confirm against the macro.
 */
mlib_status
__mlib_VectorSubS_S32_S16_Mod(
	mlib_s32 *z,
	const mlib_s16 *x,
	const mlib_s16 *c,
	mlib_s32 n)
{
	mlib_d64 *dpz, *dpx;
	mlib_d64 dx, dx0, dx1;
	mlib_d64 dr1, dr2, dzh, dzl;
	mlib_f32 fone = vis_to_float(0x10001);
	mlib_s32 uc = *((mlib_s16 *)c);
	mlib_s16 *px;
	mlib_s32 *pz;
	mlib_s32 len = n, i;

/*
 * even_8: number of full 4-element groups (8 bytes of source each);
 * rest_8: elements left over after those groups.
 */
	mlib_s32 rest_8, even_8;
	mlib_d64 dc = vis_to_double_dup(uc);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s16 *)x;
	pz = (mlib_s32 *)z;

	if (n <= 4)
		SUBS_S32_S16_IN_C;

/*
 * prepare the destination address
 */

	while ((mlib_addr)pz & 7) {
		(*pz++) = uc - ((mlib_s32)(*px));
		px++;
		len--;
	}

	dpz = (mlib_d64 *)pz;

	even_8 = len >> 2;
	rest_8 = len & 0x3;

	if (!((mlib_addr)px & 7)) {

/*
 * 'x' address is 8-byte aligned.
 * No  vis_alignaddr and  vis_faligndata at all.
 */

		dpx = (mlib_d64 *)px;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx = (*dpx++);
			SUBS_S32_S16_MOD;

/*
 * store 16 bytes of result
 */
			dpz[0] = dzh;
			dpz[1] = dzl;
			dpz += 2;
		}

	} else {

/*
 * "x"  address is arbitrary aligned.
 * 1 vis_alignaddr and 1 vis_faligndata in the loop.
 */

		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUBS_S32_S16_MOD;
/* carry the newest aligned word over as the next iteration's first half */
			dx0 = dx1;
/*
 * store 16 bytes of result
 */
			dpz[0] = dzh;
			dpz[1] = dzl;
			dpz += 2;
		}
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

/*
 * px and pz were not advanced by the VIS loops (those used dpx/dpz), so
 * skip the elements already processed before running the scalar tail.
 */
	px += (even_8 << 2);
	pz += (even_8 << 2);

	while (rest_8--) {
		(*pz++) = uc - ((mlib_s32)(*px));
		px++;
	}

	return (MLIB_SUCCESS);
}
Ejemplo n.º 4
0
/*
 * Convert n unsigned 8-bit elements of x into signed 8-bit elements of z,
 * clamping values above MLIB_S8_MAX to MLIB_S8_MAX (see the scalar head
 * and tail loops below).
 *
 * Work is split into:
 *   1. length < 16: handed to the PACK_U_S macro, which is defined
 *      outside this block (presumably a scalar loop that returns --
 *      TODO confirm);
 *   2. a scalar loop that runs until dst is 8-byte aligned;
 *   3. a VIS loop over len_64 8-byte groups; when len_64 is odd, one
 *      group is peeled off first so the main loop can handle two groups
 *      per iteration;
 *   4. a scalar loop for the remaining (length & 7) elements.
 *
 * Returns MLIB_SUCCESS on every path visible in this block.
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *src = (void *)x;
	mlib_s8 *dst = z;
	mlib_d64 fzero = vis_fzeros();
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_u8 c;
/* dsp is added to the widened data before packing; rst is xor-ed after */
	mlib_d64 dsp = vis_to_double_dup(0x800080);
	mlib_d64 rst = vis_to_double_dup(0x80808080);
/* multiplier operand for vis_fmul8x16al when widening bytes */
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
	}

/*
 * First, try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
/*
 * NOTE(review): 7 << 3 presumably programs the GSR scale field used by
 * the pack step -- confirm against the VIS manual.
 */
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 * The assignment inside the condition is intentional: i becomes 1 when
 * len_64 is odd (one group handled here) and 0 otherwise.
 */

		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2. Unroll for 2 iterations.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d4 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			d2 = vis_fpmerge(fzero, vis_read_hi(d4));
			d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
			(*ddst++) = vis_fxor(d4, rst);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 * As above, i is set to (len_64 & 1) by the condition itself.
 */

		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d1 = vis_faligndata(d1, d2);
			d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
			d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpadd16(dsp, d4);
			d1 = vis_fpack16_pair(d3, d4);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
/* d2 always holds the most recently loaded aligned source word */
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d6 = vis_faligndata(d1, d2);
			d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d3 = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d6 = vis_fpack16_pair(d4, d5);
			(*ddst++) = vis_fxor(d3, rst);
			(*ddst++) = vis_fxor(d6, rst);
		}
	}

/* scalar tail: the (length & 7) elements the VIS loops did not cover */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c =
			src[even_length + i]) > MLIB_S8_MAX ? MLIB_S8_MAX : c;

	return (MLIB_SUCCESS);
}
Ejemplo n.º 5
0
/*
 * Convert n signed 16-bit elements of x into unsigned 8-bit elements of
 * z, clamping every value to [0, MLIB_U8_MAX] (see the scalar head and
 * tail loops below).
 *
 * Work is split into:
 *   1. n < 16: handed to the PACK_S_U_DF macro, which is defined outside
 *      this block (presumably a scalar loop that returns -- TODO confirm);
 *   2. a scalar loop that runs until dst is 8-byte aligned;
 *   3. a VIS loop over len_64 output words (8 elements each, built from
 *      two source words by vis_fpack16_pair); when len_64 is odd, one
 *      output word is peeled off first so the main loop can produce two
 *      per iteration;
 *   4. a scalar loop for the remaining (length & 7) elements.
 *
 * Returns MLIB_SUCCESS on every path visible in this block.
 */
mlib_status
__mlib_VectorConvert_U8_S16_Sat(
	mlib_u8 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	mlib_s16 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6, d7;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_s16 c;

	if (n < 16) {
		PACK_S_U_DF(mlib_s16, mlib_u8, MLIB_U8_MAX, 0);
	}

/*
 * First try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c =
			(*src++)) < 0 ? 0 : (c > MLIB_U8_MAX ? MLIB_U8_MAX : c);
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
/*
 * NOTE(review): 7 << 3 presumably programs the GSR scale field used by
 * the pack step -- confirm against the VIS manual.
 */
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 * The assignment inside the condition is intentional: i becomes 1 when
 * len_64 is odd (one output word handled here) and 0 otherwise.
 */

		if (i = (len_64 & 1)) {
			d4 = (*dsrc++);
			d5 = (*dsrc++);
			d3 = vis_fpack16_pair(d4, d5);
			(*ddst++) = d3;
		}

/*
 * Then loop with step==2. Unroll for 2 iterations.
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d5 = (*dsrc++);
			d6 = (*dsrc++);
			d3 = vis_fpack16_pair(d1, d2);
			d7 = vis_fpack16_pair(d5, d6);
			(*ddst++) = d3;
			(*ddst++) = d7;
		}
	} else {

/*
 * Source address is 2-byte aligned. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 * As above, i is set to (len_64 & 1) by the condition itself.
 */

		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d4 = vis_faligndata(d1, d2);
			d3 = vis_fpack16_pair(d3, d4);
			(*ddst++) = d3;
		}

/*
 * Then loop with step==2.
 * NOTE(review): unlike the peeled iteration, this loop uses plain loads
 * rather than vis_ld_d64_nf for every source word.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
/* d2 always holds the most recently loaded aligned source word */
			d1 = d2;
			d2 = (*dsrc++);
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d4 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d5 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d6 = vis_faligndata(d1, d2);
			d3 = vis_fpack16_pair(d3, d4);
			d5 = vis_fpack16_pair(d5, d6);
			(*ddst++) = d3;
			(*ddst++) = d5;
		}
	}

/* scalar tail: the (length & 7) elements the VIS loops did not cover */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c =
			src[even_length + i]) < 0 ? 0 : (c >
			MLIB_U8_MAX ? MLIB_U8_MAX : c);

	return (MLIB_SUCCESS);
}
Ejemplo n.º 6
0
/*
 * Combine two U8 images row by row into dst using the
 * MLIB_V_ADDIMAGE_U8(a, b, result) macro, 8 bytes at a time.
 *
 * The names sp1, sp2, dp, sl1, sl2, dl, width, height, channels, stride1,
 * stride2 and strided are used below but not declared in this block;
 * they presumably come from the VALIDATE macro, which may also return
 * early on invalid images -- TODO confirm.  MLIB_V_ADDIMAGE_U8 is defined
 * outside this block as well (presumably a saturating add that uses the
 * locals nul, fone, sd1h, sd2h, sd1l, sd2l, rdh, rdl -- TODO confirm).
 *
 * Five code paths are selected from the relative 8-byte alignment of the
 * three row pointers and of their strides:
 *   1. dst, src1 and src2 all share the same alignment;
 *   2. dst and src1 share alignment, src2 goes through vis_faligndata;
 *   3. dst and src2 share alignment, src1 goes through vis_faligndata;
 *   4. src1 and src2 share alignment, the result goes through
 *      vis_faligndata;
 *   5. general case: src1 is first copied (realigned) into dst, then
 *      src2 is added to it.
 *
 * In every path a row is written as an optional masked leading store,
 * a run of full 8-byte stores, and an optional masked trailing store.
 *
 * Returns MLIB_SUCCESS on every path visible in this block.
 */
mlib_status
mlib_v_ImageAdd_U8(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
	mlib_s32 i, j, k;
	mlib_s32 offdst, offsrc1, offsrc2, emask;
	mlib_s32 amount;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
	mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
	mlib_d64 sd1h, sd2h, sd1l, sd2l, rdh, rdl;
	mlib_u8 *dend;
	mlib_f32 nul = vis_to_float(0), fone = vis_to_float(0x100);

	VALIDATE(mlib_u8);

/* initialize GSR scale factor */
	vis_write_gsr(7 << 3);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

/* number of bytes to process in each row */
	amount = width * channels;

	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

/* case 1: all three pointers keep the same alignment on every row */
	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 7) == 0) &&
	    (((strided ^ stride2) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/* i <= 0: (negated) number of bytes between dpp and the row start */
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

/* partial first word: store only the bytes selected by emask */
			if (emask != 0xff) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
			}

/* partial last word */
			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

/* advance all three pointers to the next row */
			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/* case 2: dst and src1 share alignment; src2 is realigned */
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

			sd20 = vis_ld_d64_nf(spp2);

			if (emask != 0xff) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				sd20 = sd21;
				spp2++;
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
/* keep the newest src2 word for the next vis_faligndata */
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd10 = (*spp1++);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/* case 3: dst and src2 share alignment; src1 is realigned */
	} else if ((offdst == offsrc2) && (((strided ^ stride2) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

			sd10 = vis_ld_d64_nf(spp1);

			if (emask != 0xff) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				sd10 = sd11;
				spp1++;
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
/* keep the newest src1 word for the next vis_faligndata */
				sd10 = sd11;
				spp1++;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd20 = (*spp2++);
				sd10 = vis_faligndata(sd10,
					vis_ld_d64_nf(spp1 + 1));
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/* case 4: src1 and src2 share alignment; the result is realigned */
	} else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 7) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

/* dd0 holds the previous combined word, dd1 the newest one */
			sd10 = vis_ld_d64_nf(spp1); spp1++;
			sd20 = vis_ld_d64_nf(spp2); spp2++;
			MLIB_V_ADDIMAGE_U8(sd10, sd20, dd0);

			if (emask != 0xff) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 8;
			}
#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
				(*dpp++) = vis_faligndata(dd0, dd1);
				dd0 = dd1;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else {
/* common case */

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u8 *)dpp - dp;

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge8(dp, dend);

/* leading partial word: realign both sources separately */
			if (emask != 0xff) {
				spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
				sd10 = vis_faligndata(vis_ld_d64_nf(spp1),
					vis_ld_d64_nf(spp1 + 1));
				spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);
				sd20 = vis_faligndata(vis_ld_d64_nf(spp2),
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				vis_pst_8(dd, dpp++, emask);
				i += 8;
			}

/* copy src1 to dst */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
			sd11 = vis_ld_d64_nf(spp1);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (k = i; k <= (amount - 8); k += 8) {
				sd10 = sd11;
				sd11 = vis_ld_d64_nf(spp1 + 1);
				(*tmp_ptr++) = vis_faligndata(sd10, sd11);
				spp1++;
			}

/* realigned src1 data for the trailing partial word, used further down */
			sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1));

/* second pass: combine the realigned src1 copy held in dst with src2 */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);
			sd20 = vis_ld_d64_nf(spp2);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (; i <= amount - 8; i += 8) {
				sd10 = (*tmp_ptr++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge8(dpp, dend);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_U8(sd11, sd20, dd);
				vis_pst_8(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	}

	return (MLIB_SUCCESS);
}
Ejemplo n.º 7
0
mlib_status
mlib_v_ImageAdd_U16(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
	mlib_s32 i, j, k;
	mlib_s32 offdst, offsrc1, offsrc2, emask, mask;
	mlib_s32 amount;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr, tmp;
	mlib_d64 sd10, sd11, sd20, sd21;
	mlib_d64 ones = vis_to_double_dup(0x7fff7fff);
	mlib_d64 max_u16 = vis_to_double_dup(0xffffffff);
	mlib_u16 *dend;

	VALIDATE(mlib_u16);

/* initialize GSR scale factor */
	vis_write_gsr(15 << 3);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

	amount = width * channels;

	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 3) == 0) &&
	    (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp,
				    emask);
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U16(sd10, sd20, dpp)
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp,
				    emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {
Ejemplo n.º 8
0
/*
 * Element-wise subtraction of two U8 vectors, z = x - y, for n elements.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * Two top-level paths:
 *   - x, y and z all 8-byte aligned: 16 bytes per iteration, the first
 *     8 through the SUB_S8_MOD macro and the second 8 through two 32-bit
 *     integer operations that subtract four packed bytes at once without
 *     borrows crossing byte boundaries (wrap-around per byte);
 *   - general case: z is aligned down to 8 bytes, a masked store covers
 *     the leading partial word, then one of five loops is chosen from
 *     the alignment of x and y, and a final masked store covers the
 *     remaining rest_8 bytes.
 *
 * SUB_S8_MOD is defined outside this block; it presumably computes dz
 * from dx and dy (using dh, dl, dxl, dyl and mask_control as scratch) --
 * TODO confirm against the macro.
 */
mlib_status
__mlib_VectorSub_U8_U8_Mod(
	mlib_u8 *z,
	const mlib_u8 *x,
	const mlib_u8 *y,
	mlib_s32 n)
{
/* 8-byte aligned start point in destination */
	mlib_d64 *dpz;

/* 8-byte aligned start point in source */
	mlib_d64 *dpx, *dpy;

/* source  data */
	mlib_d64 dx, dy, dx0;
	mlib_d64 dx1, dy0, dy1;

/* destination data */
	mlib_d64 dz;

/* intermediate result */
	mlib_d64 dh, dl;
	mlib_d64 dxl, dyl;

/* end point of a line in destination */
	mlib_u8 *pzend;

/* start point of a line in source */
	mlib_u8 *px, *py;

/* offset of address alignment in destination */
	mlib_s32 off;

/* edge mask */
	mlib_s32 emask;
	mlib_u8 *pzend16;
	mlib_s32 sr1, sr2, sr3;
	mlib_s32 x8, x12, y8, y12;
/* selects the low 7 bits of every byte in a 32-bit word */
	mlib_s32 mask = 0x7f7f7f7f;
	mlib_u8 *pz;
	mlib_s32 n16;
	mlib_s32 nrest;
	mlib_s32 len = n, i;

/* rest and length in terms of 8 bytes. */
	mlib_s32 rest_8, even_8;
	mlib_d64 mask_control = vis_to_double_dup(0xff00ff00);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_u8 *)x;
	py = (mlib_u8 *)y;
	pz = (mlib_u8 *)z;

/*
 * prepare the destination address
 */
	pzend = pz + n - 1;

/*
 * check for 64-bit aligned special case
 */

	if ((((mlib_addr)x | (mlib_addr)y | (mlib_addr)z) & 7) == 0) {

/*
 * We can process source and destination vectors by 16 bytes.
 */

		dpx = (mlib_d64 *)x;
		dx = vis_ld_d64_nf(dpx);
		dpy = (mlib_d64 *)y;
		dy = vis_ld_d64_nf(dpy);
		dpz = (mlib_d64 *)z;
		n16 = n & (~0xf);
		pzend16 = pz + n16;
#pragma pipeloop(0)
		while ((mlib_addr)pz < (mlib_addr)pzend16) {
/*
 * Bytes 8..15 of the 16-byte group, four at a time in integer
 * registers: the low 7 bits of each byte are subtracted with the top
 * bit of x forced to 1 (so no borrow leaves the byte), then the top
 * bit of each result byte is fixed up with the xor.
 */
			x8 = *((mlib_s32 *)(px + 8));
			y8 = *((mlib_s32 *)(py + 8));
			sr1 = x8 ^ ~y8;
			sr2 = (x8 | ~mask) - (y8 & mask);
			sr3 = (sr1 & ~mask) ^ sr2;
			*((mlib_s32 *)(pz + 8)) = sr3;
			x12 = *((mlib_s32 *)(px + 12));
			y12 = *((mlib_s32 *)(py + 12));
			sr1 = x12 ^ ~y12;
			sr2 = (x12 | ~mask) - (y12 & mask);
			sr3 = (sr1 & ~mask) ^ sr2;
			*((mlib_s32 *)(pz + 12)) = sr3;
/* bytes 0..7 of the group go through the VIS macro (dx, dy -> dz) */
			SUB_S8_MOD;
/* store 8 bytes of result */
			*((mlib_d64 *)pz) = dz;
/* preload the first 8 bytes of the next group */
			dx = vis_ld_d64_nf(px + 16);
			dy = vis_ld_d64_nf(py + 16);
			px += 16;
			py += 16;
			pz += 16;
		}

		dpz = (mlib_d64 *)pzend16;
		nrest = n - n16;

/* dx/dy already hold the next 8 source bytes (preloaded above) */
		if (nrest >= 8) {
			SUB_S8_MOD;
			dpz[0] = dz;
			px += 8;
			py += 8;
			dpz++;
			nrest -= 8;
		}

/* fewer than 8 bytes left: masked store up to pzend */
		if (nrest > 0) {
			dx = *((mlib_d64 *)px);
			dy = *((mlib_d64 *)py);
			SUB_S8_MOD;
			emask = vis_edge8(dpz, pzend);
			vis_pst_8(dz, dpz, emask);
		}
	} else {

/*
 * General case.
 */

		dpz = (mlib_d64 *)((mlib_addr)z & (~7));
/* off is zero or negative: distance from z back to the aligned word */
		off = (mlib_addr)dpz - (mlib_addr)z;
/*
 * generate edge mask for the start point
 */
		emask = vis_edge8(pz, pzend);

/*
 * prepare the source address
 */

		if (off) {
			dpy = (mlib_d64 *)vis_alignaddr(py, off);
			dy0 = vis_ld_d64_nf(dpy);
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
			dpx = (mlib_d64 *)vis_alignaddr(px, off);
			dx0 = vis_ld_d64_nf(dpx);
			dx1 = vis_ld_d64_nf(dpx + 1);
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_MOD;

/*
 * store first bytes of result
 */

			vis_pst_8(dz, dpz, emask);

/* 8 + off is the number of elements just written */
			px += (8 + off);
			py += (8 + off);
			len -= (8 + off);
			dpz++;

			if (len <= 0)
				return (MLIB_SUCCESS);
		}

		even_8 = len >> 3;
		rest_8 = len & 0x7;

/*
 * Now try to analyze source "x" and "y" addresses.
 *
 * Every branch below must leave dx1/dy1 and dpx/dpy in the state the
 * shared tail after the branches expects: the tail shifts dx1 into
 * dx0, loads a new dx1 from dpx and realigns (same for y).
 */

		if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned. No  vis_alignaddr
 * and  vis_faligndata at all.
 */

			dpx = (mlib_d64 *)px;
			dpy = (mlib_d64 *)py;

			dx = vis_ld_d64_nf(dpx);
			dpx++;
			dy = vis_ld_d64_nf(dpy);
			dpy++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				dx1 = vis_ld_d64_nf(dpx);
				dy1 = vis_ld_d64_nf(dpy);
				SUB_S8_MOD;
				dx = dx1;
				dy = dy1;
/*
 * store 8 bytes of result
 */
				dpz[0] = dz;
				dpx++;
				dpy++;
				dpz++;
			}

/* hand the pending words to the tail */
			dx1 = dx;
			dy1 = dy;
		} else if ((!((mlib_addr)px & 7))) {

/*
 * First ("x") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "y".
 */

			dpx = (mlib_d64 *)px;
			dpy = vis_alignaddr(py, 0);
			dy0 = vis_ld_d64_nf(dpy);
			dpy++;
			dy1 = vis_ld_d64_nf(dpy);
			dy = vis_faligndata(dy0, dy1);
			dx = vis_ld_d64_nf(dpx);
			dpx++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				SUB_S8_MOD;
				dx = vis_ld_d64_nf(dpx);
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy + 1);
				dy = vis_faligndata(dy0, dy1);
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
				dpx++;
				dpy++;
			}

/* the tail reloads the word at dpy, so pass the older word as dy1 */
			dx1 = dx;
			dy1 = dy0;
		} else if ((!((mlib_addr)py & 7))) {

/*
 * Second ("y") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "x".
 */

			dpy = (mlib_d64 *)py;
			dpx = vis_alignaddr(px, 0);
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				dy = (*dpy++);
				dx0 = dx1;
				dx1 = vis_ld_d64_nf(dpx);
				dpx++;
				dx = vis_faligndata(dx0, dx1);
				SUB_S8_MOD;
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
			}

/* preload the next y word for the tail */
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;
		} else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") address are identically aligned.
 * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop.
 */

			dpx = vis_alignaddr(px, 0);
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dpy = vis_alignaddr(py, 0);
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy);
				dpy++;
				dy = vis_faligndata(dy0, dy1);
				dx0 = dx1;
				dx1 = vis_ld_d64_nf(dpx);
				dpx++;
				dx = vis_faligndata(dx0, dx1);
				SUB_S8_MOD;
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
			}
		} else {

/*
 * Both ("x" and "y") address are arbitrary aligned.
 * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
 */

			dpx = vis_alignaddr(px, 0);
			dx0 = vis_ld_d64_nf(dpx);
			dpx++;
			dx1 = vis_ld_d64_nf(dpx);
			dx = vis_faligndata(dx0, dx1);
			dpy = vis_alignaddr(py, 0);
			dy0 = vis_ld_d64_nf(dpy);
			dpy++;
			dy1 = vis_ld_d64_nf(dpy);
			dy = vis_faligndata(dy0, dy1);

#pragma pipeloop(0)
			for (i = 0; i < even_8; i++) {
				SUB_S8_MOD;
/* the alignment offset has to be switched between y and x each time */
				vis_alignaddr(py, 0);
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy + 1);
				dy = vis_faligndata(dy0, dy1);
				vis_alignaddr(px, 0);
				dx0 = dx1;
				dx1 = vis_ld_d64_nf(dpx + 1);
				dx = vis_faligndata(dx0, dx1);
/*
 * store 8 bytes of result
 */
				(*dpz++) = dz;
				dpy++;
				dpx++;
			}

/* the tail reloads the words at dpx/dpy, so pass the older words */
			dx1 = dx0;
			dy1 = dy0;
		}

		if (!rest_8)
			return (MLIB_SUCCESS);

/* shared tail: build the last (partial) dx/dy from the state left above */
		vis_alignaddr(px, 0);
		dx0 = dx1;
		dx1 = vis_ld_d64_nf(dpx);
		dx = vis_faligndata(dx0, dx1);
		vis_alignaddr(py, 0);
		dy0 = dy1;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);
		SUB_S8_MOD;

/*
 * prepare edge mask for the last bytes
 *
 * NOTE(review): rest_8 (1..7) is passed in place of an address, so only
 * its low bits matter; the mask is then complemented before the store,
 * presumably so that exactly the first rest_8 bytes of the word are
 * written -- confirm against the vis_edge8 definition.
 */

		emask = vis_edge8((void *)(rest_8), pzend);

/* store last bytes of result */
		vis_pst_8(dz, dpz, ~emask);
	}

	return (MLIB_SUCCESS);
}
Ejemplo n.º 9
0
/*
 * Apply the emphasize filter described by 'filter' to n stereo S16
 * samples (2 * n interleaved values) from src and store the result in
 * dst.
 *
 * Parameters:
 *   dst    - destination buffer, 2 * n mlib_s16 values are written;
 *   src    - source buffer, 2 * n mlib_s16 values are read;
 *   filter - filter state, an mlib_emphasize_struct whose type field
 *            must equal MLIB_EMPH;
 *   n      - number of stereo samples, must be positive.
 *
 * Returns MLIB_FAILURE when filter, src or dst is NULL, when n <= 0 or
 * when the filter type is wrong; MLIB_SUCCESS otherwise.
 *
 * On success the last source pair (src[2n-2], src[2n-1]) is saved into
 * the filter state (v16_last0 / v16_last1) for the next call.
 *
 * The arithmetic itself lives in the MLIB_MUL8 and MLIB_MIX macros,
 * which are defined outside this block and operate on the locals
 * declared here (presumably w_src, w_lsrc, v_alpha, v_mask and dr0..dr7
 * -- TODO confirm against the macros).
 *
 * The filter state is read only after the arguments have been
 * validated, so a NULL filter is rejected instead of being dereferenced
 * while the locals are initialized.
 */
mlib_status
__mlib_SignalEmphasize_S16S_S16S_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
	mlib_emphasize_struct *fist = filter;
	mlib_d64 w_maskand0 = vis_to_double(0xFFFFFFFF, 0xFFFF);
	mlib_d64 w_maskor0;
	mlib_d64 w_maskand1 = vis_to_double(0xFFFFFFFF, 0xFFFF0000);
	mlib_d64 w_maskor1;
	mlib_f32 v_mask	    = vis_to_float(0x80008000);
	mlib_f32 v_alpha;
	mlib_s16 *fdst;
	mlib_d64 *dpd, *dps, *dsrct1;
	mlib_d64 w_dst, w_src, w_src0, w_src1, w_src2, w_lsrc;
	mlib_d64 dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7;
	mlib_s32 i, times, t1, t2;

/* check for obvious errors */

	if ((fist == NULL) || (n <= 0) || (src == 0) || (dst == 0) ||
	    (fist->type != MLIB_EMPH)) {
		return (MLIB_FAILURE);
	}

/* the arguments are valid: now it is safe to read the filter state */
	w_maskor0 = vis_freg_pair(0.f, fist->v16_last0);
	w_maskor1 = vis_freg_pair(0.f, fist->v16_last1);
	v_alpha = fist->v_alpha;

/* last destination element that may be written */
	fdst = dst + n + n - 1;

	vis_write_gsr(1 << 3);
	w_maskor0 = vis_fand(w_maskor0, w_maskand1);
	w_maskor1 = vis_fand(w_maskor1, w_maskand0);

/* rotate the masks and saved samples according to the alignment of src */
	vis_alignaddr((void *)(-(mlib_addr)src), 0);
	w_maskand0 = vis_faligndata(w_maskand0, w_maskand0);
	w_maskor0 = vis_faligndata(w_maskor0, w_maskor0);
	w_maskand1 = vis_faligndata(w_maskand1, w_maskand1);
	w_maskor1 = vis_faligndata(w_maskor1, w_maskor1);

	dpd = vis_alignaddr(dst, 0);
/* number of 8-byte destination words between the first and the last */
	times = (mlib_d64 *)vis_alignaddr(fdst, 0) - dpd;
/* t1 aligns src like dst; t2 additionally steps back 4 bytes (one pair) */
	t1 = -((mlib_addr)(dst) & 7);
	t2 = t1 - 4;
	dps = vis_alignaddr((void *)src, t2);
	w_src0 = vis_ld_d64_nf(dps);
	dps++;
	w_src1 = vis_ld_d64_nf(dps);
	dps++;

/* src and dst are aligned differently within an 8-byte word */
	if ((((mlib_addr)dst ^ (mlib_addr)src) & 7)) {
/* splice the saved samples into the word that precedes src */
		if (((mlib_addr)dps - (mlib_addr)src) >= 6) {
			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand0, w_src1);
			w_src1 = vis_for(w_maskor0, w_src1);
		}

		if (((mlib_addr)dps - (mlib_addr)src) >= 8) {
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
		}

		w_lsrc = vis_faligndata(w_src0, w_src1);
		dsrct1 = vis_alignaddr((void *)src, t1);

/* the current and the delayed data come from different word pairs */
		if (dps - 2 != dsrct1) {
			w_src2 = *dps;
			dps++;
			w_src = vis_faligndata(w_src1, w_src2);

			MLIB_MUL8;

/* unaligned dst: the first word goes out through a masked store */
			if ((mlib_addr)dst & 7) {
				times--;
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = *dps;
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dps++;

				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);

			MLIB_MIX;

			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);
			dps++;

/* steady state: store the previous result while computing the next */
			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dpd++;
				dps++;
			}
		} else {
/* the current and the delayed data come from the same word pair */
			w_src = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;

			if ((mlib_addr)dst & 7) {
				times--;
				w_src0 = w_src1;
				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;

				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			w_src0 = w_src1;

			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			MLIB_MIX;
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			dps++;

			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;
				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;

				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;
				dpd++;
			}
		}
	} else {
/* src and dst share the same alignment: current data needs no shift */
		w_src = w_src1;

		if ((mlib_addr)src & 7) {
			times--;

			if (((mlib_addr)src & 7) == 2) {
				w_src0 = vis_fand(w_maskand0, w_src0);
				w_src0 = vis_for(w_maskor0, w_src0);
			} else {
				w_src1 = vis_fand(w_maskand0, w_src1);
				w_src1 = vis_for(w_maskor0, w_src1);
			}

			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
			w_lsrc = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;

			w_src0 = w_src1;
			w_src1 = *dps;
			w_src = w_src1;
			w_lsrc = vis_faligndata(w_src0, w_src1);
			dps++;

			MLIB_MIX;

			w_dst = vis_fpackfix_pair(dr2, dr3);
			vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
			dpd++;
		} else {
			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
			w_lsrc = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;
		}

		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);

		MLIB_MIX;

		w_src1 = w_src;
		w_dst = vis_fpackfix_pair(dr2, dr3);
		dps++;
		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);
		dps++;

		for (i = 0; i < times; i++) {
			*dpd = w_dst;

			MLIB_MIX;

			w_src1 = w_src;
			w_src = vis_ld_d64_nf(dps);
			w_lsrc = vis_faligndata(w_src1, w_src);
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			dpd++;

		}
	}

/* the last result word is written through a mask that stops at fdst */
	if (times >= 0) {
		vis_pst_16(w_dst, dpd, vis_edge16(dpd, fdst));
	}
/* remember the final stereo pair for the next call */
	((mlib_s16 *)&fist->v16_last0)[0] = src[2 * n - 2];
	((mlib_s16 *)&fist->v16_last1)[1] = src[2 * n - 1];

	return (MLIB_SUCCESS);
}
Ejemplo n.º 10
0
/*
 * Element-wise subtraction of two signed 8-bit vectors into a signed
 * 16-bit vector: z[i] = (mlib_s16)x[i] - y[i] for 0 <= i < n.  That is what
 * the scalar head and tail loops below compute; the vector loops delegate
 * the arithmetic to the SUB_S8_S16 macro, which is defined outside this
 * block and is presumably equivalent -- TODO confirm.
 *
 *	z	destination vector (n elements of mlib_s16)
 *	x, y	source vectors (n elements of mlib_s8 each)
 *	n	number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VectorSub_S16_S8_Mod(
	mlib_s16 *z,
	const mlib_s8 *x,
	const mlib_s8 *y,
	mlib_s32 n)
{
	mlib_d64 *dpz, *dpx, *dpy;
	mlib_d64 dx, dy, dx0, dx1, dy0, dy1;

/*
 * NOTE(review): dxh/dxl/dyh/dyl, fone and restore are never named in the
 * statements below; presumably SUB_S8_S16 uses them (it must at least
 * produce dzh/dzl from dx/dy) -- confirm against the macro definition.
 */
	mlib_d64 dxh, dxl, dyh, dyl, dzh, dzl;
	mlib_f32 fone = vis_to_float(0x100);
	mlib_s8 *px, *py;
	mlib_s16 *pz;
	mlib_s32 len = n, i;

/* remainder (rest_8) and count of whole 8-element groups (even_8) */
	mlib_s32 rest_8, even_8;
	mlib_d64 restore = vis_to_double_dup(0x80808080);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s8 *)x;
	py = (mlib_s8 *)y;
	pz = (mlib_s16 *)z;

/*
 * Short vectors.  SUB_S16_S8_IN_C is defined elsewhere; presumably it
 * handles the whole vector in scalar C and returns -- TODO confirm.
 * The alignment loop below does not check len, so it depends on that.
 */
	if (n <= 8) {
		SUB_S16_S8_IN_C;
	}

/*
 * Scalar head: process elements one by one until the destination
 * pointer is 8-byte aligned, so the vector loops can store whole
 * mlib_d64 words.
 */

	while ((mlib_addr)pz & 7) {
		(*pz++) = ((mlib_s16)(*px)) - (*py);
		px++;
		py++;
		len--;
	}

	dpz = (mlib_d64 *)pz;

/* each vector iteration consumes 8 source bytes from x and from y */
	even_8 = len >> 3;
	rest_8 = len & 0x7;

	if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned.
 * No  vis_alignaddr and  vis_faligndata at all.
 */

		dpx = (mlib_d64 *)px;
		dpy = (mlib_d64 *)py;

/* preload the first pair; the loop always loads one iteration ahead */
		dx = vis_ld_d64_nf(dpx);
		dy = vis_ld_d64_nf(dpy);
		dpx++;
		dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx);
			dy1 = vis_ld_d64_nf(dpy);
			SUB_S8_S16;
			dx = dx1;
			dy = dy1;

/*
 * store 16 bytes of result
 */
			(*dpz++) = dzh;
			(*dpz++) = dzl;
			dpx++;
			dpy++;
		}
	} else if ((!((mlib_addr)px & 7))) {

/*
 * First ("x") address is 8-byte aligned.
 * vis_alignaddr and vis_faligndata only for "y".
 */

		dpx = (mlib_d64 *)px;
		dpy = vis_alignaddr(py, 0);
		dy1 = vis_ld_d64_nf(dpy);
		dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx = (*dpx++);
/* rebuild 8 unaligned "y" bytes from two neighbouring aligned words */
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;
			dy = vis_faligndata(dy0, dy1);
			SUB_S8_S16;
/*
 * store 16 bytes of result
 */
			dpz[0] = dzh;
			dpz[1] = dzl;
			dpz += 2;
		}
	} else if ((!((mlib_addr)py & 7))) {

/*
 * Second ("y") address is 8-byte aligned.
 * vis_alignaddr and vis_faligndata only for "x".
 */

		dpy = (mlib_d64 *)py;
		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy = (*dpy++);
/* rebuild 8 unaligned "x" bytes from two neighbouring aligned words */
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_S16;

/*
 * store 16 bytes of result
 */
			dpz[0] = dzh;
			dpz[1] = dzl;
			dpz += 2;
		}
	} else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") addresses are identically aligned.
 * The alignment offset is the same for both, so it is set up once
 * before the loop and shared by the 2 vis_faligndata(s) in the loop.
 */

		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
		dpy = vis_alignaddr(py, 0);
		dy1 = vis_ld_d64_nf(dpy);
		dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;
			dy = vis_faligndata(dy0, dy1);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_S16;

/*
 * store 16 bytes of result
 */
			dpz[0] = dzh;
			dpz[1] = dzl;
			dpz += 2;
		}
	} else {

/*
 * Both ("x" and "y") addresses are arbitrarily aligned.
 * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop: the
 * alignment offset has to be switched between "y" and "x" every
 * iteration.  dx/dy for the current iteration are prepared one
 * iteration ahead (first pair before the loop).
 */

		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx);
		dpx++;
		dx1 = vis_ld_d64_nf(dpx);
		dx = vis_faligndata(dx0, dx1);
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy);
		dpy++;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			SUB_S8_S16;
			vis_alignaddr(py, 0);
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
			vis_alignaddr(px, 0);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx + 1);
			dx = vis_faligndata(dx0, dx1);
/*
 * store 16 bytes of result
 */
			dpz[0] = dzh;
			dpz[1] = dzl;
			dpz += 2;
			dpy++;
			dpx++;
		}
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

/*
 * Scalar tail: skip everything the vector loop produced
 * (8 elements per iteration) and finish the last rest_8 elements.
 */
	px += (even_8 << 3);
	py += (even_8 << 3);
	pz += (even_8 << 3);

	while (rest_8--) {
		(*pz++) = ((mlib_s16)(*px)) - (*py);
		px++;
		py++;
	}

	return (MLIB_SUCCESS);
}
/* Example no. 11 */
/* 0 */
/*
 * Element-wise subtraction of two signed 8-bit vectors into a signed 8-bit
 * vector, 8 elements per step.  The arithmetic itself is done by the
 * SUB_S8_SAT macro, which is defined outside this block; going by the
 * function name it presumably computes a saturated x[i] - y[i] -- TODO
 * confirm against the macro.
 *
 *	z	destination vector (n elements)
 *	x, y	source vectors (n elements each)
 *	n	number of elements
 *
 * The destination is written in 8-byte words; the unaligned head and the
 * tail are written with partial stores (vis_pst_8) under an edge mask, so
 * bytes outside [z, z + n) are left untouched.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VectorSub_S8_S8_Sat(
	mlib_s8 *z,
	const mlib_s8 *x,
	const mlib_s8 *y,
	mlib_s32 n)
{
	mlib_d64 *dpz, *dpx, *dpy;
	mlib_d64 dx, dy, dz, dx0, dx1, dy0, dy1;

/*
 * NOTE(review): dxh..dzl, dh, dl, displacement, restore and fmul are never
 * named in the statements below; presumably they are the working set of
 * SUB_S8_SAT (which must at least produce dz from dx/dy) -- confirm.
 */
	mlib_d64 dxh, dxl, dyh, dyl, dzh, dzl;
	mlib_d64 dh, dl;
	mlib_s8 *pz = z, *px, *py, *pzend;

/* offset of address alignment in destination (zero or negative) */
	mlib_s32 off;
	mlib_s32 len = n, i;

/* remainder (rest_8) and count of whole 8-byte groups (even_8) */
	mlib_s32 rest_8, even_8;

/* edge masks */
	mlib_s32 emask;
	mlib_d64 displacement = vis_to_double_dup(0x8000800);
	mlib_d64 restore = vis_to_double_dup(0x80808080);
	mlib_f32 fmul = vis_to_float(0x1000);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s8 *)x;
	py = (mlib_s8 *)y;

/* initialize GSR scale factor */
	vis_write_gsr(3 << 3);

/* round the destination down to an 8-byte boundary */
	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
	off = (mlib_addr)dpz - (mlib_addr)z;
	pzend = pz + n - 1;
/*
 * generate edge mask for the start point
 */
	emask = vis_edge8(pz, pzend);

/*
 * Unaligned destination head: read both sources starting "off" bytes
 * before px/py so that the 8 result bytes line up with the aligned
 * destination word, then store only the bytes selected by emask.
 */

	if (off) {
		dpy = (mlib_d64 *)vis_alignaddr(py, off);
		dy0 = vis_ld_d64_nf(dpy);
		dy1 = vis_ld_d64_nf(dpy + 1);
		dy = vis_faligndata(dy0, dy1);
		dpx = (mlib_d64 *)vis_alignaddr(px, off);
		dx0 = vis_ld_d64_nf(dpx);
		dx1 = vis_ld_d64_nf(dpx + 1);
		dx = vis_faligndata(dx0, dx1);
		SUB_S8_SAT;

/*
 * store first bytes of result
 */

		vis_pst_8(dz, dpz, emask);

/* 8 + off elements were consumed by the head (off is negative) */
		px += (8 + off);
		py += (8 + off);
		len -= (8 + off);
		dpz++;

		if (len <= 0)
			return (MLIB_SUCCESS);
	}

	even_8 = len >> 3;
	rest_8 = len & 0x7;

/*
 * Now try to analyze source "x" and "y" addresses.
 *
 * Every branch below leaves dpx/dpy and dx1/dy1 in a state from which
 * the common tail code (after the branches) can build the final dx/dy
 * with one more load and one vis_faligndata per source.
 */

	if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned. No  vis_alignaddr
 * and  vis_faligndata at all.
 */

		dpx = (mlib_d64 *)px;
		dpy = (mlib_d64 *)py;

		dx = vis_ld_d64_nf(dpx);
		dpx++;
		dy = vis_ld_d64_nf(dpy);
		dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx);
			dy1 = vis_ld_d64_nf(dpy);
			SUB_S8_SAT;
			dx = dx1;
			dy = dy1;
/*
 * store 8 bytes of result
 */
			dpz[0] = dz;
			dpx++;
			dpy++;
			dpz++;
		}

/* hand the already loaded, not yet processed words to the tail code */
		dx1 = dx;
		dy1 = dy;
	} else if ((!((mlib_addr)px & 7))) {

/*
 * First ("x") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "y".
 */

		dpx = (mlib_d64 *)px;
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy);
		dpy++;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);
		dx = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			SUB_S8_SAT;
			dx = vis_ld_d64_nf(dpx);
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
			dpx++;
			dpy++;
		}

/* rewind one step so the tail code reloads *dpy and re-aligns */
		dx1 = dx;
		dy1 = dy0;
	} else if ((!((mlib_addr)py & 7))) {

/*
 * Second ("y") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "x".
 */

		dpy = (mlib_d64 *)py;
		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy = (*dpy++);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_SAT;
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
		}

/* load the next aligned "y" word for the tail code */
		dy1 = vis_ld_d64_nf(dpy);
		dpy++;
	} else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") addresses are identically aligned.
 * The alignment offset is the same for both, so it is set up once
 * before the loop and shared by the 2 vis_faligndata(s) in the loop.
 */

		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
		dpy = vis_alignaddr(py, 0);
		dy1 = vis_ld_d64_nf(dpy);
		dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;
			dy = vis_faligndata(dy0, dy1);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_SAT;
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
		}
	} else {

/*
 * Both ("x" and "y") addresses are arbitrarily aligned.
 * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop: the
 * alignment offset has to be switched between "y" and "x" every
 * iteration.
 */

		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx);
		dpx++;
		dx1 = vis_ld_d64_nf(dpx);
		dx = vis_faligndata(dx0, dx1);
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy);
		dpy++;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);

/* #pragma pipeloop(0) */
		for (i = 0; i < even_8; i++) {
			SUB_S8_SAT;
			vis_alignaddr(py, 0);
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
			vis_alignaddr(px, 0);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx + 1);
			dx = vis_faligndata(dx0, dx1);
/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
			dpy++;
			dpx++;
		}

/* rewind one step so the tail code reloads *dpx / *dpy and re-aligns */
		dx1 = dx0;
		dy1 = dy0;
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

/*
 * Common tail: build the last (partial) dx/dy.  When a source is
 * 8-byte aligned the alignment offset is 0 and vis_faligndata simply
 * yields its first operand.
 */
	vis_alignaddr(px, 0);
	dx0 = dx1;
	dx1 = vis_ld_d64_nf(dpx);
	dx = vis_faligndata(dx0, dx1);
	vis_alignaddr(py, 0);
	dy0 = dy1;
	dy1 = vis_ld_d64_nf(dpy);
	dy = vis_faligndata(dy0, dy1);
	SUB_S8_SAT;

/*
 * Prepare edge mask for the last bytes.  rest_8 (1..7) is passed as a
 * fake address, so the mask appears to select byte positions
 * rest_8..7 of a word; its complement, used for the store below,
 * then selects the first rest_8 bytes -- NOTE(review): relies on
 * vis_edge8 looking only at the low address bits here; confirm.
 */

	emask = vis_edge8((void *)(rest_8), pzend);

/* store last bytes of result */
	vis_pst_8(dz, dpz, ~emask);

	return (MLIB_SUCCESS);
}
/*
 * Table look-up of one row: every signed 16-bit source value is mapped
 * through four byte tables (table[0..3]) to four unsigned 8-bit outputs.
 *
 * The table pointers are biased by 32768 entries so that a signed source
 * value can be used directly as an index.
 *
 * Going by the order of the loads below, every 8-byte destination word is
 * laid out as
 *	tab3[s(k)], tab0..tab3[s(k+1)], tab0..tab2[s(k+2)]
 * i.e. dst starts at the 4th channel of the first source value.
 * NOTE(review): presumably the caller has already written the first three
 * bytes of that pixel and passes an 8-byte aligned dst (whole mlib_d64
 * stores are used) -- confirm against the caller, including the exact
 * meaning of xsize (src[0] is read unconditionally).
 */
void
mlib_v_ImageLookUpSI_S16_U8_4_DstOff3_D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{
/* pointer to source data */
	mlib_s16 *sp;

/* source data */
	mlib_s32 s0, s1, s2;

/* pointer to start of destination */
	mlib_u8 *dl;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2;

/* destination data */
	mlib_d64 t3, t4, t5;

/*
 * destination data; acc is deliberately used as a shift register and
 * is fully overwritten after 8 vis_faligndata steps
 */
	mlib_d64 t6, t7, acc;

/* loop variable */
	mlib_s32 i;
	const mlib_u8 *tab0 = &table[0][32768];
	const mlib_u8 *tab1 = &table[1][32768];
	const mlib_u8 *tab2 = &table[2][32768];
	const mlib_u8 *tab3 = &table[3][32768];

	sp = (void *)src;
	dl = dst;
	dp = (mlib_d64 *)dl;

/*
 * Alignment offset 7: each vis_faligndata(t, acc) below pushes the last
 * byte of t in front of the first 7 bytes of acc.  Bytes are therefore
 * pushed in reverse order (t7 first, t0 last).
 */
	vis_alignaddr((void *)0, 7);

	s0 = (*sp++);

	if (xsize >= 2) {

/* preload the first pair; the loop loads one iteration ahead */
		s1 = sp[0];
		s2 = sp[1];
		sp += 2;

#pragma pipeloop(0)
		for (i = 0; i <= xsize - 4; i += 2, sp += 2) {
			t7 = VIS_LD_U8_I(tab2, s2);
			t6 = VIS_LD_U8_I(tab1, s2);
			t5 = VIS_LD_U8_I(tab0, s2);
			t4 = VIS_LD_U8_I(tab3, s1);
			t3 = VIS_LD_U8_I(tab2, s1);
			t2 = VIS_LD_U8_I(tab1, s1);
			t1 = VIS_LD_U8_I(tab0, s1);
			t0 = VIS_LD_U8_I(tab3, s0);
			acc = vis_faligndata(t7, acc);
			acc = vis_faligndata(t6, acc);
			acc = vis_faligndata(t5, acc);
			acc = vis_faligndata(t4, acc);
			acc = vis_faligndata(t3, acc);
			acc = vis_faligndata(t2, acc);
			acc = vis_faligndata(t1, acc);
			acc = vis_faligndata(t0, acc);
/* s2 still owes its tab3 byte; it leads the next word */
			s0 = s2;
			s1 = sp[0];
			s2 = sp[1];
			(*dp++) = acc;
		}

/* pipeline epilogue: same body for the last preloaded pair */
		t7 = VIS_LD_U8_I(tab2, s2);
		t6 = VIS_LD_U8_I(tab1, s2);
		t5 = VIS_LD_U8_I(tab0, s2);
		t4 = VIS_LD_U8_I(tab3, s1);
		t3 = VIS_LD_U8_I(tab2, s1);
		t2 = VIS_LD_U8_I(tab1, s1);
		t1 = VIS_LD_U8_I(tab0, s1);
		t0 = VIS_LD_U8_I(tab3, s0);
		acc = vis_faligndata(t7, acc);
		acc = vis_faligndata(t6, acc);
		acc = vis_faligndata(t5, acc);
		acc = vis_faligndata(t4, acc);
		acc = vis_faligndata(t3, acc);
		acc = vis_faligndata(t2, acc);
		acc = vis_faligndata(t1, acc);
		acc = vis_faligndata(t0, acc);
		s0 = s2;
		(*dp++) = acc;
	}

	dl = (mlib_u8 *)dp;

/* odd xsize: one more source value, emitted as a 4-byte store */
	if ((xsize & 1) != 0) {
		s1 = sp[0];
		t7 = VIS_LD_U8_I(tab2, s1);
		t6 = VIS_LD_U8_I(tab1, s1);
		t5 = VIS_LD_U8_I(tab0, s1);
		t4 = VIS_LD_U8_I(tab3, s0);
		acc = vis_faligndata(t7, acc);
		acc = vis_faligndata(t6, acc);
		acc = vis_faligndata(t5, acc);
		acc = vis_faligndata(t4, acc);
		*(mlib_f32 *)dl = vis_read_hi(acc);
		dl += 4;
		s0 = s1;
	}

/* the last source value still owes its 4th-channel byte */
	dl[0] = tab3[s0];
}
/*
 * Table look-up of one row: every signed 16-bit source value is mapped
 * through two byte tables to two consecutive output bytes
 * (tab0[s], tab1[s]); xsize source values produce 2 * xsize bytes.
 *
 * The table pointers are biased by 32768 entries so that a signed source
 * value can be used directly as an index.  The destination is written in
 * whole mlib_d64 words (4 source values each); the remaining 1..3 values
 * are written with a masked partial store.
 * NOTE(review): whole-word stores through dp presumably require dst to be
 * 8-byte aligned (the "DstA8" in the name) -- confirm against the caller.
 */
void
mlib_v_ImageLookUpSI_S16_U8_2_DstA8D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{
/* pointer to source data */
	mlib_s16 *sp;

/* source data */
	mlib_s32 s0, s1, s2, s3;

/* pointer to start of destination (one mlib_u16 = one output pair) */
	mlib_u16 *dl;

/* pointer to end of destination */
	mlib_u16 *dend;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2;

/* destination data */
	mlib_d64 t3, t4, t5;

/*
 * destination data; acc is deliberately used as a shift register and
 * is fully overwritten after 8 vis_faligndata steps
 */
	mlib_d64 t6, t7, acc;

/* edge mask */
	mlib_s32 emask;

/* loop variable */
	mlib_s32 i, num;
	const mlib_u8 *tab0 = &table[0][32768];
	const mlib_u8 *tab1 = &table[1][32768];

	sp = (void *)src;
	dl = (mlib_u16 *)dst;
	dp = (mlib_d64 *)dl;
	dend = dl + xsize - 1;

/*
 * Alignment offset 7: each vis_faligndata(t, acc) below pushes the last
 * byte of t in front of the first 7 bytes of acc.  Bytes are therefore
 * pushed in reverse order (t7 first, t0 last).
 */
	vis_alignaddr((void *)0, 7);

	if (xsize >= 4) {

/* preload the first four values; the loop loads one iteration ahead */
		s0 = sp[0];
		s1 = sp[1];
		s2 = sp[2];
		s3 = sp[3];
		sp += 4;

#pragma pipeloop(0)
		for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
			t7 = VIS_LD_U8_I(tab1, s3);
			t6 = VIS_LD_U8_I(tab0, s3);
			t5 = VIS_LD_U8_I(tab1, s2);
			t4 = VIS_LD_U8_I(tab0, s2);
			t3 = VIS_LD_U8_I(tab1, s1);
			t2 = VIS_LD_U8_I(tab0, s1);
			t1 = VIS_LD_U8_I(tab1, s0);
			t0 = VIS_LD_U8_I(tab0, s0);
			acc = vis_faligndata(t7, acc);
			acc = vis_faligndata(t6, acc);
			acc = vis_faligndata(t5, acc);
			acc = vis_faligndata(t4, acc);
			acc = vis_faligndata(t3, acc);
			acc = vis_faligndata(t2, acc);
			acc = vis_faligndata(t1, acc);
			acc = vis_faligndata(t0, acc);
			s0 = sp[0];
			s1 = sp[1];
			s2 = sp[2];
			s3 = sp[3];
			(*dp++) = acc;
		}

/* pipeline epilogue: same body for the last preloaded four values */
		t7 = VIS_LD_U8_I(tab1, s3);
		t6 = VIS_LD_U8_I(tab0, s3);
		t5 = VIS_LD_U8_I(tab1, s2);
		t4 = VIS_LD_U8_I(tab0, s2);
		t3 = VIS_LD_U8_I(tab1, s1);
		t2 = VIS_LD_U8_I(tab0, s1);
		t1 = VIS_LD_U8_I(tab1, s0);
		t0 = VIS_LD_U8_I(tab0, s0);
		acc = vis_faligndata(t7, acc);
		acc = vis_faligndata(t6, acc);
		acc = vis_faligndata(t5, acc);
		acc = vis_faligndata(t4, acc);
		acc = vis_faligndata(t3, acc);
		acc = vis_faligndata(t2, acc);
		acc = vis_faligndata(t1, acc);
		acc = vis_faligndata(t0, acc);
		(*dp++) = acc;
	}

/* tail: 1..3 source values left */
	if ((mlib_addr)dp <= (mlib_addr)dend) {

/* num + 1 values remain; start at the last one and walk backwards */
		num = (mlib_u16 *)dend - (mlib_u16 *)dp;
		sp += num;

#pragma pipeloop(0)
		for (i = 0; i <= num; i++) {
			s0 = (mlib_s32)*sp;
			sp--;

/* push tab1 before tab0 so tab0[s] ends up first in memory */
			t0 = VIS_LD_U8_I(tab1, s0);
			acc = vis_faligndata(t0, acc);

			t0 = VIS_LD_U8_I(tab0, s0);
			acc = vis_faligndata(t0, acc);
		}

/* store only the 16-bit lanes that lie inside [dp, dend] */
		emask = vis_edge16(dp, dend);
		vis_pst_16(acc, dp, emask);
	}
}
/*
 * Table look-up of one row: every signed 16-bit source value is mapped
 * through three byte tables to three consecutive output bytes
 * (tab0[s], tab1[s], tab2[s]); xsize source values produce 3 * xsize
 * bytes.
 *
 * The table pointers are biased by 32768 entries so that a signed source
 * value can be used directly as an index.  Eight source values (24 output
 * bytes = three mlib_d64 words) are handled per vector step; the remaining
 * values are handled by the scalar loop at the end.
 * NOTE(review): whole-word stores through dp presumably require dst to be
 * 8-byte aligned -- confirm against the caller.
 */
void
mlib_v_ImageLookUpSI_S16_U8_3_D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{
/* pointer to source data */
	mlib_s16 *sp;

/* pointer to start of destination */
	mlib_u8 *dl;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2;

/* destination data */
	mlib_d64 t3, t4, t5;

/* destination data */
	mlib_d64 t6, t7;

/*
 * destination data; each accumulator is deliberately used as a shift
 * register and is fully overwritten after 8 vis_faligndata steps
 */
	mlib_d64 acc0, acc1, acc2;

/* loop variable */
	mlib_s32 i;
	const mlib_u8 *tab0 = &table[0][32768];
	const mlib_u8 *tab1 = &table[1][32768];
	const mlib_u8 *tab2 = &table[2][32768];

/* source data: eight consecutive values s00..s03, s10..s13 */
	mlib_s32 s00, s01, s02, s03;
	mlib_s32 s10, s11, s12, s13;

	sp = (void *)src;
	dl = dst;
	dp = (mlib_d64 *)dl;

/*
 * Alignment offset 7: each vis_faligndata(t, acc) below pushes the last
 * byte of t in front of the first 7 bytes of acc.  Bytes are therefore
 * pushed in reverse order (t7 first, t0 last).
 */
	vis_alignaddr((void *)0, 7);

	i = 0;

	if (xsize >= 8) {

/* preload the first eight values; the loop loads one iteration ahead */
		s00 = sp[0];
		s01 = sp[1];
		s02 = sp[2];
		s03 = sp[3];
		s10 = sp[4];
		s11 = sp[5];
		s12 = sp[6];
		s13 = sp[7];
		sp += 8;

#pragma pipeloop(0)
		for (i = 0; i <= xsize - 16; i += 8, sp += 8) {
/* word 0: s00 (3 bytes), s01 (3 bytes), s02 (tab0, tab1) */
			t7 = VIS_LD_U8_I(tab1, s02);
			t6 = VIS_LD_U8_I(tab0, s02);
			t5 = VIS_LD_U8_I(tab2, s01);
			t4 = VIS_LD_U8_I(tab1, s01);
			t3 = VIS_LD_U8_I(tab0, s01);
			t2 = VIS_LD_U8_I(tab2, s00);
			t1 = VIS_LD_U8_I(tab1, s00);
			t0 = VIS_LD_U8_I(tab0, s00);
			acc0 = vis_faligndata(t7, acc0);
			acc0 = vis_faligndata(t6, acc0);
			acc0 = vis_faligndata(t5, acc0);
			acc0 = vis_faligndata(t4, acc0);
			acc0 = vis_faligndata(t3, acc0);
			acc0 = vis_faligndata(t2, acc0);
			acc0 = vis_faligndata(t1, acc0);
			acc0 = vis_faligndata(t0, acc0);
/* word 1: s02 (tab2), s03 (3 bytes), s10 (3 bytes), s11 (tab0) */
			t7 = VIS_LD_U8_I(tab0, s11);
			t6 = VIS_LD_U8_I(tab2, s10);
			t5 = VIS_LD_U8_I(tab1, s10);
			t4 = VIS_LD_U8_I(tab0, s10);
			t3 = VIS_LD_U8_I(tab2, s03);
			t2 = VIS_LD_U8_I(tab1, s03);
			t1 = VIS_LD_U8_I(tab0, s03);
			t0 = VIS_LD_U8_I(tab2, s02);
			acc1 = vis_faligndata(t7, acc1);
			acc1 = vis_faligndata(t6, acc1);
			acc1 = vis_faligndata(t5, acc1);
			acc1 = vis_faligndata(t4, acc1);
			acc1 = vis_faligndata(t3, acc1);
			acc1 = vis_faligndata(t2, acc1);
			acc1 = vis_faligndata(t1, acc1);
			acc1 = vis_faligndata(t0, acc1);
/* word 2: s11 (tab1, tab2), s12 (3 bytes), s13 (3 bytes) */
			t7 = VIS_LD_U8_I(tab2, s13);
			t6 = VIS_LD_U8_I(tab1, s13);
			t5 = VIS_LD_U8_I(tab0, s13);
			t4 = VIS_LD_U8_I(tab2, s12);
			t3 = VIS_LD_U8_I(tab1, s12);
			t2 = VIS_LD_U8_I(tab0, s12);
			t1 = VIS_LD_U8_I(tab2, s11);
			t0 = VIS_LD_U8_I(tab1, s11);
			acc2 = vis_faligndata(t7, acc2);
			acc2 = vis_faligndata(t6, acc2);
			acc2 = vis_faligndata(t5, acc2);
			acc2 = vis_faligndata(t4, acc2);
			acc2 = vis_faligndata(t3, acc2);
			acc2 = vis_faligndata(t2, acc2);
			acc2 = vis_faligndata(t1, acc2);
			acc2 = vis_faligndata(t0, acc2);
			s00 = sp[0];
			s01 = sp[1];
			s02 = sp[2];
			s03 = sp[3];
			s10 = sp[4];
			s11 = sp[5];
			s12 = sp[6];
			s13 = sp[7];
			(*dp++) = acc0;
			(*dp++) = acc1;
			(*dp++) = acc2;
		}

/* pipeline epilogue: same body for the last preloaded eight values */
		t7 = VIS_LD_U8_I(tab1, s02);
		t6 = VIS_LD_U8_I(tab0, s02);
		t5 = VIS_LD_U8_I(tab2, s01);
		t4 = VIS_LD_U8_I(tab1, s01);
		t3 = VIS_LD_U8_I(tab0, s01);
		t2 = VIS_LD_U8_I(tab2, s00);
		t1 = VIS_LD_U8_I(tab1, s00);
		t0 = VIS_LD_U8_I(tab0, s00);
		acc0 = vis_faligndata(t7, acc0);
		acc0 = vis_faligndata(t6, acc0);
		acc0 = vis_faligndata(t5, acc0);
		acc0 = vis_faligndata(t4, acc0);
		acc0 = vis_faligndata(t3, acc0);
		acc0 = vis_faligndata(t2, acc0);
		acc0 = vis_faligndata(t1, acc0);
		acc0 = vis_faligndata(t0, acc0);
		t7 = VIS_LD_U8_I(tab0, s11);
		t6 = VIS_LD_U8_I(tab2, s10);
		t5 = VIS_LD_U8_I(tab1, s10);
		t4 = VIS_LD_U8_I(tab0, s10);
		t3 = VIS_LD_U8_I(tab2, s03);
		t2 = VIS_LD_U8_I(tab1, s03);
		t1 = VIS_LD_U8_I(tab0, s03);
		t0 = VIS_LD_U8_I(tab2, s02);
		acc1 = vis_faligndata(t7, acc1);
		acc1 = vis_faligndata(t6, acc1);
		acc1 = vis_faligndata(t5, acc1);
		acc1 = vis_faligndata(t4, acc1);
		acc1 = vis_faligndata(t3, acc1);
		acc1 = vis_faligndata(t2, acc1);
		acc1 = vis_faligndata(t1, acc1);
		acc1 = vis_faligndata(t0, acc1);
		t7 = VIS_LD_U8_I(tab2, s13);
		t6 = VIS_LD_U8_I(tab1, s13);
		t5 = VIS_LD_U8_I(tab0, s13);
		t4 = VIS_LD_U8_I(tab2, s12);
		t3 = VIS_LD_U8_I(tab1, s12);
		t2 = VIS_LD_U8_I(tab0, s12);
		t1 = VIS_LD_U8_I(tab2, s11);
		t0 = VIS_LD_U8_I(tab1, s11);
		acc2 = vis_faligndata(t7, acc2);
		acc2 = vis_faligndata(t6, acc2);
		acc2 = vis_faligndata(t5, acc2);
		acc2 = vis_faligndata(t4, acc2);
		acc2 = vis_faligndata(t3, acc2);
		acc2 = vis_faligndata(t2, acc2);
		acc2 = vis_faligndata(t1, acc2);
		acc2 = vis_faligndata(t0, acc2);
		(*dp++) = acc0;
		(*dp++) = acc1;
		(*dp++) = acc2;
		i += 8;
	}

	dl = (mlib_u8 *)dp;

/* scalar tail: fewer than 8 source values left */
#pragma pipeloop(0)
	for (; i < xsize; i++) {
		s00 = sp[0];
		dl[0] = tab0[s00];
		dl[1] = tab1[s00];
		dl[2] = tab2[s00];
		dl += 3;
		sp++;
	}
}
/*
 * Table look-up of one row of xsize signed 16-bit values into xsize
 * unsigned 8-bit values.  The table used for an element depends on its
 * position: element k is looked up in table(k mod 4), i.e. table0, table1,
 * table2, table3, table0, ...
 *
 * NOTE(review): the tables are indexed directly with the (possibly
 * negative) source value, so the caller presumably passes pointers that
 * are already biased to the middle of each table -- confirm.  The name
 * suggests this serves 1-, 2- and 4-channel images by passing the same
 * table several times -- confirm against the caller.  Whole-word stores
 * through dp presumably require dst to be 8-byte aligned.
 */
void mlib_v_ImageLookUp_S16_U8_124_D1(const mlib_s16 *src,
                                      mlib_u8        *dst,
                                      mlib_s32       xsize,
                                      const mlib_u8  *table0,
                                      const mlib_u8  *table1,
                                      const mlib_u8  *table2,
                                      const mlib_u8  *table3)
{
  mlib_s16 *sp;                        /* pointer to source data */
  mlib_s32 s0, s1, s2, s3;             /* source data */
  mlib_s32 s4, s5, s6, s7;             /* source data */
  mlib_u8 *dl;                         /* pointer to start of destination */
  mlib_u8 *dend;                       /* pointer to end of destination */
  mlib_d64 *dp;                        /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;                 /* destination data */
  mlib_d64 t3, t4, t5;                 /* destination data */
  mlib_d64 t6, t7, acc;                /* destination data (acc: shift register) */
  mlib_s32 emask;                      /* edge mask */
  mlib_s32 i, num;                     /* loop variable */

  dl = dst;
  dp = (mlib_d64 *) dl;
  dend = dl + xsize - 1;
  sp = (void *)src;

  /*
   * Alignment offset 7: each vis_faligndata(t, acc) below pushes the last
   * byte of t in front of the first 7 bytes of acc.  Bytes are therefore
   * pushed in reverse order (t7 first, t0 last).
   */
  vis_alignaddr((void *)0, 7);

  if (xsize >= 8) {

    /* preload the first eight values; the loop loads one iteration ahead */
    s0 = sp[0];
    s1 = sp[1];
    s2 = sp[2];
    s3 = sp[3];
    s4 = sp[4];
    s5 = sp[5];
    s6 = sp[6];
    s7 = sp[7];
    sp += 8;

#pragma pipeloop(0)
    for (i = 0; i <= xsize - 16; i += 8, sp += 8) {
      t7 = VIS_LD_U8_I(table3, s7);
      t6 = VIS_LD_U8_I(table2, s6);
      t5 = VIS_LD_U8_I(table1, s5);
      t4 = VIS_LD_U8_I(table0, s4);
      t3 = VIS_LD_U8_I(table3, s3);
      t2 = VIS_LD_U8_I(table2, s2);
      t1 = VIS_LD_U8_I(table1, s1);
      t0 = VIS_LD_U8_I(table0, s0);
      acc = vis_faligndata(t7, acc);
      acc = vis_faligndata(t6, acc);
      acc = vis_faligndata(t5, acc);
      acc = vis_faligndata(t4, acc);
      acc = vis_faligndata(t3, acc);
      acc = vis_faligndata(t2, acc);
      acc = vis_faligndata(t1, acc);
      acc = vis_faligndata(t0, acc);
      s0 = sp[0];
      s1 = sp[1];
      s2 = sp[2];
      s3 = sp[3];
      s4 = sp[4];
      s5 = sp[5];
      s6 = sp[6];
      s7 = sp[7];
      *dp++ = acc;
    }

    /* pipeline epilogue: same body for the last preloaded eight values */
    t7 = VIS_LD_U8_I(table3, s7);
    t6 = VIS_LD_U8_I(table2, s6);
    t5 = VIS_LD_U8_I(table1, s5);
    t4 = VIS_LD_U8_I(table0, s4);
    t3 = VIS_LD_U8_I(table3, s3);
    t2 = VIS_LD_U8_I(table2, s2);
    t1 = VIS_LD_U8_I(table1, s1);
    t0 = VIS_LD_U8_I(table0, s0);
    acc = vis_faligndata(t7, acc);
    acc = vis_faligndata(t6, acc);
    acc = vis_faligndata(t5, acc);
    acc = vis_faligndata(t4, acc);
    acc = vis_faligndata(t3, acc);
    acc = vis_faligndata(t2, acc);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    *dp++ = acc;
  }

  /* tail: 1..7 values left, assembled back to front into one word */
  if ((mlib_addr) dp <= (mlib_addr) dend) {

    /* point sp at the last source value; num becomes the remaining count */
    num = (mlib_addr) dend - (mlib_addr) dp;
    sp += num;
    num++;

    /*
     * First consume the trailing (num mod 4) values so that what is left
     * is either nothing or one complete table0..table3 group.
     */
    if ((num & 3) == 1) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table0, s0);
      acc = vis_faligndata(t0, acc);
      num--;
    }
    else if ((num & 3) == 2) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table1, s0);
      acc = vis_faligndata(t0, acc);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table0, s0);
      acc = vis_faligndata(t0, acc);
      num -= 2;
    }
    else if ((num & 3) == 3) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table2, s0);
      acc = vis_faligndata(t0, acc);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table1, s0);
      acc = vis_faligndata(t0, acc);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table0, s0);
      acc = vis_faligndata(t0, acc);
      num -= 3;
    }

    /* one full group of four values precedes the part handled above */
    if (num != 0) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table3, s0);
      acc = vis_faligndata(t0, acc);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table2, s0);
      acc = vis_faligndata(t0, acc);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table1, s0);
      acc = vis_faligndata(t0, acc);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(table0, s0);
      acc = vis_faligndata(t0, acc);
    }

    /* store only the bytes that lie inside [dp, dend] */
    emask = vis_edge8(dp, dend);
    vis_pst_8(acc, dp, emask);
  }
}
/* Example no. 16 */
/* 0 */
/*
 * Colour conversion of a planar Y/U/V image to packed 4-byte ABGR pixels
 * for arbitrarily aligned buffers.
 *
 *	abgr		destination, 4 bytes per pixel; the first byte of
 *			every pixel (alpha) is never written (store mask
 *			0x7777)
 *	y, u, v		source planes; every u/v byte is duplicated
 *			horizontally (vis_fpmerge(f, f)), so one chroma
 *			sample serves two luma samples
 *	width, height	image size in pixels / rows
 *	abgr_stride,
 *	y_stride,
 *	uv_stride	per-row byte increments of the respective buffers
 *
 * The conversion uses the fixed-point constants declared below; per
 * their comments they are the coefficients 1.1644, 1.5966, -0.3920,
 * -0.8132 and 2.0184 scaled by 8192.
 *
 * Always returns MLIB_SUCCESS.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR422_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* pointers to src address */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* pointers to src address */
	mlib_u8 *sp1, *sl1;

/* pointers to dst address */
	mlib_u8 *dp, *dl, *dend;

/* aligned pointer to y */
	mlib_d64 *spy;

/* aligned pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;

/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);

/* constant offsets added to / subtracted from the G, B and R sums below */
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
		temp_b_lo;
	mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo;
	mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo;

/*
 * dd0/dd1 hold the previous and current 8 output bytes; each store
 * writes vis_faligndata() of the two to cope with the unaligned dst.
 * NOTE(review): dd0 is read by the first vis_faligndata of a row
 * before it has been assigned; the affected bytes appear to be either
 * masked out by emask1 or overwritten by the next store to the same
 * word (inc == 0) -- confirm.
 */
	mlib_d64 dd, dd0, dd1;

/* loop variable */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_s32 off;
	mlib_f32 *dfu, *dfv;
	mlib_d64 du0, du1, dv0, dv1;
	mlib_s32 off2, off3;
	mlib_s32 inc;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;

	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/*
 * u/v are read as aligned 4-byte words; the byte offsets are doubled
 * because every chroma byte is duplicated before alignment
 */
		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;

		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);
/* i = misalignment of dst in bytes; rotate the alpha mask to match */
		i = dp - (mlib_u8 *)dpp;
		emask >>= i;
/* inc == 0: dst is aligned, so the first two stores hit the same word */
		inc = (emask1 != 0xff);
		emask1 &= emask;
		off = 8 - i;

/* first 8 (duplicated) u bytes of the row */
		vis_alignaddr((void *)off2, 0);
		fu0 = vis_ld_f32_nf(dfu); dfu++;
		fu1 = vis_ld_f32_nf(dfu); dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* first 8 (duplicated) v bytes of the row */
		vis_alignaddr((void *)off3, 0);
		fv0 = vis_ld_f32_nf(dfv); dfv++;
		fv1 = vis_ld_f32_nf(dfv); dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* first 8 y bytes of the row */
		vis_alignaddr(sp1, 0);
		dy0 = vis_ld_d64_nf(spy); spy++;
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/*
 * 8-pixel column loop: each iteration converts 8 pixels and issues
 * four 8-byte partial stores.  The loads and the U/V products for the
 * next iteration are interleaved with the stores of the current one,
 * and the GSR alignment is switched between off2, off3, off and sp1.
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

/* next u bytes */
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu); dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);
			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

/* next v bytes */
			vis_alignaddr((void *)off3, 0);

			fv1 = vis_ld_f32_nf(dfv); dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* switch to the dst alignment for the four stores below */
			vis_alignaddr((void *)off, 0);
/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_hi),
				vis_read_lo(blue_red_hi));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_lo),
				vis_read_hi(blue_red_lo));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_lo),
				vis_read_lo(blue_red_lo));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

/* next y bytes */
			vis_alignaddr(sp1, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;
			emask1 = emask;
		}

/*
 * Row tail: fewer than 8 pixels left.  The same arithmetic is done
 * once more and the result is stored two pixels (8 bytes) at a time
 * while pixels remain.
 */
		if (i < width) {

			vis_alignaddr((void *)off, 0);
/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);

			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);

			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

			blue_lo = vis_fpack16(temp_b_lo);

			x_green_hi = vis_fmul8x16au(green_hi, k5);

			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);

			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
				vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dd0 = dd1;
			dpp += inc;

			i += 2;

			if (i < width) {

				dd1 = vis_fpmerge(vis_read_lo(x_green_hi),
					vis_read_lo(blue_red_hi));
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 2;

				if (i < width) {
					dd1 = vis_fpmerge(vis_read_hi
						(x_green_lo),
						vis_read_hi(blue_red_lo));
					dd = vis_faligndata(dd0, dd1);
					vis_pst_8(dd, dpp++, emask);
					dd0 = dd1;
				}
			}
		}

/*
 * Flush: store whatever part of the last produced bytes still lies
 * inside [dpp, dend], again without touching the alpha bytes.
 */
		vis_alignaddr((void *)off, 0);
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

/* advance all planes to the next row */
		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;

		dl = dp = dl + abgr_stride;
/* the mask was shifted for this row's alignment; start afresh */
		emask = 0x7777;
	}

	return (MLIB_SUCCESS);
}
/* Example no. 17 */
/* 0 */
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp,
				    emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd20 = vis_ld_d64_nf(spp2);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
/* Example no. 18 */
/* 0 */
/*
 * Updates a 16x16 block of unsigned 8-bit pixels in place from a reference
 * block, combining for every pixel the current value, the reference pixel
 * and its right-hand neighbour (reference + 1 byte).  The combination
 * itself is done by the MLIB_V_VIDEOINTERPAVG / MLIB_V_VIDEOINTERPAVG0
 * macros, which are defined outside this block; going by the function name
 * they presumably average the current block with the horizontally
 * half-pel interpolated reference -- TODO confirm against the macros.
 *
 *	curr_block	block to update; read and written as whole
 *			mlib_d64 words, so it must be 8-byte aligned
 *	ref_block	reference block, arbitrary alignment
 *	frame_stride	not used by this implementation
 *	field_stride	byte distance between consecutive rows of both
 *			blocks
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoInterpAveX_U8_U8_16x16(
    mlib_u8 *curr_block,
    const mlib_u8 *ref_block,
    mlib_s32 frame_stride,
    mlib_s32 field_stride)
{
    mlib_d64 s0, s1, s2, s3, s4, s5, s6;
    mlib_d64 sd0, sd1, sd2, sd3, d0, d1, d2, d3;
    mlib_d64 *sd, *dd;

    /*
     * NOTE(review): dzero, fm2, fzero and rounder are only named in the
     * set-up below; presumably the MLIB_V_VIDEOINTERPAVG* macros use
     * them -- confirm against the macro definitions.
     */
    mlib_d64 dzero = vis_fzero();
    const mlib_f32 fm2 = vis_to_float(0x1000200);
    mlib_f32 fzero = vis_read_hi(dzero);
    mlib_d64 rounder = vis_fpsub16(dzero, vis_fone());
    mlib_s32 y;

    rounder = vis_fpadd16(vis_fpadd16(rounder, rounder), rounder);

    /*
     * GSR: scale factor 5, alignment offset = low three address bits of
     * ref_block.  The pointer is converted through mlib_addr (as done for
     * sd below) rather than cast straight to a 32-bit integer, which
     * would truncate the pointer on LP64 targets; only the low three
     * bits are used, so the value written is unchanged.
     */
    vis_write_gsr((5 << 3) + (mlib_u32)((mlib_addr)ref_block & 7));
    dd = (mlib_d64 *)curr_block;
    sd = (mlib_d64 *)((mlib_addr)ref_block & ~7);

    /* 8 iterations, two rows per iteration */
    y = 8;

    if ((mlib_addr)(ref_block + 1) & 7) {
        /*
         * General case: ref_block + 1 is not 8-byte aligned, so the
         * "+1" data can be extracted from the same three aligned words
         * with the alignment offset bumped by one.
         */
        do {
            /* row 0: reference bytes at ref_block (current GSR offset) */
            s0 = sd[0];
            s1 = sd[1];
            s2 = sd[2];
            sd0 = vis_faligndata(s0, s1);
            sd1 = vis_faligndata(s1, s2);
            sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
            /* row 1 */
            s4 = sd[0];
            s5 = sd[1];
            s6 = sd[2];
            sd2 = vis_faligndata(s4, s5);
            sd3 = vis_faligndata(s5, s6);
            /* switch the alignment offset to ref_block + 1 */
            vis_alignaddr((void *)(ref_block + 1), 0);
            sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
            d0 = dd[0];
            d1 = dd[1];
            d2 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[0];
            d3 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[1];
            /* same rows, shifted right by one byte */
            s0 = vis_faligndata(s0, s1);
            s1 = vis_faligndata(s1, s2);
            s2 = vis_faligndata(s4, s5);
            s3 = vis_faligndata(s5, s6);

            MLIB_V_VIDEOINTERPAVG(d0, sd0, s0);
            MLIB_V_VIDEOINTERPAVG(d1, sd1, s1);
            MLIB_V_VIDEOINTERPAVG(d2, sd2, s2);
            MLIB_V_VIDEOINTERPAVG(d3, sd3, s3);

            dd[0] = d0;
            dd[1] = d1;
            dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
            dd[0] = d2;
            dd[1] = d3;
            dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
            /* restore the alignment offset of ref_block for the next pass */
            vis_alignaddr((void *)ref_block, 0);
        } while (--y);
    } else {
        /*
         * ref_block + 1 is 8-byte aligned: the "+1" data are exactly
         * the next aligned words (s1, s2 / s5, s6).  vis_faligndata
         * with offset 0 would return its first operand, hence the
         * separate branch.
         */
        do {
            s0 = sd[0];
            s1 = sd[1];
            s2 = sd[2];
            sd0 = vis_faligndata(s0, s1);
            sd1 = vis_faligndata(s1, s2);
            sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
            s4 = sd[0];
            s5 = sd[1];
            s6 = sd[2];
            sd2 = vis_faligndata(s4, s5);
            sd3 = vis_faligndata(s5, s6);
            sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
            d0 = dd[0];
            d1 = dd[1];
            d2 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[0];
            d3 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[1];

            /*
             * NOTE(review): only the first word uses the "...AVG0"
             * variant; whether that is intentional cannot be told
             * without the macro definitions -- left as is.
             */
            MLIB_V_VIDEOINTERPAVG0(d0, sd0, s1);
            MLIB_V_VIDEOINTERPAVG(d1, sd1, s2);
            MLIB_V_VIDEOINTERPAVG(d2, sd2, s5);
            MLIB_V_VIDEOINTERPAVG(d3, sd3, s6);

            dd[0] = d0;
            dd[1] = d1;
            dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
            dd[0] = d2;
            dd[1] = d3;
            dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
        } while (--y);
    }
    return (MLIB_SUCCESS);
}
/* Example no. 19 */
/* 0 */
/*
 * Adds two images of mlib_s16 samples, row by row, and stores the sums
 * into dst.  The arithmetic is done on one mlib_d64 (four 16-bit
 * samples) at a time by the MLIB_V_ADDIMAGE_S16 macro, which is defined
 * outside this function.
 *
 * NOTE(review): VALIDATE(mlib_s16) is also defined elsewhere; it
 * presumably checks the arguments and declares/initializes dp, sp1, sp2,
 * dl, sl1, sl2, width, height, channels, strided, stride1 and stride2,
 * all of which are used below without a visible declaration -- confirm.
 *
 * One of five row loops is selected from the 8-byte alignment offsets of
 * the three data pointers and from the row strides:
 *   1) dst, src1 and src2 are all equally aligned;
 *   2) only dst and src1 are equally aligned (src2 is realigned);
 *   3) only dst and src2 are equally aligned (src1 is realigned);
 *   4) only src1 and src2 are equally aligned (the sum is realigned);
 *   5) common case: src1 is first realigned into dst, then src2 is added.
 *
 * In every row loop a partial first or last 8-byte destination word is
 * written with vis_pst_16() under an edge mask instead of a full store.
 *
 * Returns MLIB_SUCCESS once all rows are processed.
 */
mlib_status
mlib_v_ImageAdd_S16(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
	mlib_s32 i, j, k;
	mlib_s32 offdst, offsrc1, offsrc2, emask;
	mlib_s32 amount;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
	mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
	mlib_s16 *dend;

	VALIDATE(mlib_s16);

/* remember the start of the current row of each image */
	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

/* number of 16-bit samples in one row */
	amount = width * channels;

/* byte offsets of the three pointers inside an 8-byte word */
	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

/*
 * Case 1: all pointers have the same offset and the strides differ by a
 * multiple of 4, so both sources can be read with plain aligned loads.
 */
	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 3) == 0) &&
	    (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/* i <= 0: sample index of the aligned word relative to dp */
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* partial first word: store only the samples selected by emask */
			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}
/* full 8-byte words */
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
			}

/* partial last word */
			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

/* advance all three images to the next row */
			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * Case 2: dst and src1 are equally aligned; src2 is read through
 * vis_faligndata() with non-faulting loads.
 */
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
/* the last vis_alignaddr() call leaves the GSR offset set for src2 */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd20 = vis_ld_d64_nf(spp2);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd20 = sd21;
				spp2++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * Case 3: mirror image of case 2 -- dst and src2 are equally aligned;
 * src1 is read through vis_faligndata().
 */
	} else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
/* the last vis_alignaddr() call leaves the GSR offset set for src1 */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd10 = vis_ld_d64_nf(spp1);

			if (emask != 0xf) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd10 = sd11;
				spp1++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd10 = sd11;
				spp1++;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd20 = (*spp2++);
				sd10 = vis_faligndata(sd10,
					vis_ld_d64_nf(spp1 + 1));
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * Case 4: src1 and src2 are equally aligned with each other but not
 * with dst.  Whole source words are added first and the sums (dd0, dd1)
 * are then shifted into destination alignment with vis_faligndata().
 */
	} else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* prime the pipeline: dd0 always holds the previous word of sums */
			sd10 = vis_ld_d64_nf(spp1); spp1++;
			sd20 = vis_ld_d64_nf(spp2); spp2++;
			MLIB_V_ADDIMAGE_S16(sd10, sd20, dd0);

			if (emask != 0xf) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp++, emask);
				dd0 = dd1;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
				(*dpp++) = vis_faligndata(dd0, dd1);
				dd0 = dd1;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else {
/* common case */

/*
 * Both sources need their own alignment offset, but only one offset can
 * be active at a time.  Each row is therefore done in two passes: src1
 * is realigned into dst, then dst is read back and src2 is added.
 */
		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* partial first word: realign each source separately, then add */
			if (emask != 0xf) {
				spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
				sd10 = vis_faligndata(vis_ld_d64_nf(spp1),
					vis_ld_d64_nf(spp1 + 1));
				spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
				sd20 = vis_faligndata(vis_ld_d64_nf(spp2),
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}

/* copy src1 to dst */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			sd11 = vis_ld_d64_nf(spp1);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (k = i; k <= (amount - 4); k += 4) {
				sd10 = sd11;
				sd11 = vis_ld_d64_nf(spp1 + 1);
				(*tmp_ptr++) = vis_faligndata(sd10, sd11);
				spp1++;
			}

/* keep the realigned src1 tail word in sd11 for the last partial store */
			sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1));

/* second pass: add realigned src2 to the src1 copy already in dst */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
			sd20 = vis_ld_d64_nf(spp2);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*tmp_ptr++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_S16(sd11, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	}

	return (MLIB_SUCCESS);
}
Ejemplo n.º 20
0
mlib_status
__mlib_VideoInterpAveX_U8_U8(
    mlib_u8 *curr_block,
    const mlib_u8 *ref_block,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 frame_stride,
    mlib_s32 field_stride)
{
    mlib_d64 s0, s1, s2, s3, s4, s5, s6, s7;
    mlib_d64 sd0, sd1, sd2, sd3, d0, d1, d2, d3;
    mlib_d64 *sd, *dd;
    mlib_d64 dzero = vis_fzero();
    const mlib_f32 fm2 = vis_to_float(0x1000200);
    mlib_f32 fzero = vis_read_hi(dzero);
    mlib_d64 rounder = vis_fpsub16(dzero, vis_fone());
    mlib_s32 y;

    rounder = vis_fpadd16(vis_fpadd16(rounder, rounder), rounder);
    vis_write_gsr((5 << 3) + ((mlib_u32)ref_block & 7));
    dd = (mlib_d64 *)curr_block;
    sd = (mlib_d64 *)((mlib_addr)ref_block & ~7);

    if (width == 8) {
        y = height >> 2;

        if (((mlib_s32)(ref_block + 1) & 7)) {
            do {
                s0 = sd[0];
                s1 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd0 = vis_faligndata(s0, s1);
                s2 = sd[0];
                s3 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd1 = vis_faligndata(s2, s3);
                s4 = sd[0];
                s5 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd2 = vis_faligndata(s4, s5);
                s6 = sd[0];
                s7 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd3 = vis_faligndata(s6, s7);
                vis_alignaddr((void *)(ref_block + 1), 0);
                d0 = *dd;
                d1 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   field_stride);
                d2 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   2 * field_stride);
                d3 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   3 * field_stride);
                s0 = vis_faligndata(s0, s1);
                s1 = vis_faligndata(s2, s3);
                s2 = vis_faligndata(s4, s5);
                s3 = vis_faligndata(s6, s7);

                MLIB_V_VIDEOINTERPAVG(d0, sd0, s0);
                MLIB_V_VIDEOINTERPAVG(d1, sd1, s1);
                MLIB_V_VIDEOINTERPAVG(d2, sd2, s2);
                MLIB_V_VIDEOINTERPAVG(d3, sd3, s3);

                *dd = d0;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d1;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d2;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d3;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);

                vis_alignaddr((void *)ref_block, 0);
            } while (--y);
        } else {
            do {
                s0 = sd[0];
                s1 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd0 = vis_faligndata(s0, s1);
                s2 = sd[0];
                s3 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd1 = vis_faligndata(s2, s3);
                s4 = sd[0];
                s5 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd2 = vis_faligndata(s4, s5);
                s6 = sd[0];
                s7 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                sd3 = vis_faligndata(s6, s7);
                d0 = *dd;
                d1 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   field_stride);
                d2 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   2 * field_stride);
                d3 = *(mlib_d64 *)((mlib_u8 *)dd +
                                   3 * field_stride);

                MLIB_V_VIDEOINTERPAVG0(d0, sd0, s1);
                MLIB_V_VIDEOINTERPAVG(d1, sd1, s3);
                MLIB_V_VIDEOINTERPAVG(d2, sd2, s5);
                MLIB_V_VIDEOINTERPAVG(d3, sd3, s7);

                *dd = d0;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d1;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d2;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d3;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);

            } while (--y);
        }
    } else {
Ejemplo n.º 21
0
/*
 * Converts the n mlib_s32 elements of x into mlib_s16 elements of z,
 * clamping every value to [MLIB_S16_MIN, MLIB_S16_MAX].
 *
 * Leading elements are converted one by one until the destination is
 * 8-byte aligned; the bulk is then converted four elements (one
 * destination mlib_d64) at a time with vis_fpackfix(); the remaining
 * 0..3 elements are converted one by one again.
 *
 * NOTE(review): PACK_S_S is defined outside this function; it is
 * presumably the scalar path for short vectors and returns from the
 * function, and it presumably relies on the local names declared
 * here -- confirm before renaming anything.
 *
 * Returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VectorConvert_S16_S32_Sat(
	mlib_s16 *z,
	const mlib_s32 *x,
	mlib_s32 n)
{
	mlib_s32 *src = (void *)x;
	mlib_s16 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7, d8;
	mlib_s32 c;
	mlib_s32 len_64, even_length, rest_64, length = n, i;

	if (n < 16) {
		PACK_S_S(mlib_s32, mlib_s16, MLIB_S16_MAX, MLIB_S16_MIN);
	}

/*
 * First try to align destination address for 8 bytes.
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = *src) > MLIB_S16_MAX ? MLIB_S16_MAX
			: (c < MLIB_S16_MIN ? MLIB_S16_MIN : c);
		src++;
		length--;
	}

/* GSR scale factor 16 for vis_fpackfix(); the align offset field is 0 */
	vis_write_gsr(16 << 3);

/* length = 4 * len_64 + rest_64; even_length is the vectorized part */
	rest_64 = length & 3;
	len_64 = length >> 2;
	even_length = len_64 << 2;
	ddst = (mlib_d64 *)dst;

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */

/* i becomes 1 when len_64 is odd, so the loop below always runs in pairs */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d1),
				vis_fpackfix(d2));
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = (*dsrc++);
			d4 = (*dsrc++);
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d1),
				vis_fpackfix(d2));
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d3),
				vis_fpackfix(d4));
		}
	} else {

/*
 * Source address is arbitrary aligned. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
/* d4 always carries the last loaded source word into the next step */
		d4 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		if (i = (len_64 & 1)) {
			d1 = d4;
			d2 = (*dsrc++);
/* non-faulting load: this word may extend past the end of the source */
			d4 = vis_ld_d64_nf(dsrc); dsrc++;
			d5 = vis_faligndata(d1, d2);
			d6 = vis_faligndata(d2, d4);
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d5),
				vis_fpackfix(d6));
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d0 = d4;
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = (*dsrc++);
			d4 = vis_ld_d64_nf(dsrc); dsrc++;
			d5 = vis_faligndata(d0, d1);
			d6 = vis_faligndata(d1, d2);
			d7 = vis_faligndata(d2, d3);
			d8 = vis_faligndata(d3, d4);
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d5),
				vis_fpackfix(d6));
			(*ddst++) =
				vis_freg_pair(vis_fpackfix(d7),
				vis_fpackfix(d8));
		}
	}

/* scalar tail; src and dst still point at the start of the aligned part */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];
		dst[even_length + i] = c > MLIB_S16_MAX ? MLIB_S16_MAX
			: (c < MLIB_S16_MIN ? MLIB_S16_MIN : c);
	}

	return (MLIB_SUCCESS);
}
Ejemplo n.º 22
0
/* The case of even address of vector x */
/*
 * Accumulates a dot product over 2 * n bytes of x and y and stores two
 * sums: z[0] = sum_r and z[1] = sum_i.
 *
 * x is walked in aligned 8-byte words (dpx); y is brought to the same
 * phase either directly (equal alignment) or through vis_bshuffle().
 * Bytes of x outside [px, pxend] in the first and last words are zeroed
 * by copying them through the zero-filled edge[] buffer with a partial
 * store.
 *
 * NOTE(review): the per-word multiply/accumulate is entirely inside the
 * DPROD_U8C0, DPROD_U8C, SUM_U8C, SUM_U8C_TAIL and SET_ALIGN_U8C macros
 * and the MAX_LOOP constant, all defined outside this function; they
 * presumably use the d_*, dx_r, dy_r, dy_i, lb_mask and fzero locals
 * declared here.  The meaning of "real" and "imaginary" parts is inferred
 * from the names only -- confirm against the macro definitions.
 */
static void
mlib_VectorDotProd_U8C_al_x(
	mlib_d64 *z,
	const void *x,
	const void *y,
	mlib_s32 n)
{
	mlib_u8 *pxend, *px = (mlib_u8 *)x, *py = (mlib_u8 *)y;
	mlib_d64 sum_r = 0.0, sum_i = 0.0;
	mlib_d64 *dpx, *dpy, *dpxend;
	mlib_d64 dx, dy, dy0, dy1;
	mlib_d64 dx_r, dy_r, dy_i;
	mlib_d64 d_iih, d_iil, d_irh, d_irl, d_rih, d_ril, d_rrh, d_rrl;
	mlib_d64 d_ih, d_il, d_rh, d_rl;
	mlib_d64 ds_r, ds_i, ds1_r, ds1_i;
	mlib_d64 lb_mask = vis_to_double_dup(0x00FF00FF);
	mlib_d64 edge[2], fzero = vis_fzero();
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask, off;

/* zero-filled scratch words used to mask out bytes beyond the vector */
	edge[0] = edge[1] = 0;

/* aligned start of x; off <= 0 is the distance back from px to it */
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	off = (mlib_addr)dpx - (mlib_addr)px;
	dpy = vis_alignaddr((void *)py, off);
/* last byte of x: n elements of two bytes each */
	pxend = px + n + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
/* keep only the bytes of the first word that belong to x */
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];
	dy = vis_ld_d64_nf(dpy);

/* x and y have the same offset inside an 8-byte word */
	if (((((mlib_addr)px) ^ ((mlib_addr)py)) & 7) == 0) {
		vis_write_bmask(0x781A3C5E, 0);
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

/* process at most MAX_LOOP words before flushing the partial sums */
			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
#pragma pipeloop(0)
			for (; d_left > 0; d_left--) {
				DPROD_U8C0;
				SUM_U8C;
				dx = dpx[1];
				dy = dpy[1];
				dpx++;
				dpy++;
			}

/* fold the four 32-bit partial sums into the double accumulators */
			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	} else {
/* y is misaligned relative to x: realign it with vis_bshuffle() */
		mlib_s32 mask = ((mlib_addr)(py + off)) & 7;

		vis_write_bmask(0x11111111 * mask, 0x01234567);
		dy1 = vis_ld_d64_nf(dpy+1);
		dy = vis_bshuffle(dy, dy1);
		SET_ALIGN_U8C;
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
#pragma pipeloop(0)
			for (; d_left > 0; d_left--) {
				DPROD_U8C;
				SUM_U8C;
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy+2);
				dx = vis_ld_d64_nf(dpx+1);
				dy = vis_bshuffle(dy0, dy1);
				dpx++;
				dpy++;
			}

			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	}

/* last (possibly partial) word: mask off the bytes beyond pxend */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		SET_ALIGN_U8C;
		DPROD_U8C;
/* NOTE(review): SUM_U8C_TAIL presumably sets ds_r and ds_i -- confirm */
		SUM_U8C_TAIL;
		fsum = vis_read_hi(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = sum_r;
	z[1] = sum_i;
#undef MAX_LOOP
}
Ejemplo n.º 23
0
/*
 * Converts the n mlib_s8 elements of x into mlib_u8 elements of z.
 * In the scalar head and tail loops below negative inputs become 0 and
 * non-negative inputs are copied unchanged.
 *
 * Leading elements are converted one by one until the destination is
 * 8-byte aligned; the bulk is converted eight elements (one mlib_d64)
 * at a time; the remaining 0..7 elements are converted one by one.
 *
 * NOTE(review): PACK_S_U and vis_fpack16_pair are defined outside this
 * function.  PACK_S_U is presumably the scalar path for short vectors
 * and returns from the function, and presumably relies on the local
 * names declared here -- confirm before renaming anything.
 *
 * Returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VectorConvert_U8_S8_Sat(
	mlib_u8 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i, off;
	mlib_s8 c;
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 zero = vis_fzeros();

	if (length < 16) {
		PACK_S_U(mlib_s8, mlib_u8);
	}

/*
 * First, try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) < 0 ? 0 : c;
		length--;
	}

/* length = 8 * len_64 + rest_64; even_length is the vectorized part */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
/* GSR scale factor 7 for the pack step; the align offset field is 0 */
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */

/* i becomes 1 when len_64 is odd, so the loop below always runs in pairs */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
/* fpmerge with zero puts each source byte in the upper half of a 16-bit lane */
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}

/*
 * Then loop with step==2. Unroll for 2 iterations.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1), zero),
				four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1), zero),
				four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		off = (mlib_addr)src & 7;
/* align offset 1: vis_faligndata(d, d) below rotates d left by one byte */
		vis_alignaddr((void *)0, 1);
/* bshuffle picks bytes off+0, off+4, off+1, off+5, ... of the word pair */
		vis_write_bmask(0x11111111 * off, 0x04152637);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		if (i = (len_64 & 1)) {
			d1 = d2;
/* non-faulting load: this word may extend past the end of the source */
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}

/*
 * Then loop with step==2.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (i; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d6 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8sux16(d6, four_16_ones);
			d6 = vis_faligndata(d6, d6);
			d5 = vis_fmul8sux16(d6, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}
	}

/* scalar tail; src and dst still point at the start of the aligned part */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c = src[even_length + i]) < 0 ? 0 : c;

	return (MLIB_SUCCESS);
}
/*
 * Table look-up for one row of xsize mlib_s16 source samples producing
 * mlib_u16 destination samples.  Samples are handled in groups of four:
 * within a group, sample k is looked up in table k (table0..table3).
 *
 * Each looked-up value is shifted into the 8-byte accumulator acc0 with
 * vis_faligndata() (align offset 6, i.e. two bytes per step).  The four
 * values are pushed in the order 3, 2, 1, 0 so that sample 0 ends up
 * first in the stored word.  A final group of 1..3 samples is written
 * with a partial store.
 *
 * Full groups are stored through dp as whole mlib_d64 words.
 * NOTE(review): this presumably requires dst to be 8-byte aligned, and
 * VIS_LD_U16_I (defined elsewhere) presumably loads the 16-bit value at
 * the given byte offset from the table pointer -- confirm.
 */
void mlib_v_ImageLookUp_S16_U16_124_D1(const mlib_s16 *src,
                                       mlib_u16       *dst,
                                       mlib_s32       xsize,
                                       const mlib_u16 *table0,
                                       const mlib_u16 *table1,
                                       const mlib_u16 *table2,
                                       const mlib_u16 *table3)
{
  mlib_s16 *sp;                        /* pointer to source data */
  mlib_s32 s0, s1, s2, s3;             /* source data */
  mlib_u16 *dl;                        /* pointer to start of destination */
  mlib_u16 *dend;                      /* pointer to end of destination */
  mlib_d64 *dp;                        /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;                 /* destination data */
  mlib_d64 t3, acc0;                   /* destination data */
  mlib_s32 emask;                      /* edge mask */
  mlib_s32 i, num;                     /* loop variable */

  dl = dst;
  sp = (void *)src;
  dp = (mlib_d64 *) dl;
  dend = dl + xsize - 1;

  /* align offset 6: each vis_faligndata(t, acc0) pushes 2 bytes of t into acc0 */
  vis_alignaddr((void *)0, 6);

  i = 0;

  if (xsize >= 4) {

    /* preload the first group; the loop always works one group ahead */
    s0 = sp[0];
    s1 = sp[1];
    s2 = sp[2];
    s3 = sp[3];
    sp += 4;

#pragma pipeloop(0)
    for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
      /* byte offset into the table is twice the sample value */
      t3 = VIS_LD_U16_I(table3, 2 * s3);
      t2 = VIS_LD_U16_I(table2, 2 * s2);
      t1 = VIS_LD_U16_I(table1, 2 * s1);
      t0 = VIS_LD_U16_I(table0, 2 * s0);
      /* acc0 is fully overwritten after four pushes, so its old value is irrelevant */
      acc0 = vis_faligndata(t3, acc0);
      acc0 = vis_faligndata(t2, acc0);
      acc0 = vis_faligndata(t1, acc0);
      acc0 = vis_faligndata(t0, acc0);
      s0 = sp[0];
      s1 = sp[1];
      s2 = sp[2];
      s3 = sp[3];
      *dp++ = acc0;
    }

    /* last full group, using the samples preloaded by the final iteration */
    t3 = VIS_LD_U16_I(table3, 2 * s3);
    t2 = VIS_LD_U16_I(table2, 2 * s2);
    t1 = VIS_LD_U16_I(table1, 2 * s1);
    t0 = VIS_LD_U16_I(table0, 2 * s0);
    acc0 = vis_faligndata(t3, acc0);
    acc0 = vis_faligndata(t2, acc0);
    acc0 = vis_faligndata(t1, acc0);
    acc0 = vis_faligndata(t0, acc0);
    *dp++ = acc0;
  }

  /* 1..3 samples left: walk the source backwards from the last sample */
  if ((mlib_addr) dp <= (mlib_addr) dend) {

    num = (mlib_u16 *) dend - (mlib_u16 *) dp;
    sp += num;
    num++;

    if (num == 1) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U16_I(table0, 2 * s0);
      acc0 = vis_faligndata(t0, acc0);
    }
    else if (num == 2) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U16_I(table1, 2 * s0);
      acc0 = vis_faligndata(t0, acc0);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U16_I(table0, 2 * s0);
      acc0 = vis_faligndata(t0, acc0);
    }
    else if (num == 3) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U16_I(table2, 2 * s0);
      acc0 = vis_faligndata(t0, acc0);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U16_I(table1, 2 * s0);
      acc0 = vis_faligndata(t0, acc0);

      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U16_I(table0, 2 * s0);
      acc0 = vis_faligndata(t0, acc0);
    }

    /* store only the valid leading samples of the last word */
    emask = vis_edge16(dp, dend);
    vis_pst_16(acc0, dp, emask);
  }
}
Ejemplo n.º 25
0
/*
 * Converts the n mlib_s16 elements of x into mlib_s8 elements of z,
 * clamping every value to [MLIB_S8_MIN, MLIB_S8_MAX].
 *
 * Leading elements are converted one by one until the destination is
 * 8-byte aligned; the bulk is converted eight elements (one destination
 * mlib_d64) at a time; the remaining 0..7 elements are converted one by
 * one again.
 *
 * NOTE(review): PACK_S_S and vis_fpackfix_pair are defined outside this
 * function.  PACK_S_S is presumably the scalar path for short vectors
 * and returns from the function, and presumably relies on the local
 * names declared here -- confirm before renaming anything.
 *
 * Returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VectorConvert_S8_S16_Sat(
	mlib_s8 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	mlib_s16 *src = (void *)x;
	mlib_s8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6, d7;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_s16 c;

	if (n < 16) {
		PACK_S_S(mlib_s16, mlib_s8, MLIB_S8_MAX, MLIB_S8_MIN);
	}

/*
 * First try to align destination address for 8 bytes .
 */

/*
 * The pointer is converted through mlib_addr, as in the source alignment
 * test below, rather than through a 32-bit integer type.
 */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (c =
			(*src++)) < MLIB_S8_MIN ? MLIB_S8_MIN : (c >
			MLIB_S8_MAX ? MLIB_S8_MAX : c);
		length--;
	}

/* length = 8 * len_64 + rest_64; even_length is the vectorized part */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
/* GSR: bmask 0x082A4C6E for vis_bshuffle(), scale factor 8, align offset 2 */
	vis_write_gsr64(((mlib_u64)0x082A4C6E << 32) | (8 << 3) | 2);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration: i becomes 1 when len_64 is odd, so the
 * loop below always runs in pairs.
 */

		if ((i = (len_64 & 1)) != 0) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = vis_fpackfix_pair(d1, d2);
/* align offset 2: rotate each word left by one 16-bit element */
			d1 = vis_faligndata(d1, d1);
			d2 = vis_faligndata(d2, d2);
			d4 = vis_fpackfix_pair(d1, d2);
/* interleave the two packed halves back into element order */
			(*ddst++) = vis_bshuffle(d3, d4);
		}
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = vis_fpackfix_pair(d1, d2);
			d1 = vis_faligndata(d1, d1);
			d2 = vis_faligndata(d2, d2);
			d4 = vis_fpackfix_pair(d1, d2);
			(*ddst++) = vis_bshuffle(d3, d4);
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = vis_fpackfix_pair(d1, d2);
			d1 = vis_faligndata(d1, d1);
			d2 = vis_faligndata(d2, d2);
			d4 = vis_fpackfix_pair(d1, d2);
			(*ddst++) = vis_bshuffle(d3, d4);
		}
	} else {

/*
 * Source address is arbitrary aligned. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

/*
 * vis_alignaddr() replaces the align offset written above, so the
 * element rotation in this branch is done with vis_fpack32() and
 * vis_fpmerge() instead of vis_faligndata(d, d).
 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		if ((i = (len_64 & 1)) != 0) {
			d1 = d2;
/* non-faulting loads: these words may extend past the end of the source */
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d4 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d5 = vis_faligndata(d1, d2);

			d3 = vis_fpackfix_pair(d4, d5);
			d4 = vis_fpack32(d4, d4);
			d4 = vis_fpack32(d4, d4);
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d5 = vis_fpmerge(vis_read_lo(d5), vis_read_hi(d5));
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d4 = vis_fpackfix_pair(d4, d5);
			(*ddst++) = vis_bshuffle(d3, d4);
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = (*dsrc++);
			d4 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d5 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d6 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc); dsrc++;
			d7 = vis_faligndata(d1, d2);

			d3 = vis_fpackfix_pair(d4, d5);
			d4 = vis_fpack32(d4, d4);
			d4 = vis_fpack32(d4, d4);
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d5 = vis_fpmerge(vis_read_lo(d5), vis_read_hi(d5));
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d4 = vis_fpackfix_pair(d4, d5);

			d5 = vis_fpackfix_pair(d6, d7);
			d6 = vis_fpack32(d6, d6);
			d6 = vis_fpack32(d6, d6);
			d7 = vis_fpmerge(vis_read_hi(d7), vis_read_lo(d7));
			d7 = vis_fpmerge(vis_read_lo(d7), vis_read_hi(d7));
			d7 = vis_fpmerge(vis_read_hi(d7), vis_read_lo(d7));
			d6 = vis_fpackfix_pair(d6, d7);

			(*ddst++) = vis_bshuffle(d3, d4);
			(*ddst++) = vis_bshuffle(d5, d6);
		}
	}

/* scalar tail; src and dst still point at the start of the aligned part */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];
		dst[even_length + i] = c < MLIB_S8_MIN ? MLIB_S8_MIN
			: (c > MLIB_S8_MAX ? MLIB_S8_MAX : c);
	}

	return (MLIB_SUCCESS);
}
Ejemplo n.º 26
0
void ADD_SUFF(ThreeByteBgrToIntArgbScaleConvert)(SCALE_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dd, maskFF;
    mlib_s32 i, i0, i1, j;

    if (width < 16) {
	for (j = 0; j < height; j++) {
	    mlib_u8  *src = srcBase;
	    mlib_s32 *dst = dstBase;
	    mlib_s32 *dst_end = dst + width;
	    mlib_s32 tmpsxloc = sxloc;

	    PTR_ADD(src, (syloc >> shift) * srcScan);

	    for (; dst < dst_end; dst++) {
		i = tmpsxloc >> shift;
		tmpsxloc += sxinc;
		*(mlib_s32*)dst = GBR_PIXEL(i);
	    }

	    PTR_ADD(dstBase, dstScan);
	    syloc += syinc;
	}
	return;    
    }

    maskFF = vis_fone();

    vis_alignaddr(NULL, 7);

    for (j = 0; j < height; j++) {
	mlib_u8  *src = srcBase;
	mlib_f32 *dst = dstBase;
	mlib_f32 *dst_end = dst + width;
	mlib_s32 tmpsxloc = sxloc;

	PTR_ADD(src, (syloc >> shift) * srcScan);

	if ((mlib_s32)dst & 7) {
	    i = tmpsxloc >> shift;
	    tmpsxloc += sxinc;
	    *(mlib_s32*)dst = GBR_PIXEL(i);
	    dst++;
	}

#pragma pipeloop(0)
	for (; dst <= dst_end - 2; dst += 2) {
	    i0 = tmpsxloc >> shift;
	    i1 = (tmpsxloc + sxinc) >> shift;
	    tmpsxloc += 2*sxinc;

	    dd = vis_faligndata(vis_ld_u8(src + 3*i1    ), dd);
	    dd = vis_faligndata(vis_ld_u8(src + 3*i1 + 1), dd);
	    dd = vis_faligndata(vis_ld_u8(src + 3*i1 + 2), dd);
	    dd = vis_faligndata(maskFF, dd);
	    dd = vis_faligndata(vis_ld_u8(src + 3*i0    ), dd);
	    dd = vis_faligndata(vis_ld_u8(src + 3*i0 + 1), dd);
	    dd = vis_faligndata(vis_ld_u8(src + 3*i0 + 2), dd);
	    dd = vis_faligndata(maskFF, dd);

	    *(mlib_d64*)dst = dd;
	}

	for (; dst < dst_end; dst++) {
	    i = tmpsxloc >> shift;
	    tmpsxloc += sxinc;
	    *(mlib_s32*)dst = GBR_PIXEL(i);
	}

	PTR_ADD(dstBase, dstScan);
	syloc += syinc;
    }
Ejemplo n.º 27
0
/*
 * Variant of __mlib_VideoDCT8x8Quantize_S16_S16_B12 that accepts block
 * and coeffs pointers which are not 8-byte aligned.
 *
 * Returns MLIB_FAILURE when block or coeffs is NULL.  When both pointers
 * are 8-byte aligned the call is forwarded to the aligned routine and
 * its status is returned.  Otherwise the transform is done in two
 * stages, the sixteen result words are passed through Quant_ST_NA with
 * qtable[0..15], written to coeffs through vis_faligndata() with partial
 * stores at both ends, and MLIB_SUCCESS is returned.
 *
 * NOTE(review): LOAD_DATA_GE_INTER1/2, TRANSPOSE, LOADCONSTS4_12,
 * PREPARE_DATA_INTER, COMPUTING_DATA, COMPUTING_DATA_12, ENDSCALE_12 and
 * Quant_ST_NA are macros defined outside this function; they presumably
 * read and write the d*, t*, r*, c*, FCOS, sp and w_const locals
 * declared here -- confirm before renaming or removing any of them.
 */
mlib_status
__mlib_VideoDCT8x8Quantize_S16_S16_B12_NA(
    mlib_s16 coeffs[64],
    const mlib_s16 *block,
    const mlib_d64 qtable[64])
{
    mlib_d64 *sp = (mlib_d64 *)block;
    mlib_d64 *dp = (mlib_d64 *)coeffs;

    mlib_d64 d00, d10, d20, d30, d40, d50, d60, d70;
    mlib_d64 d01, d11, d21, d31, d41, d51, d61, d71;
    mlib_d64 t00, t10, t20, t30, t40, t50, t60, t70, t80, t90;
    mlib_d64 t01, t11, t21, t31, t41, t51, t61, t71, t81, t91;
    mlib_d64 r00, r10, r20, r30, r40, r50, r60, r70;
    mlib_d64 r01, r11, r21, r31, r41, r51, r61, r71;
    mlib_f32 FCOS, c17, c26, c35, c_4;
    mlib_s32 mask;
    mlib_d64 w_const = vis_to_double_dup(0x4000);

    if (block == NULL || coeffs == NULL)
        return (MLIB_FAILURE);

    /* both pointers 8-byte aligned: use the aligned implementation */
    if (!(((mlib_addr)block | (mlib_addr)coeffs) & 7)) {
        return (__mlib_VideoDCT8x8Quantize_S16_S16_B12(coeffs,
                block, qtable));
    }

    /* GSR scale factor 1, align offset 0 */
    vis_write_gsr(1 << 3);
    /*
     * first stage
     */

    LOAD_DATA_GE_INTER1;

    TRANSPOSE(d00, d20, d40, d60, r00, r10, r20, r30);
    TRANSPOSE(d10, d30, d50, d70, r40, r50, r60, r70);
    LOADCONSTS4_12;

    PREPARE_DATA_INTER(0);

    LOAD_DATA_GE_INTER2;
    TRANSPOSE(d01, d21, d41, d61, r01, r11, r21, r31);

    COMPUTING_DATA(0);

    TRANSPOSE(d11, d31, d51, d71, r41, r51, r61, r71);
    PREPARE_DATA_INTER(1);
    COMPUTING_DATA(1);

    /*
     * second stage
     */


    TRANSPOSE(d01, d11, d21, d31, r40, r50, r60, r70);
    TRANSPOSE(d00, d10, d20, d30, r00, r10, r20, r30);
    PREPARE_DATA_INTER(0);
    TRANSPOSE(d40, d50, d60, d70, r01, r11, r21, r31);
    COMPUTING_DATA_12(0);

    TRANSPOSE(d41, d51, d61, d71, r41, r51, r61, r71);
    ENDSCALE_12(0);


    /*
     * dp is the aligned word that holds the byte just before coeffs, so
     * coeffs - dp is in 1..8.  mask selects the bytes of dp[0] that lie at
     * or after coeffs (none when coeffs itself is aligned); ~mask selects
     * the complementary leading bytes of the word after the last full one.
     */
    dp = (mlib_d64 *)vis_alignaddr(coeffs, -1);
    mask = 0xFF >> ((mlib_addr)coeffs - (mlib_addr)dp);
    /* set the align offset used by the vis_faligndata() stores below */
    vis_alignaddrl((void *)coeffs, 0);

    PREPARE_DATA_INTER(1);
    COMPUTING_DATA_12(1);

    ENDSCALE_12(1);

    Quant_ST_NA(d00, d00, qtable[0]);
    Quant_ST_NA(d01, d01, qtable[1]);
    Quant_ST_NA(d10, d10, qtable[2]);
    Quant_ST_NA(d11, d11, qtable[3]);
    Quant_ST_NA(d20, d20, qtable[4]);
    Quant_ST_NA(d21, d21, qtable[5]);
    Quant_ST_NA(d30, d30, qtable[6]);
    Quant_ST_NA(d31, d31, qtable[7]);
    Quant_ST_NA(d40, d40, qtable[8]);
    Quant_ST_NA(d41, d41, qtable[9]);
    Quant_ST_NA(d50, d50, qtable[10]);
    Quant_ST_NA(d51, d51, qtable[11]);
    Quant_ST_NA(d60, d60, qtable[12]);
    Quant_ST_NA(d61, d61, qtable[13]);
    Quant_ST_NA(d70, d70, qtable[14]);
    Quant_ST_NA(d71, d71, qtable[15]);

    /* full aligned words, each assembled from two neighbouring results */
    dp[1] = vis_faligndata(d00, d01);
    dp[2] = vis_faligndata(d01, d10);
    dp[3] = vis_faligndata(d10, d11);
    dp[4] = vis_faligndata(d11, d20);
    dp[5] = vis_faligndata(d20, d21);
    dp[6] = vis_faligndata(d21, d30);
    dp[7] = vis_faligndata(d30, d31);
    dp[8] = vis_faligndata(d31, d40);
    dp[9] = vis_faligndata(d40, d41);
    dp[10] = vis_faligndata(d41, d50);
    dp[11] = vis_faligndata(d50, d51);
    dp[12] = vis_faligndata(d51, d60);
    dp[13] = vis_faligndata(d60, d61);
    dp[14] = vis_faligndata(d61, d70);
    dp[15] = vis_faligndata(d70, d71);
    /* trailing word: only the bytes that still belong to coeffs */
    vis_pst_8(vis_faligndata(d71, d71), dp + 16, ~mask);

    /* leading partial word exists only when coeffs is not 8-byte aligned */
    if ((mlib_addr)coeffs & 7)
        vis_pst_8(vis_faligndata(d00, d00), dp, mask);

    return (MLIB_SUCCESS);
}
Ejemplo n.º 28
0
/*
 * Converts a rectangle of 3-byte-per-pixel source data into 32-bit
 * destination pixels.
 *
 * Rows narrower than 16 pixels are converted one pixel at a time with
 * GBR_PIXEL.  Wider rows are converted eight pixels per iteration (three
 * source mlib_d64 words in, four destination mlib_d64 words out) by the
 * BGR_TO_ARGB macro, with GBR_PIXEL used for an unaligned first pixel
 * and for the leftover pixels at the end of the row.
 *
 * NOTE(review): BLIT_PARAMS, ADD_SUFF, GBR_PIXEL, BGR_TO_ARGB and PTR_ADD
 * are defined outside this function.  BLIT_PARAMS presumably supplies
 * srcBase, dstBase, width, height, pSrcInfo and pDstInfo; BGR_TO_ARGB
 * presumably reads sd0..sd2 and s_0 and writes dd0..dd3 -- confirm.
 */
void ADD_SUFF(ThreeByteBgrToIntArgbConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 *sp;
    mlib_d64 s_0;
    mlib_d64 s0, s1, s2, s3, sd0, sd1, sd2, dd0, dd1, dd2, dd3;
    mlib_s32 i, i0, j;

    /* narrow rows: plain per-pixel conversion */
    if (width < 16) {
	for (j = 0; j < height; j++) {
	    mlib_u8  *src = srcBase;
	    mlib_s32 *dst = dstBase;

	    for (i = 0; i < width; i++) {
		dst[i] = GBR_PIXEL(i);
	    }

	    PTR_ADD(dstBase, dstScan);
	    PTR_ADD(srcBase, srcScan);
	}
	return;    
    }

    /* rows are contiguous in both images: treat the rectangle as one row */
    if (srcScan == 3*width && dstScan == 4*width) {
	width *= height;
	height = 1;
    }

    /* all-ones word */
    s_0 = vis_fone();

    for (j = 0; j < height; j++) {
	mlib_u8  *src = srcBase;
	mlib_f32 *dst = dstBase;

	i = i0 = 0;

	/*
	 * Destination not 8-byte aligned: convert the first pixel alone so
	 * that the 8-byte stores below start at dst + 1.
	 * NOTE(review): the pointer is cast to mlib_s32 here; only the low
	 * three bits are used.
	 */
	if ((mlib_s32)dst & 7) {
	    ((mlib_s32*)dst)[i] = GBR_PIXEL(i);
	    i0 = 1;
	}

	/* aligned pointer to source byte 3*i0; also sets the GSR align offset */
	sp = vis_alignaddr(src, 3*i0);
	s3 = *sp++;

#pragma pipeloop(0)
	for (i = i0; i <= (mlib_s32)width - 8; i += 8) {
	    /* s3 carries the last loaded word over to the next iteration */
	    s0 = s3;
	    s1 = *sp++;
	    s2 = *sp++;
	    s3 = *sp++;
	    sd0 = vis_faligndata(s0, s1);
	    sd1 = vis_faligndata(s1, s2);
	    sd2 = vis_faligndata(s2, s3);

	    BGR_TO_ARGB

	    *(mlib_d64*)(dst + i    ) = dd0;
	    *(mlib_d64*)(dst + i + 2) = dd1;
	    *(mlib_d64*)(dst + i + 4) = dd2;
	    *(mlib_d64*)(dst + i + 6) = dd3;
	}

	/* leftover pixels of the row */
	for (; i < width; i++) {
	    ((mlib_s32*)dst)[i] = GBR_PIXEL(i);
	}

	PTR_ADD(dstBase, dstScan);
	PTR_ADD(srcBase, srcScan);
    }
}
Ejemplo n.º 29
0
/*
 * Processes a vector of 16-bit complex values against a 16-bit complex
 * scalar and stores the result with saturation.
 *
 * The arithmetic and the (masked) store are performed by the SUBS16_SAT
 * macro, which is defined elsewhere and operates on the locals of this
 * function (dx, dc, dpz, emask, ...).  Judging by the function name it is a
 * saturating subtraction; the operand order is not visible in this block.
 *
 * Parameters:
 *   z - destination; 2 * n mlib_s16 values are produced.
 *   x - source vector; 2 * n mlib_s16 values are consumed.
 *   c - scalar; c[0] and c[1] are read (presumably real and imaginary part).
 *   n - number of complex elements.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The destination is walked in 8-byte aligned words: an optional partial
 * head word, "even_8" full words, and an optional partial tail word.
 */
mlib_status
__mlib_VectorSubS_S16C_S16C_Sat(
	mlib_s16 *z,
	const mlib_s16 *x,
	const mlib_s16 *c,
	mlib_s32 n)
{
	mlib_d64 *dpz, *dpx;
/* several of the locals below are referenced only from inside SUBS16_SAT */
	mlib_d64 dx, dz, dx0, dx1, dr0, dr1, dr2;
	mlib_s16 *pz, *px, *pzend;

/* offset of address alignment in destination (zero or negative, in bytes) */
	mlib_s32 off;

/* edge masks */
	mlib_s32 emask;
	mlib_s32 mask1, mask2;
	mlib_s32 ovl, und;
	mlib_u16 uc0 = *((mlib_s16 *)c);
	mlib_u16 uc1 = *((mlib_s16 *)c + 1);

/*
 * Replicate the scalar across a 64-bit word.  When z sits at a 2-mod-4
 * byte address the two halves are swapped, presumably so that the lanes of
 * the aligned destination words still pair up with the right component.
 * The shift is done in unsigned arithmetic: an mlib_u16 promotes to signed
 * int, and shifting a value with bit 15 set left by 16 would overflow it.
 */
	mlib_d64 dc = ((mlib_addr)z & 2) ?
		vis_to_double_dup(((unsigned int)uc1 << 16) | uc0) :
		vis_to_double_dup(((unsigned int)uc0 << 16) | uc1);
	mlib_d64 fzero = vis_fzero();
	mlib_d64 const_ovl = vis_to_double_dup(0x7fff7fff);
	mlib_d64 const_und = vis_fnot(const_ovl);

/* len counts mlib_s16 values still to be processed */
	mlib_s32 len = n + n, i;

/* number of full 8-byte words (even_8) and leftover s16 values (rest_8) */
	mlib_s32 rest_8, even_8;

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s16 *)x;
	pz = (mlib_s16 *)z;

/*
 * prepare the destination address
 */

	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
	off = (mlib_addr)dpz - (mlib_addr)z;
	pzend = pz + n + n - 1;
/*
 * generate edge mask for the start point
 */
	emask = vis_edge16(pz, pzend);

/*
 * destination is not 8-byte aligned: process the partial first word
 */

	if (off) {
/* fetch the source data that lines up with the first destination word */
		dpx = (mlib_d64 *)vis_alignaddr(px, off);
		dx0 = vis_ld_d64_nf(dpx);
		dx1 = vis_ld_d64_nf(dpx + 1);
		dx = vis_faligndata(dx0, dx1);
		SUBS16_SAT;

/* (8 + off) bytes of the first word were valid, i.e. (8 + off) / 2 values */
		px += (8 + off) >> 1;
		len -= (8 + off) >> 1;
		dpz++;
	}

	if (len <= 0)
		return (MLIB_SUCCESS);

	even_8 = len >> 2;
	rest_8 = len & 0x3;
/* all four 16-bit lanes are valid for the full words */
	emask = 0xf;

/*
 * Now analyze the alignment of the source "x" address.
 */

	if (!((mlib_addr)px & 7)) {

/* source is 8-byte aligned: load words directly */
		dpx = (mlib_d64 *)px;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx = (*dpx++);
			SUBS16_SAT;
			dpz++;
		}

/* preload the word that holds the tail data */
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;

	} else {

/* source is unaligned: combine neighbouring words with faligndata */
		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUBS16_SAT;
			dx0 = dx1;
			dpz++;
		}

/* carry the last loaded word over to the tail handling */
		dx1 = dx0;
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

/*
 * prepare edge mask for the last bytes
 *
 * NOTE(review): the first argument is not a real address, only a byte
 * count whose low bits drive vis_edge16; inverting the result presumably
 * selects the first rest_8 lanes of the last word - confirm against the
 * VIS manual.
 */

	emask = ~(vis_edge16((void *)(rest_8 << 1), pzend));

/* restore the source alignment offset, then build the last source word */
	vis_alignaddr(px, 0);
	dx0 = dx1;
	dx1 = vis_ld_d64_nf(dpx);
	dx = vis_faligndata(dx0, dx1);

	SUBS16_SAT;

	return (MLIB_SUCCESS);
}
Ejemplo n.º 30
0
/*
 * 5x5 convolution on an 8-bit image using VIS instructions.
 *
 * Judging by the name and by the "dw *= 4" / "+ 8" byte offsets below this
 * is the 4-channel variant that leaves the image border unwritten ("nw");
 * the output starts 2 rows down and 8 bytes in from the destination
 * origin and covers (width - 4) x (height - 4) pixels.
 *
 * Most of the setup is hidden in macros defined elsewhere
 * (GET_SRC_DST_PARAMETERS, LOAD_KERNEL_INTO_FLOAT, PREPARE_INTERM_BUFFERS,
 * LOOP_INI, PREPARE_TO_LOAD_LINE, LOAD_LINE_INTO_BUFFER[_NF], CONV_AU,
 * CONV_AL, PREPARE_TO_COPY_INTERM_BUF_TO_DST, COPY_INTERM_BUF_TO_DST,
 * COPY_TAIL).  They operate directly on the locals declared here, which is
 * why several variables are never referenced by name in this body.
 *
 * Parameters:
 *   dst          - destination image.
 *   src          - source image.
 *   kernel       - convolution coefficients (not referenced directly here;
 *                  presumably read by LOAD_KERNEL_INTO_FLOAT).
 *   scalef_expon - scaling exponent of the kernel; used to derive the GSR
 *                  scale factor and the rounding constant.
 *
 * Returns MLIB_SUCCESS at the end of this body (the macros may contain
 * additional return paths that are not visible here).
 */
mlib_status
mlib_v_conv5x5_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2, *sa3, *sa4;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3, *sbuf4, *sbuf5;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf, *dbuf1;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3, *s4, *s5;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* current 8-byte word of each of the five source rows */
	mlib_d64 d1, d2, d3, d4, d5;

/* next 8-byte word of each source row */
	mlib_d64 d11, d12, d13, d14, d15;

/* word after next of each source row */
	mlib_d64 d21, d22, d23, d24, d25;

/* words shifted by the vis_alignaddr offset (built with vis_faligndata) */
	mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;
/* kernel coefficients packed in pairs; presumably set by LOAD_KERNEL_INTO_FLOAT */
	mlib_f32 k1k2, k3k4, k5k6, k7k8;
	mlib_f32 k9k10, k11k12, k13k14, k15k16;
	mlib_f32 k17k18, k19k20, k21k22, k23k24, k25;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
/* accumulators updated by CONV_AU / CONV_AL */
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1;
	mlib_s32 emask;
	mlib_s32 rval, gsr_scale, i, j;

/*
 * Program the GSR with the pack scale factor (placed at bit 3) and build
 * the rounding constant that seeds both accumulators.
 */
	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

/* length of one intermediate row in 8-byte words: 4 bytes/pixel + padding */
	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

/* output area is 4 pixels narrower and 4 rows shorter; dw becomes bytes */
	dw -= 4;
	dw *= 4;
	dh -= 4;

/* first five source rows; the output starts 2 rows down and 8 bytes in */
	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	sa3 = sa2 + slb;
	sa4 = sa3 + slb;
	d_a = adr_dst + 2 * dlb + 8;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf4, sa2);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf5, sa3);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* one output row per iteration */
#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
/* presumably rotates the row buffers and sets s1..s5, dbuf, ddst */
		LOOP_INI();

/* fetch the newest (fifth) source row; _NF presumably uses non-faulting loads */
		PREPARE_TO_LOAD_LINE(sbuf5, sa4);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER_NF(16);

/* 4-byte offset for the faligndata calls below (one pixel if s1 is 8-byte aligned) */
		vis_alignaddr(s1, 4);
		dbuf1 = dbuf;
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;
		d11 = *(s1 + 1);
		d12 = *(s2 + 1);
		d13 = *(s3 + 1);

/*
 * Pass 1: start from the rounding constant, accumulate rows s1..s3 and
 * park the partial sums in dbuf.
 */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d21 = *(s1 + 2);
			d22 = *(s2 + 2);
			d23 = *(s3 + 2);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k5k6);
			CONV_AU(d3, k11k12);
			dt_1 = vis_faligndata(d1, d11);
			dt_2 = vis_faligndata(d2, d12);
			dt_3 = vis_faligndata(d3, d13);
			CONV_AL(dt_1, k1k2);
			CONV_AU(dt_2, k7k8);
			CONV_AL(dt_3, k11k12);
			CONV_AU(d11, k3k4);
			CONV_AL(d12, k7k8);
			CONV_AU(d13, k13k14);
			dt_1 = vis_faligndata(d11, d21);
			dt_2 = vis_faligndata(d12, d22);
			dt_3 = vis_faligndata(d13, d23);
			CONV_AL(dt_1, k3k4);
			CONV_AU(dt_2, k9k10);
			CONV_AL(dt_3, k13k14);
			CONV_AU(d21, k5k6);
			CONV_AL(d22, k9k10);
			CONV_AU(d23, k15k16);
			dbuf1[0] = out0;
			dbuf1[1] = out1;
			dbuf1 += 2;
/* slide the three-word window one word to the right */
			d1 = d11;
			d2 = d12;
			d3 = d13;
			d11 = d21;
			d12 = d22;
			d13 = d23;
			s1++;
			s2++;
			s3++;
		}

		dbuf1 = dbuf;
		d4 = *s4;
		d5 = *s5;
		d14 = *(s4 + 1);
		d15 = *(s5 + 1);

/*
 * Pass 2: reload the partial sums, add rows s4 and s5, then pack the two
 * accumulators into one output word.
 */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d24 = *(s4 + 2);
			d25 = *(s5 + 2);
			out0 = dbuf1[0];
			out1 = dbuf1[1];
			CONV_AL(d4, k15k16);
			CONV_AU(d5, k21k22);
			dt_4 = vis_faligndata(d4, d14);
			dt_5 = vis_faligndata(d5, d15);
			CONV_AU(dt_4, k17k18);
			CONV_AL(dt_5, k21k22);
			CONV_AL(d14, k17k18);
			CONV_AU(d15, k23k24);
			dt_4 = vis_faligndata(d14, d24);
			dt_5 = vis_faligndata(d15, d25);
			CONV_AU(dt_4, k19k20);
			CONV_AL(dt_5, k23k24);
			CONV_AL(d24, k19k20);
			CONV_AU(d25, k25);
			dbuf1 += 2;
			(*ddst++) = vis_fpack16_pair(out0, out1);
/* slide the window for rows 4 and 5 */
			d4 = d14;
			d5 = d15;
			d14 = d24;
			d15 = d25;
			s4++;
			s5++;
		}

/* move the finished row from the intermediate buffer to the image */
		PREPARE_TO_COPY_INTERM_BUF_TO_DST();

#pragma pipeloop(0)
		COPY_INTERM_BUF_TO_DST();
		COPY_TAIL();

/* advance to the next source / destination row */
		sa4 = sa4 + slb;
		d_a += dlb;
	}

/* buff_src is presumably allocated by PREPARE_INTERM_BUFFERS */
	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}