/*
 * Split a stream of 16-bit values into two output planes using VIS
 * byte-shuffle operations (the code appears to de-interleave pairs:
 * one element of each pair goes to color1, the other to color2).
 *
 *	color1, color2	destination planes, stored as 64-bit words
 *	colors		source stream, loaded as 64-bit words
 *	n		element count per plane
 *
 * NOTE(review): all three pointers are cast to mlib_d64 * and accessed
 * as 64-bit words, so 8-byte alignment is presumably required of the
 * caller - confirm.
 * NOTE(review): the return statement and closing brace of this function
 * are not present in this excerpt.
 */
mlib_status
__mlib_VideoColorSplit2_S16(
	mlib_s16 *color1,
	mlib_s16 *color2,
	const mlib_s16 *colors,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 sd0, sd1, dd0, dd1, dd2, dd3;
	mlib_s32 i;

/*
 * Single 64-bit GSR write: 0x014589cd in the upper 32 bits, 2 in the
 * low bits. Presumably the upper half is the byte mask consumed by
 * vis_bshuffle and the low value is the byte offset consumed by
 * vis_faligndata - confirm against the VIS manual.
 */
	vis_write_gsr64(((mlib_u64)0x014589cd << 32) | 2);
/*
 * Software-pipelined main loop: n / 4 iterations, each storing one
 * 64-bit word (four 16-bit elements) to each plane and advancing the
 * source by two 64-bit words. The prologue below shuffles source words
 * 0-1 into dd2/dd3 and pre-loads words 2-3 for the first iteration.
 */
	sd0 = sp[0];
	sd1 = vis_ld_d64_nf(sp + 1);
	dd0 = vis_faligndata(sd0, sd1);
	dd1 = vis_faligndata(sd1, sd0);
	dd2 = vis_bshuffle(sd0, sd1);
	dd3 = vis_bshuffle(dd0, dd1);
	sd0 = vis_ld_d64_nf(sp + 2);
	sd1 = vis_ld_d64_nf(sp + 3);
	dd0 = vis_faligndata(sd0, sd1);
	dd1 = vis_faligndata(sd1, sd0);
#pragma pipeloop(0)
	for (i = 0; i < (n / 4); i++) {
/* store the results prepared by the previous step */
		(*dp0++) = dd2;
		(*dp1++) = dd3;
/* shuffle the words loaded one step ago */
		dd2 = vis_bshuffle(sd0, sd1);
		dd3 = vis_bshuffle(dd0, dd1);
/* load two steps ahead of the store position (sp + 4, sp + 5) */
		sd0 = vis_ld_d64_nf(sp + 4);
		sd1 = vis_ld_d64_nf(sp + 5);
		dd0 = vis_faligndata(sd0, sd1);
		dd1 = vis_faligndata(sd1, sd0);
		sp += 2;
	}

/*
 * Tail: the remaining n & 3 elements per plane. The source words are
 * reloaded from the current sp and the result is written with a
 * partial store.
 */

	if (n & 3) {
/*
 * 0xF0 >> (n & 3) has low nibble 0x8, 0xC or 0xE for 1, 2 or 3
 * remaining elements; presumably vis_pst_16 uses that nibble to select
 * the leading 16-bit lanes - confirm.
 */
		mlib_s32 emask = 0xF0 >> (n & 3);

		sd0 = sp[0];
		sd1 = vis_ld_d64_nf(sp + 1);
		dd0 = vis_faligndata(sd0, sd1);
		dd1 = vis_faligndata(sd1, sd0);
		dd2 = vis_bshuffle(sd0, sd1);
		dd3 = vis_bshuffle(dd0, dd1);
		vis_pst_16(dd2, (mlib_f32 *)dp0, emask);
		vis_pst_16(dd3, (mlib_f32 *)dp1, emask);
	}
/*
 * Expand n 16-bit source elements into 2 * n elements and write the
 * same result to both dst0 and dst1. The vis_fpmerge sequence is
 * presumably arranged so that every source element appears twice in a
 * row in the output (nearest-neighbour doubling) - confirm against the
 * VIS definition of fpmerge.
 *
 *	dst0, dst1	destination rows, written as 64-bit words
 *	src		source row, read as 64-bit words
 *	n		number of source elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoUpSample420_Nearest_S16(
	mlib_s16 *dst0,
	mlib_s16 *dst1,
	const mlib_s16 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out0 = (mlib_d64 *)dst0;
	mlib_d64 *out1 = (mlib_d64 *)dst1;
/* last element of dst0 that may be written */
	mlib_s16 *last = dst0 + 2 * n - 1;
	mlib_d64 word, mixed, his, los, pair;
	mlib_s32 mask, k;

	if (n <= 0)
		return (MLIB_FAILURE);

/* full groups: one source word in, two words out per destination */
#pragma pipeloop(0)
	for (k = 0; k <= (n - 4); k += 4) {
		word = in[0];
		in++;
		mixed = vis_fpmerge(vis_read_hi(word), vis_read_lo(word));
		mixed = vis_fpmerge(vis_read_hi(mixed), vis_read_lo(mixed));
		los = vis_fpmerge(vis_read_lo(mixed), vis_read_lo(mixed));
		his = vis_fpmerge(vis_read_hi(mixed), vis_read_hi(mixed));

		pair = vis_fpmerge(vis_read_hi(his), vis_read_hi(los));
		out1[0] = pair;
		out0[0] = pair;

		pair = vis_fpmerge(vis_read_lo(his), vis_read_lo(los));
		out1[1] = pair;
		out0[1] = pair;

		out0 += 2;
		out1 += 2;
	}

/* nothing left after the full groups */
	if ((mlib_s16 *)out0 > last)
		return (MLIB_SUCCESS);

/* tail: fewer than four source elements remain */
	word = in[0];
	mixed = vis_fpmerge(vis_read_hi(word), vis_read_lo(word));
	mixed = vis_fpmerge(vis_read_hi(mixed), vis_read_lo(mixed));
	los = vis_fpmerge(vis_read_lo(mixed), vis_read_lo(mixed));
	his = vis_fpmerge(vis_read_hi(mixed), vis_read_hi(mixed));

/* first output word, limited by the edge mask */
	pair = vis_fpmerge(vis_read_hi(his), vis_read_hi(los));
	mask = vis_edge16(out0, last);
	vis_pst_16(pair, out0, mask);
	vis_pst_16(pair, out1, mask);
	out0++;
	out1++;

/* second output word only if the row extends into it */
	if ((mlib_s16 *)out0 <= last) {
		pair = vis_fpmerge(vis_read_lo(his), vis_read_lo(los));
		mask = vis_edge16(out0, last);
		vis_pst_16(pair, out0, mask);
		vis_pst_16(pair, out1, mask);
	}

	return (MLIB_SUCCESS);
}
/*
 * Split a stream of 16-bit values into three output planes using VIS
 * merge, align and byte-shuffle operations (the code appears to
 * de-interleave triples into color1, color2 and color3).
 *
 *	color1..color3	destination planes, stored as 64-bit words
 *	colors		source stream, loaded as 64-bit words
 *	n		element count per plane
 *
 * NOTE(review): all pointers are cast to mlib_d64 * and accessed as
 * 64-bit words, so 8-byte alignment is presumably required - confirm.
 * NOTE(review): the return statement and closing brace of this function
 * are not present in this excerpt.
 */
mlib_status
__mlib_VideoColorSplit3_S16(
	mlib_s16 *color1,
	mlib_s16 *color2,
	mlib_s16 *color3,
	const mlib_s16 *colors,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 *dp2 = (mlib_d64 *)color3;
	mlib_d64 sd0, sd1, sd2, dd0, dd1, dd2, dd3;
	mlib_s32 i;

/*
 * GSR is written with 4 and the byte mask with 0x02CE13DF; presumably
 * these feed vis_faligndata and vis_bshuffle respectively - confirm
 * against the VIS manual.
 */
	vis_write_gsr(4);
	vis_write_bmask(0x02CE13DF, 0);
/*
 * Main loop: each iteration reads three source words (12 elements) and
 * stores one 64-bit word (4 elements) to each of the three planes.
 */
#pragma pipeloop(0)
#pragma unroll(4)
	for (i = 0; i <= (n - 4); i += 4) {
		sd0 = sp[0];
		sd1 = sp[1];
		sd2 = sp[2];
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		(*dp0++) = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		(*dp1++) = vis_bshuffle(dd3, dd3);
		(*dp2++) = vis_bshuffle(dd1, dd2);
		sp += 3;
	}

/*
 * Tail: between one and three elements per plane remain when i < n.
 * The same computation is repeated and written with partial stores.
 */

	if (i < n) {
/*
 * 0xF0 >> (n & 3) has low nibble 0x8, 0xC or 0xE for 1, 2 or 3
 * remaining elements; presumably vis_pst_16 uses that nibble to select
 * the leading 16-bit lanes - confirm.
 */
		mlib_s32 emask = 0xF0 >> (n & 3);
		mlib_d64 st0, st1, st2;

/* only sp[0] is loaded directly; the next two words use vis_ld_d64_nf */
		sd0 = sp[0];
		sd1 = vis_ld_d64_nf(sp + 1);
		sd2 = vis_ld_d64_nf(sp + 2);
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		st0 = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		st1 = vis_bshuffle(dd3, dd3);
		st2 = vis_bshuffle(dd1, dd2);
		vis_pst_16(st0, dp0, emask);
		vis_pst_16(st1, dp1, emask);
		vis_pst_16(st2, dp2, emask);
	}
Beispiel #4
0
/*
 * Walk the n 16-bit elements of x one aligned 64-bit word at a time,
 * feed each word to the NORM16 macro, and store
 * mlib_sqrt(ds + ds1) in z[0].
 *
 *	z	receives the result in z[0]
 *	x	input vector (any 2-byte alignment; handled by masking)
 *	n	number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): NORM16 is defined outside this excerpt. It presumably
 * accumulates the squares of the lanes of dx into ds/ds1, using dr/dr1
 * as scratch - confirm against the macro definition.
 */
mlib_status
__mlib_VectorNorm_S16_Sat(
	mlib_d64 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	mlib_s16 *px = (mlib_s16 *)x;
	mlib_s16 *pxend;
	mlib_d64 *dpx, *dpxend;
	mlib_d64 dx, ds, ds1;
	mlib_d64 edge[2];
	type_union_mlib_d64 dr, dr1;
	mlib_s32 d_left;
	mlib_u8 emask;

/* zeroed scratch: lanes skipped by the partial stores below stay 0 */
	edge[0] = edge[1] = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

	ds = ds1 = 0;
/* round the start address down to an 8-byte boundary */
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	pxend = px + n - 1;

/* copy only the lanes between px and pxend of the first word */
	emask = vis_edge16(px, pxend);
	vis_pst_16(dpx[0], edge, emask);
	dx = edge[0];

/* number of 64-bit steps from the first to the last aligned word */
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
	d_left = dpxend - dpx;

/* process the current word, then load the next one unmasked */
	for (; d_left > 0; d_left--) {
		NORM16;
		dpx++;
		dx = dpx[0];
	}

/* last word: mask off lanes beyond pxend before accumulating */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge16(dpx, pxend);
		vis_pst_16(dx, edge + 1, emask);
		dx = edge[1];
		NORM16;
	}

	z[0] = mlib_sqrt(ds + ds1);
	return (MLIB_SUCCESS);
}
/*
 * Combine two 16-bit images element by element through the
 * MLIB_V_IMAGEMULSHIFT_S16 macro and write the result to dp, row by
 * row.
 *
 *	sp1, stride1	first source and its row stride
 *	sp2, stride2	second source and its row stride
 *	dp, strided	destination and its row stride
 *	width, height	row length and row count
 *	shift		used to derive the GSR value, see below
 *
 * Strides and width are in mlib_s16 elements: the row pointers are
 * mlib_s16 * and are advanced with "+= stride".
 *
 * Five code paths are selected from the 8-byte alignment offsets of the
 * three pointers and from whether the strides agree modulo 4 elements
 * (i.e. whether the relative alignment is the same on every row):
 *	1. dst, src1 and src2 all share alignment
 *	2. dst shares alignment with src1 only
 *	3. dst shares alignment with src2 only
 *	4. src1 and src2 share alignment, dst differs
 *	5. general case
 *
 * NOTE(review): MLIB_V_IMAGEMULSHIFT_S16 is defined outside this
 * excerpt; it presumably uses the rd* locals declared here as
 * temporaries - confirm.
 */
void
mlib_v_ImageMulShift_S16(
    mlib_s16 *sp1,
    mlib_s32 stride1,
    mlib_s16 *sp2,
    mlib_s32 stride2,
    mlib_s16 *dp,
    mlib_s32 strided,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 shift)
{
/* pointers for line of source1 */
	mlib_s16 *sl1;

/* pointers for line of source2 */
	mlib_s16 *sl2;

/* pointers for line of dst */
	mlib_s16 *dl;
	mlib_s32 offdst, offsrc1, offsrc2, emask;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
	mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
	mlib_s16 *dend;
	mlib_d64 rdhh, rdhl;
	mlib_d64 rdlh, rdll;
	mlib_d64 rdh, rdl;
	mlib_s32 i, j, k;

/* rows are contiguous in all three images: treat them as one long row */
	if (width == stride1 && width == stride2 && width == strided) {
		width *= height;
		height = 1;
	}

/* initialize GSR scale factor */
	vis_write_gsr(((16 - shift) & 0x1f) << 3);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

/* byte offsets of each pointer within its 8-byte word */
	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

/* case 1: all three images share the same alignment on every row */
	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 3) == 0) &&
	    (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/* i is <= 0: element index of the aligned word relative to dp */
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* partial first word */
			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}
/* full words */
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				(*dpp++) = dd;
			}

/* partial last word */
			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/* case 2: dst and src1 share alignment; src2 is realigned */
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
/* the last vis_alignaddr call sets the offset used by vis_faligndata */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd20 = spp2[0];

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd21 = spp2[1];
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd20 = sd21;
				spp2++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*spp1++);
				sd21 = spp2[1];
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				(*dpp++) = dd;
/* carry the newer src2 word into the next step */
				sd20 = sd21;
				spp2++;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = vis_faligndata(sd20, spp2[1]);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/* case 3: dst and src2 share alignment; src1 is realigned */
	} else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
/* the last vis_alignaddr call sets the offset used by vis_faligndata */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd10 = spp1[0];

			if (emask != 0xf) {
				sd20 = (*spp2++);
				sd11 = spp1[1];
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd10 = sd11;
				spp1++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd20 = (*spp2++);
				sd11 = spp1[1];
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				(*dpp++) = dd;
/* carry the newer src1 word into the next step */
				sd10 = sd11;
				spp1++;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd20 = (*spp2++);
				sd10 = vis_faligndata(sd10, spp1[1]);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * case 4: src1 and src2 share alignment; the macro is applied to the
 * raw aligned source words and the results are realigned to dst
 */
	} else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* result for the first aligned source word, kept in dd0 */
			sd10 = (*spp1++);
			sd20 = (*spp2++);
			MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd0);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp++, emask);
				dd0 = dd1;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
/* realign two consecutive results to the destination */
				(*dpp++) = vis_faligndata(dd0, dd1);
				dd0 = dd1;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * case 5: no shared alignment. src1 is first copied, realigned, into
 * the destination row; a second pass reads it back from dst and
 * combines it with the realigned src2.
 */
	} else {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* partial first word: each source is realigned right after its alignaddr */
			if (emask != 0xf) {
				spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
				sd10 = vis_faligndata(spp1[0], spp1[1]);
				spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
				sd20 = vis_faligndata(spp2[0], spp2[1]);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}

/* copy src1 to dst */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			sd11 = spp1[0];
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (k = i; k <= (width - 4); k += 4) {
				sd10 = sd11;
				sd11 = spp1[1];
				(*tmp_ptr++) = vis_faligndata(sd10, sd11);
				spp1++;
			}

/* realigned src1 word for the partial last store, kept in sd11 */
			sd11 = vis_faligndata(sd11, spp1[1]);

/* second pass: switch the alignment offset to src2 */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
			sd20 = spp2[0];
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
/* src1 data staged in the destination by the copy loop above */
				sd10 = (*tmp_ptr++);
				sd21 = spp2[1];
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd20 = vis_faligndata(sd20, spp2[1]);
				MLIB_V_IMAGEMULSHIFT_S16(sd11, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	}
}
/*
 * Table lookup for one line of three-channel data: every mlib_s32
 * source value selects an entry from one of three mlib_s16 tables and
 * the entries are stored to dst.
 *
 * Consecutive elements cycle through table0, table1, table2. One output
 * word covers four elements, so the table pointers are rotated by one
 * position after each word to keep the cycle in phase.
 *
 * The destination is written as 64-bit words starting at dst, so dst
 * is presumably 8-byte aligned - confirm with the caller.
 */
void mlib_v_ImageLookUp_S32_S16_3_D1(mlib_s32 *src,
                                     mlib_s16 *dst,
                                     mlib_s32 xsize,
                                     mlib_s16 *table0,
                                     mlib_s16 *table1,
                                     mlib_s16 *table2)
{
  mlib_s32 *in   = src;                /* read cursor over the indices */
  mlib_d64 *out  = (mlib_d64 *) dst;   /* write cursor, one word per step */
  mlib_s16 *last = dst + xsize - 1;    /* final element that may be written */
  mlib_s32 idx0, idx1, idx2, idx3;     /* indices fetched one group ahead */
  mlib_d64 e0, e1, e2, e3;             /* looked-up table entries */
  mlib_d64 packed;                     /* output word being assembled */
  mlib_s16 *rot;                       /* scratch for the table rotation */
  mlib_s32 mask;                       /* edge mask for the partial store */
  mlib_s32 k, left;                    /* loop counter, tail element count */

  /* offset 6: each vis_faligndata below shifts one 16-bit entry in */
  vis_alignaddr((void *) 0, 6);

  if (xsize >= 4) {

    /* prefetch the first group of four indices */
    idx0 = in[0];
    idx1 = in[1];
    idx2 = in[2];
    idx3 = in[3];
    in += 4;

#pragma pipeloop(0)
    for (k = 0; k <= xsize - 8; k += 4, in += 4) {
      e3 = vis_ld_u16_i(table0, ((mlib_addr)2*idx3));
      e2 = vis_ld_u16_i(table2, ((mlib_addr)2*idx2));
      e1 = vis_ld_u16_i(table1, ((mlib_addr)2*idx1));
      e0 = vis_ld_u16_i(table0, ((mlib_addr)2*idx0));
      /* entries go in last-to-first: e3 innermost, e0 outermost */
      packed = vis_faligndata(e0,
               vis_faligndata(e1,
               vis_faligndata(e2,
               vis_faligndata(e3, packed))));
      /* fetch the next group before storing the current word */
      idx0 = in[0];
      idx1 = in[1];
      idx2 = in[2];
      idx3 = in[3];
      *out++ = packed;
      /* rotate tables: the next word starts one channel later */
      rot = table0;
      table0 = table1;
      table1 = table2;
      table2 = rot;
    }

    /* drain the group that was prefetched last */
    e3 = vis_ld_u16_i(table0, ((mlib_addr)2*idx3));
    e2 = vis_ld_u16_i(table2, ((mlib_addr)2*idx2));
    e1 = vis_ld_u16_i(table1, ((mlib_addr)2*idx1));
    e0 = vis_ld_u16_i(table0, ((mlib_addr)2*idx0));
    packed = vis_faligndata(e0,
             vis_faligndata(e1,
             vis_faligndata(e2,
             vis_faligndata(e3, packed))));
    *out++ = packed;
    rot = table0;
    table0 = table1;
    table1 = table2;
    table2 = rot;
  }

  if ((mlib_addr) out <= (mlib_addr) last) {

    /* remaining elements; the source is walked from the last one back */
    left = (mlib_s32)((mlib_s16*) last - (mlib_s16*) out);
    in += left;
    left++;

    switch (left) {
    case 3:
      idx0 = *in;
      in--;
      e0 = vis_ld_u16_i(table2, ((mlib_addr)2*idx0));
      packed = vis_faligndata(e0, packed);
      /* FALLTHROUGH */
    case 2:
      idx0 = *in;
      in--;
      e0 = vis_ld_u16_i(table1, ((mlib_addr)2*idx0));
      packed = vis_faligndata(e0, packed);
      /* FALLTHROUGH */
    case 1:
      idx0 = *in;
      e0 = vis_ld_u16_i(table0, ((mlib_addr)2*idx0));
      packed = vis_faligndata(e0, packed);
      break;
    default:
      break;
    }

    /* store only the lanes up to the last element */
    mask = vis_edge16(out, last);
    vis_pst_16(packed, out, mask);
  }
}
Beispiel #7
0
/*
 * Table lookup for one line of mlib_u16 data with four tables: within
 * each group of four elements, element k selects an entry from tablek
 * and the four entries form one 64-bit output word.
 *
 * The destination is written as 64-bit words starting at dst, so dst
 * is presumably 8-byte aligned - confirm with the caller.
 */
void mlib_v_ImageLookUp_U16_U16_124_D1(const mlib_u16 *src,
                                       mlib_u16       *dst,
                                       mlib_s32       xsize,
                                       const mlib_u16 *table0,
                                       const mlib_u16 *table1,
                                       const mlib_u16 *table2,
                                       const mlib_u16 *table3)
{
  mlib_u16 *in   = (void *)src;        /* read cursor over the indices */
  mlib_d64 *out  = (mlib_d64 *) dst;   /* write cursor, one word per step */
  mlib_u16 *last = dst + xsize - 1;    /* final element that may be written */
  mlib_s32 idx0, idx1, idx2, idx3;     /* indices fetched one group ahead */
  mlib_d64 e0, e1, e2, e3;             /* looked-up table entries */
  mlib_d64 packed;                     /* output word being assembled */
  mlib_s32 mask;                       /* edge mask for the partial store */
  mlib_s32 k, left;                    /* loop counter, tail element count */

  /* offset 6: each vis_faligndata below shifts one 16-bit entry in */
  vis_alignaddr((void *) 0, 6);

  k = 0;

  if (xsize >= 4) {

    /* prefetch the first group of four indices */
    idx0 = in[0];
    idx1 = in[1];
    idx2 = in[2];
    idx3 = in[3];
    in += 4;

#pragma pipeloop(0)
    for (k = 0; k <= xsize - 8; k += 4, in += 4) {
      e3 = VIS_LD_U16_I(table3, 2*idx3);
      e2 = VIS_LD_U16_I(table2, 2*idx2);
      e1 = VIS_LD_U16_I(table1, 2*idx1);
      e0 = VIS_LD_U16_I(table0, 2*idx0);
      /* entries go in last-to-first: e3 innermost, e0 outermost */
      packed = vis_faligndata(e0,
               vis_faligndata(e1,
               vis_faligndata(e2,
               vis_faligndata(e3, packed))));
      /* fetch the next group before storing the current word */
      idx0 = in[0];
      idx1 = in[1];
      idx2 = in[2];
      idx3 = in[3];
      *out++ = packed;
    }

    /* drain the group that was prefetched last */
    e3 = VIS_LD_U16_I(table3, 2*idx3);
    e2 = VIS_LD_U16_I(table2, 2*idx2);
    e1 = VIS_LD_U16_I(table1, 2*idx1);
    e0 = VIS_LD_U16_I(table0, 2*idx0);
    packed = vis_faligndata(e0,
             vis_faligndata(e1,
             vis_faligndata(e2,
             vis_faligndata(e3, packed))));
    *out++ = packed;
  }

  if ((mlib_addr) out <= (mlib_addr) last) {

    /* remaining elements; the source is walked from the last one back */
    left = (mlib_u16*) last - (mlib_u16*) out;
    in += left;
    left++;

    switch (left) {
    case 3:
      idx0 = (mlib_s32) *in;
      in--;
      e0 = VIS_LD_U16_I(table2, 2*idx0);
      packed = vis_faligndata(e0, packed);
      /* FALLTHROUGH */
    case 2:
      idx0 = (mlib_s32) *in;
      in--;
      e0 = VIS_LD_U16_I(table1, 2*idx0);
      packed = vis_faligndata(e0, packed);
      /* FALLTHROUGH */
    case 1:
      idx0 = (mlib_s32) *in;
      in--;
      e0 = VIS_LD_U16_I(table0, 2*idx0);
      packed = vis_faligndata(e0, packed);
      break;
    default:
      break;
    }

    /* store only the lanes up to the last element */
    mask = vis_edge16(out, last);
    vis_pst_16(packed, out, mask);
  }
}
/*
 * Apply the MLIB_V_IMAGESQUARE_S16 macro to every 16-bit element of a
 * source image and store the results in the destination image.
 *
 *	src, slb	source image and its line stride in bytes
 *	dst, dlb	destination image and its line stride in bytes
 *	xsize		elements per line
 *	ysize		number of lines
 *
 * Each line is processed as: a masked first word, a run of full 64-bit
 * words, and a masked last word. The source is realigned to the
 * destination with vis_alignaddr / vis_faligndata.
 *
 * NOTE(review): MLIB_V_IMAGESQUARE_S16 is defined outside this excerpt;
 * it presumably squares the four lanes of its first argument, using
 * rdh/rdl as temporaries - confirm.
 */
void
mlib_v_ImageSquare_S16(
    mlib_s16 *src,
    mlib_s32 slb,
    mlib_s16 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize)
{
/* aligned pointer to source */
	mlib_d64 *sp;

/* pointer to a line in source */
	mlib_s16 *sl;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* pointer to a line in destination */
	mlib_s16 *dl;

/* pointer to end of a line in dst */
	mlib_s16 *dend;

/* offset of address alignment in dst */
	mlib_s32 off;

/* edge masks */
	mlib_s32 emask;

/* two consecutive aligned source words */
	mlib_d64 s0, s1;

/* source data realigned to the destination */
	mlib_d64 sd;

/* destination data */
	mlib_d64 dd;

/* temporaries used in macro */
	mlib_d64 rdh, rdl;

/* loop variable */
	mlib_s32 i, j, n;

	sl = src;
	dl = dst;

/* row loop */
	for (j = 0; j < ysize; j++) {

/* prepare the destination address */
		dp = (mlib_d64 *)((mlib_addr)dl & (~7));
/* off is <= 0: byte distance from dl back to the aligned word */
		off = (mlib_addr)dp - (mlib_addr)dl;
		dend = dl + xsize - 1;

/* prepare the source address */
		sp = (mlib_d64 *)vis_alignaddr(sl, off);

/* generate edge mask for the start point */
		emask = vis_edge16(dl, dend);

/* first 4 pixels */
		s0 = vis_ld_d64_nf(sp); sp++;
		s1 = vis_ld_d64_nf(sp); sp++;
		sd = vis_faligndata(s0, s1);
		MLIB_V_IMAGESQUARE_S16(sd, dd);
		vis_pst_16(dd, dp++, emask);

/* number of full 64-bit words left in this line */
		n = ((mlib_u8 *)(dend + 1) - (mlib_u8 *)dp) / 8;

/* 4-pixel column loop */
#pragma pipeloop(0)
		for (i = 0; i < n; i++) {
			s0 = s1;
			s1 = vis_ld_d64_nf(sp); sp++;
			sd = vis_faligndata(s0, s1);
			MLIB_V_IMAGESQUARE_S16(sd, dd);
			(*dp++) = dd;
		}

/* end point handling */

		if ((mlib_addr)dp <= (mlib_addr)dend) {
			emask = vis_edge16(dp, dend);
			s0 = s1;
			s1 = vis_ld_d64_nf(sp); sp++;
			sd = vis_faligndata(s0, s1);
			MLIB_V_IMAGESQUARE_S16(sd, dd);
			vis_pst_16(dd, dp++, emask);
		}

/* strides are in bytes, hence the mlib_u8 * arithmetic */
		sl = (mlib_s16 *)((mlib_u8 *)sl + slb);
		dl = (mlib_s16 *)((mlib_u8 *)dl + dlb);
	}
}
/*
 * Produce two output rows of 2 * n elements each (dst0, dst1) from
 * three source rows of n elements each (src0, src1, src2).
 *
 * dst0 is computed from src1 and src0, dst1 from src1 and src2. The
 * weight constants are 9 and 3 for src1 (f39, f93) and 3 and 1 for the
 * outer row (f13, f31); d87 holds the rounding terms 8 and 7. The
 * scalar fix-ups at the end show the intended scaling: the weighted
 * sum is shifted right by 4.
 *
 * Structure:
 *	pass 1	writes the even 64-bit words (dp0[0], dp1[0]) of every
 *		full group of four source elements
 *	pass 2	restarts from the beginning and writes the odd words
 *		(dp0[1], dp1[1])
 *	tail	handles the remaining outputs with masked stores
 *	fix-up	the first and last element of each output row are
 *		recomputed in scalar code
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): all pointers are accessed as 64-bit words, so 8-byte
 * alignment is presumably required - confirm.
 * NOTE(review): d00/d10/d20 are read before they are first assigned,
 * and d03/d13/d23 are read in the tail without having been assigned
 * when n < 4. The affected lanes appear to be either overwritten by
 * the scalar fix-ups or masked out by the edge stores - confirm.
 */
mlib_status
__mlib_VideoUpSample420_S16(
	mlib_s16 *dst0,
	mlib_s16 *dst1,
	const mlib_s16 *src0,
	const mlib_s16 *src1,
	const mlib_s16 *src2,
	mlib_s32 n)
{
/* last element of dst0 that may be written */
	mlib_s16 *dend = dst0 + 2 * n - 1;
	mlib_d64 *dp0 = (mlib_d64 *)dst0;
	mlib_d64 *dp1 = (mlib_d64 *)dst1;
	mlib_d64 *sp0 = (mlib_d64 *)src0;
	mlib_d64 *sp1 = (mlib_d64 *)src1;
	mlib_d64 *sp2 = (mlib_d64 *)src2;
/*
 * per source row r: dr1 = current word, dr3 = next word,
 * dr0 / dr2 = vis_faligndata results built from neighbouring words
 */
	mlib_d64 d00, d01, d02, d03;
	mlib_d64 d10, d11, d12, d13;
	mlib_d64 d20, d21, d22, d23;
	mlib_d64 ac00, ac01, ac02, ac03, ac04, ac05, ac06, ac07;
	mlib_d64 ac10, ac11, ac12, ac13, ac14, ac15, ac16, ac17;
	mlib_d64 ac20, ac21, ac22, ac23, ac24, ac25, ac26, ac27;
/* weight pairs packed as two 16-bit values each */
	mlib_f32 f13 = vis_to_float(0x10003);
	mlib_f32 f31 = vis_to_float(0x30001);
	mlib_f32 f39 = vis_to_float(0x30009);
	mlib_f32 f93 = vis_to_float(0x90003);
/* rounding terms added to the src1 partial sums */
	mlib_d64 d87 = vis_to_double(8, 7);
	mlib_s32 i, emask;

	if (n <= 0)
		return (MLIB_FAILURE);

/*
 * GSR = (12 << 3) + 2: presumably scale factor 12 for
 * vis_fpackfix_pair and byte offset 2 for vis_faligndata - confirm
 * against the VIS manual.
 */
	vis_write_gsr((12 << 3) + 2);

/* pass 1 prologue: load the first word of each row */
	d01 = vis_ld_d64_nf(sp0);
	d11 = vis_ld_d64_nf(sp1);
	d21 = vis_ld_d64_nf(sp2);
	sp0++;
	sp1++;
	sp2++;
	d00 = vis_faligndata(d00, d01);
	d10 = vis_faligndata(d10, d11);
	d20 = vis_faligndata(d20, d21);

/* pass 1: even output words */
#pragma pipeloop(0)
	for (i = 0; i <= n - 4; i += 4) {
		d03 = vis_ld_d64_nf(sp0);
		d13 = vis_ld_d64_nf(sp1);
		d23 = vis_ld_d64_nf(sp2);
		sp0++;
		sp1++;
		sp2++;
		d02 = vis_faligndata(d01, d03);
		d12 = vis_faligndata(d11, d13);
		d22 = vis_faligndata(d21, d23);

/* src1 contribution plus rounding, shared by both output rows */
		ac10 = vis_fmuld8ulx16(f39, vis_read_lo(d10));
		ac12 = vis_fmuld8ulx16(f39, vis_read_hi(d11));
		ac10 = vis_fpadd32(ac10, d87);
		ac12 = vis_fpadd32(ac12, d87);
		ac11 = vis_fmuld8ulx16(f93, vis_read_hi(d11));
		ac13 = vis_fmuld8ulx16(f93, vis_read_hi(d12));
		ac10 = vis_fpadd32(ac10, ac11);
		ac12 = vis_fpadd32(ac12, ac13);

/* src0 contribution, combined with src1 for dst0 */
		ac00 = vis_fmuld8ulx16(f13, vis_read_lo(d00));
		ac01 = vis_fmuld8ulx16(f31, vis_read_hi(d01));
		ac02 = vis_fmuld8ulx16(f13, vis_read_hi(d01));
		ac03 = vis_fmuld8ulx16(f31, vis_read_hi(d02));
		ac00 = vis_fpadd32(ac00, ac01);
		ac02 = vis_fpadd32(ac02, ac03);
		ac00 = vis_fpadd32(ac10, ac00);
		ac02 = vis_fpadd32(ac12, ac02);

/* src2 contribution, combined with src1 for dst1 */
		ac20 = vis_fmuld8ulx16(f13, vis_read_lo(d20));
		ac21 = vis_fmuld8ulx16(f31, vis_read_hi(d21));
		ac22 = vis_fmuld8ulx16(f13, vis_read_hi(d21));
		ac23 = vis_fmuld8ulx16(f31, vis_read_hi(d22));
		ac20 = vis_fpadd32(ac20, ac21);
		ac22 = vis_fpadd32(ac22, ac23);
		ac20 = vis_fpadd32(ac10, ac20);
		ac22 = vis_fpadd32(ac12, ac22);

		dp0[0] = vis_fpackfix_pair(ac00, ac02);
		dp1[0] = vis_fpackfix_pair(ac20, ac22);

/* skip the odd word; pass 2 fills it in */
		dp0 += 2;
		dp1 += 2;
		d00 = d02;
		d01 = d03;
		d10 = d12;
		d11 = d13;
		d20 = d22;
		d21 = d23;
	}

/* pass 2 prologue: rewind all pointers and reload the first words */
	dp0 = (mlib_d64 *)dst0;
	dp1 = (mlib_d64 *)dst1;
	sp0 = (mlib_d64 *)src0;
	sp1 = (mlib_d64 *)src1;
	sp2 = (mlib_d64 *)src2;
	d01 = vis_ld_d64_nf(sp0);
	d11 = vis_ld_d64_nf(sp1);
	d21 = vis_ld_d64_nf(sp2);
	sp0++;
	sp1++;
	sp2++;
	d00 = vis_faligndata(d00, d01);
	d10 = vis_faligndata(d10, d11);
	d20 = vis_faligndata(d20, d21);

/* pass 2: odd output words */
#pragma pipeloop(0)
	for (i = 0; i <= n - 4; i += 4) {
		d03 = vis_ld_d64_nf(sp0);
		d13 = vis_ld_d64_nf(sp1);
		d23 = vis_ld_d64_nf(sp2);
		sp0++;
		sp1++;
		sp2++;
		d02 = vis_faligndata(d01, d03);
		d12 = vis_faligndata(d11, d13);
		d22 = vis_faligndata(d21, d23);

/* src1 contribution plus rounding, shared by both output rows */
		ac14 = vis_fmuld8ulx16(f39, vis_read_hi(d12));
		ac16 = vis_fmuld8ulx16(f39, vis_read_lo(d11));
		ac14 = vis_fpadd32(ac14, d87);
		ac16 = vis_fpadd32(ac16, d87);
		ac15 = vis_fmuld8ulx16(f93, vis_read_lo(d11));
		ac17 = vis_fmuld8ulx16(f93, vis_read_lo(d12));
		ac14 = vis_fpadd32(ac14, ac15);
		ac16 = vis_fpadd32(ac16, ac17);

/* src0 contribution, combined with src1 for dst0 */
		ac04 = vis_fmuld8ulx16(f13, vis_read_hi(d02));
		ac05 = vis_fmuld8ulx16(f31, vis_read_lo(d01));
		ac06 = vis_fmuld8ulx16(f13, vis_read_lo(d01));
		ac07 = vis_fmuld8ulx16(f31, vis_read_lo(d02));
		ac04 = vis_fpadd32(ac04, ac05);
		ac06 = vis_fpadd32(ac06, ac07);
		ac04 = vis_fpadd32(ac14, ac04);
		ac06 = vis_fpadd32(ac16, ac06);

/* src2 contribution, combined with src1 for dst1 */
		ac24 = vis_fmuld8ulx16(f13, vis_read_hi(d22));
		ac25 = vis_fmuld8ulx16(f31, vis_read_lo(d21));
		ac26 = vis_fmuld8ulx16(f13, vis_read_lo(d21));
		ac27 = vis_fmuld8ulx16(f31, vis_read_lo(d22));
		ac24 = vis_fpadd32(ac24, ac25);
		ac26 = vis_fpadd32(ac26, ac27);
		ac24 = vis_fpadd32(ac14, ac24);
		ac26 = vis_fpadd32(ac16, ac26);

		dp0[1] = vis_fpackfix_pair(ac04, ac06);
		dp1[1] = vis_fpackfix_pair(ac24, ac26);

		dp0 += 2;
		dp1 += 2;
		d00 = d02;
		d01 = d03;
		d10 = d12;
		d11 = d13;
		d20 = d22;
		d21 = d23;
	}

/*
 * Tail: dp0/dp1 and the d-registers are in the state left by pass 2.
 * The same arithmetic as in the two passes is repeated once, with
 * masked stores.
 */
	if ((mlib_s16 *)dp0 <= dend) {
		d02 = vis_faligndata(d01, d03);
		d12 = vis_faligndata(d11, d13);
		d22 = vis_faligndata(d21, d23);

		ac10 = vis_fmuld8ulx16(f39, vis_read_lo(d10));
		ac12 = vis_fmuld8ulx16(f39, vis_read_hi(d11));
		ac10 = vis_fpadd32(ac10, d87);
		ac12 = vis_fpadd32(ac12, d87);
		ac11 = vis_fmuld8ulx16(f93, vis_read_hi(d11));
		ac13 = vis_fmuld8ulx16(f93, vis_read_hi(d12));
		ac10 = vis_fpadd32(ac10, ac11);
		ac12 = vis_fpadd32(ac12, ac13);

		ac00 = vis_fmuld8ulx16(f13, vis_read_lo(d00));
		ac01 = vis_fmuld8ulx16(f31, vis_read_hi(d01));
		ac02 = vis_fmuld8ulx16(f13, vis_read_hi(d01));
		ac03 = vis_fmuld8ulx16(f31, vis_read_hi(d02));
		ac00 = vis_fpadd32(ac00, ac01);
		ac02 = vis_fpadd32(ac02, ac03);
		ac00 = vis_fpadd32(ac10, ac00);
		ac02 = vis_fpadd32(ac12, ac02);

		ac20 = vis_fmuld8ulx16(f13, vis_read_lo(d20));
		ac21 = vis_fmuld8ulx16(f31, vis_read_hi(d21));
		ac22 = vis_fmuld8ulx16(f13, vis_read_hi(d21));
		ac23 = vis_fmuld8ulx16(f31, vis_read_hi(d22));
		ac20 = vis_fpadd32(ac20, ac21);
		ac22 = vis_fpadd32(ac22, ac23);
		ac20 = vis_fpadd32(ac10, ac20);
		ac22 = vis_fpadd32(ac12, ac22);

/* even tail word, limited by the edge mask */
		ac00 = vis_fpackfix_pair(ac00, ac02);
		ac20 = vis_fpackfix_pair(ac20, ac22);
		emask = vis_edge16(dp0, dend);
		vis_pst_16(ac00, dp0, emask);
		vis_pst_16(ac20, dp1, emask);
		dp0++;
		dp1++;

/* odd tail word only if the row extends into it */
		if ((mlib_s16 *)dp0 <= dend) {
			ac14 = vis_fmuld8ulx16(f39, vis_read_hi(d12));
			ac16 = vis_fmuld8ulx16(f39, vis_read_lo(d11));
			ac14 = vis_fpadd32(ac14, d87);
			ac16 = vis_fpadd32(ac16, d87);
			ac15 = vis_fmuld8ulx16(f93, vis_read_lo(d11));
			ac17 = vis_fmuld8ulx16(f93, vis_read_lo(d12));
			ac14 = vis_fpadd32(ac14, ac15);
			ac16 = vis_fpadd32(ac16, ac17);

			ac04 = vis_fmuld8ulx16(f13, vis_read_hi(d02));
			ac05 = vis_fmuld8ulx16(f31, vis_read_lo(d01));
			ac06 = vis_fmuld8ulx16(f13, vis_read_lo(d01));
			ac07 = vis_fmuld8ulx16(f31, vis_read_lo(d02));
			ac04 = vis_fpadd32(ac04, ac05);
			ac06 = vis_fpadd32(ac06, ac07);
			ac04 = vis_fpadd32(ac14, ac04);
			ac06 = vis_fpadd32(ac16, ac06);

			ac24 = vis_fmuld8ulx16(f13, vis_read_hi(d22));
			ac25 = vis_fmuld8ulx16(f31, vis_read_lo(d21));
			ac26 = vis_fmuld8ulx16(f13, vis_read_lo(d21));
			ac27 = vis_fmuld8ulx16(f31, vis_read_lo(d22));
			ac24 = vis_fpadd32(ac24, ac25);
			ac26 = vis_fpadd32(ac26, ac27);
			ac24 = vis_fpadd32(ac14, ac24);
			ac26 = vis_fpadd32(ac16, ac26);

			ac04 = vis_fpackfix_pair(ac04, ac06);
			ac24 = vis_fpackfix_pair(ac24, ac26);
			emask = vis_edge16(dp0, dend);
			vis_pst_16(ac04, dp0, emask);
			vis_pst_16(ac24, dp1, emask);
		}
	}

/*
 * Row ends have no outer neighbour, so the first and last output of
 * each row are recomputed from the edge source element alone
 * (rounding 8 for the first element, 7 for the last).
 */
	dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4;
	dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4;
	dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4;
	dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4;

	return (MLIB_SUCCESS);
}
Beispiel #10
0
mlib_status
mlib_v_ImageAdd_S16(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
	mlib_s32 i, j, k;
	mlib_s32 offdst, offsrc1, offsrc2, emask;
	mlib_s32 amount;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
	mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
	mlib_s16 *dend;

	VALIDATE(mlib_s16);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

	amount = width * channels;

	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 3) == 0) &&
	    (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd20 = vis_ld_d64_nf(spp2);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd20 = sd21;
				spp2++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd10 = vis_ld_d64_nf(spp1);

			if (emask != 0xf) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd10 = sd11;
				spp1++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd10 = sd11;
				spp1++;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd20 = (*spp2++);
				sd10 = vis_faligndata(sd10,
					vis_ld_d64_nf(spp1 + 1));
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the source addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the destination addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd10 = vis_ld_d64_nf(spp1); spp1++;
			sd20 = vis_ld_d64_nf(spp2); spp2++;
			MLIB_V_ADDIMAGE_S16(sd10, sd20, dd0);

			if (emask != 0xf) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp++, emask);
				dd0 = dd1;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
				(*dpp++) = vis_faligndata(dd0, dd1);
				dd0 = dd1;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else {
/* common case */

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			if (emask != 0xf) {
				spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
				sd10 = vis_faligndata(vis_ld_d64_nf(spp1),
					vis_ld_d64_nf(spp1 + 1));
				spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
				sd20 = vis_faligndata(vis_ld_d64_nf(spp2),
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}

/* copy src1 to dst */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			sd11 = vis_ld_d64_nf(spp1);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (k = i; k <= (amount - 4); k += 4) {
				sd10 = sd11;
				sd11 = vis_ld_d64_nf(spp1 + 1);
				(*tmp_ptr++) = vis_faligndata(sd10, sd11);
				spp1++;
			}

			sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1));

			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
			sd20 = vis_ld_d64_nf(spp2);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*tmp_ptr++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_S16(sd11, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	}

	return (MLIB_SUCCESS);
}
/*
 * Apply the emphasize filter described by `filter` to an interleaved
 * two-channel S16 signal.
 *
 *   dst    - output buffer; 2 * n elements are written
 *   src    - input buffer; 2 * n elements are read
 *   filter - pointer to an mlib_emphasize_struct whose `type` must be
 *            MLIB_EMPH; its v16_last0 / v16_last1 fields carry the last
 *            sample pair of the previous call and are refreshed on exit
 *   n      - number of sample pairs
 *
 * Returns MLIB_FAILURE when filter, src or dst is NULL, when n <= 0 or
 * when the filter has the wrong type; MLIB_SUCCESS otherwise.
 *
 * The arithmetic itself lives in the MLIB_MUL8 / MLIB_MIX macros, which
 * are defined elsewhere and use the locals of this function by name
 * (NOTE(review): presumably v_mask, v_alpha, w_src, w_lsrc and dr0..dr7 -
 * confirm against the macro definitions before renaming anything here).
 */
mlib_status
__mlib_SignalEmphasize_S16S_S16S_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
	mlib_emphasize_struct *fist = filter;
	mlib_d64 w_maskand0 = vis_to_double(0xFFFFFFFF, 0xFFFF);
	mlib_d64 w_maskor0;
	mlib_d64 w_maskand1 = vis_to_double(0xFFFFFFFF, 0xFFFF0000);
	mlib_d64 w_maskor1;
	mlib_f32 v_mask	    = vis_to_float(0x80008000);
	mlib_f32 v_alpha;
	mlib_s16 *fdst;
	mlib_d64 *dpd, *dps, *dsrct1;
	mlib_d64 w_dst, w_src, w_src0, w_src1, w_src2, w_lsrc;
	mlib_d64 dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7;
	mlib_s32 i, times, t1, t2;

/* check for obvious errors */

	if ((fist == NULL) || (n <= 0) || (src == NULL) || (dst == NULL) ||
	    (fist->type != MLIB_EMPH)) {
		return (MLIB_FAILURE);
	}

/*
 * The filter state is read only after the validation above, so that a
 * NULL filter is rejected instead of being dereferenced.
 */
	w_maskor0 = vis_freg_pair(0.f, fist->v16_last0);
	w_maskor1 = vis_freg_pair(0.f, fist->v16_last1);
	v_alpha = fist->v_alpha;

/* last element of the destination */
	fdst = dst + n + n - 1;

	vis_write_gsr(1 << 3);
	w_maskor0 = vis_fand(w_maskor0, w_maskand1);
	w_maskor1 = vis_fand(w_maskor1, w_maskand0);

/* rotate the masks according to the alignment of src */
	vis_alignaddr((void *)(-(mlib_addr)src), 0);
	w_maskand0 = vis_faligndata(w_maskand0, w_maskand0);
	w_maskor0 = vis_faligndata(w_maskor0, w_maskor0);
	w_maskand1 = vis_faligndata(w_maskand1, w_maskand1);
	w_maskor1 = vis_faligndata(w_maskor1, w_maskor1);

	dpd = vis_alignaddr(dst, 0);
/* number of 8-byte destination words between the first and the last one */
	times = (mlib_d64 *)vis_alignaddr(fdst, 0) - dpd;
	t1 = -((mlib_addr)(dst) & 7);
	t2 = t1 - 4;
	dps = vis_alignaddr((void *)src, t2);
	w_src0 = vis_ld_d64_nf(dps);
	dps++;
	w_src1 = vis_ld_d64_nf(dps);
	dps++;

	if ((((mlib_addr)dst ^ (mlib_addr)src) & 7)) {
/* src and dst have different offsets inside an 8-byte word */
		if (((mlib_addr)dps - (mlib_addr)src) >= 6) {
			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand0, w_src1);
			w_src1 = vis_for(w_maskor0, w_src1);
		}

		if (((mlib_addr)dps - (mlib_addr)src) >= 8) {
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
		}

		w_lsrc = vis_faligndata(w_src0, w_src1);
		dsrct1 = vis_alignaddr((void *)src, t1);

		if (dps - 2 != dsrct1) {
			w_src2 = *dps;
			dps++;
			w_src = vis_faligndata(w_src1, w_src2);

			MLIB_MUL8;

			if ((mlib_addr)dst & 7) {
				times--;
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = *dps;
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dps++;

				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);

			MLIB_MIX;

			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);
			dps++;

			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dpd++;
				dps++;
			}
		} else {
			w_src = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;

			if ((mlib_addr)dst & 7) {
				times--;
				w_src0 = w_src1;
				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;

				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			w_src0 = w_src1;

			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			MLIB_MIX;
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			dps++;

			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;
				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;

				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;
				dpd++;
			}
		}
	} else {
/* src and dst share the same offset inside an 8-byte word */
		w_src = w_src1;

		if ((mlib_addr)src & 7) {
			times--;

			if (((mlib_addr)src & 7) == 2) {
				w_src0 = vis_fand(w_maskand0, w_src0);
				w_src0 = vis_for(w_maskor0, w_src0);
			} else {
				w_src1 = vis_fand(w_maskand0, w_src1);
				w_src1 = vis_for(w_maskor0, w_src1);
			}

			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
			w_lsrc = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;

			w_src0 = w_src1;
			w_src1 = *dps;
			w_src = w_src1;
			w_lsrc = vis_faligndata(w_src0, w_src1);
			dps++;

			MLIB_MIX;

			w_dst = vis_fpackfix_pair(dr2, dr3);
			vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
			dpd++;
		} else {
			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
			w_lsrc = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;
		}

		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);

		MLIB_MIX;

		w_src1 = w_src;
		w_dst = vis_fpackfix_pair(dr2, dr3);
		dps++;
		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);
		dps++;

		for (i = 0; i < times; i++) {
			*dpd = w_dst;

			MLIB_MIX;

			w_src1 = w_src;
			w_src = vis_ld_d64_nf(dps);
			w_lsrc = vis_faligndata(w_src1, w_src);
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			dpd++;

		}
	}

/* the last destination word is stored under an edge mask */
	if (times >= 0) {
		vis_pst_16(w_dst, dpd, vis_edge16(dpd, fdst));
	}

/* remember the last input sample pair for the next call */
	((mlib_s16 *)&fist->v16_last0)[0] = src[2 * n - 2];
	((mlib_s16 *)&fist->v16_last1)[1] = src[2 * n - 1];

	return (MLIB_SUCCESS);
}
/*
 * Table look-up over one row of S16 elements using three tables that are
 * applied in rotation (table0, table1, table2, table0, ...).
 *
 *   src    - source elements, used as (element << 1) byte offsets into
 *            the tables
 *   dst    - destination row; written through an mlib_d64 pointer
 *            without any alignment fix-up
 *   xsize  - number of elements in the row
 *   table0, table1, table2 - tables for the first, second and third
 *            element of each triple
 */
void
mlib_v_ImageLookUp_S16_S16_3_D1(
    const mlib_s16 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 *table0,
    const mlib_s16 *table1,
    const mlib_s16 *table2)
{
/* read cursor over the source elements */
	mlib_s16 *in = (void *)src;

/* last destination element of the row */
	mlib_s16 *last = dst + xsize - 1;

/* 8-byte store cursor over the destination */
	mlib_d64 *out = (mlib_d64 *)dst;

/* byte offsets into the tables for four consecutive elements */
	mlib_s32 off0, off1, off2, off3;

/* looked-up table entries */
	mlib_d64 e0, e1, e2, e3;

/* accumulators that the entries are shifted into */
	mlib_d64 acc_lo, acc_hi;

/* elements handled so far / elements left for the tail */
	mlib_s32 done, left;

/* scratch pointer for rotating the tables */
	const mlib_s16 *spare;

	vis_alignaddr((void *)0, 6);

	if (xsize >= 4) {

/* prefetch the offsets of the first group of four */
		off0 = in[0] << 1;
		off1 = in[1] << 1;
		off2 = in[2] << 1;
		off3 = in[3] << 1;
		in += 4;

		vis_write_bmask(0x012389ab, 0);

#pragma pipeloop(0)
		for (done = 0; done <= xsize - 8; done += 4, in += 4) {
			e1 = VIS_LD_U16_I(table1, off1);
			e0 = VIS_LD_U16_I(table0, off0);
			e3 = VIS_LD_U16_I(table0, off3);
			e2 = VIS_LD_U16_I(table2, off2);
			acc_lo = vis_faligndata(e1, acc_lo);
			acc_lo = vis_faligndata(e0, acc_lo);
			acc_hi = vis_faligndata(e3, acc_hi);
			acc_hi = vis_faligndata(e2, acc_hi);

/* offsets for the next group are read before this group is stored */
			off0 = in[0] << 1;
			off1 = in[1] << 1;
			off2 = in[2] << 1;
			off3 = in[3] << 1;
			*out = vis_bshuffle(acc_lo, acc_hi);
			out++;

/* four elements consumed: rotate the three tables by one */
			spare = table0;
			table0 = table1;
			table1 = table2;
			table2 = spare;
		}

/* the last prefetched group */
		e1 = VIS_LD_U16_I(table1, off1);
		e0 = VIS_LD_U16_I(table0, off0);
		e3 = VIS_LD_U16_I(table0, off3);
		e2 = VIS_LD_U16_I(table2, off2);
		acc_lo = vis_faligndata(e1, acc_lo);
		acc_lo = vis_faligndata(e0, acc_lo);
		acc_hi = vis_faligndata(e3, acc_hi);
		acc_hi = vis_faligndata(e2, acc_hi);
		*out = vis_bshuffle(acc_lo, acc_hi);
		out++;

		spare = table0;
		table0 = table1;
		table1 = table2;
		table2 = spare;
	}

	if ((mlib_addr)out <= (mlib_addr)last) {

/* one to three elements remain; walk them from the last to the first */
		left = (mlib_s16 *)last - (mlib_s16 *)out;
		in += left;
		left++;

		switch (left) {
		case 3:
			off0 = (mlib_s32)*in;
			in--;
			e0 = VIS_LD_U16_I(table2, off0 << 1);
			acc_lo = vis_faligndata(e0, acc_lo);
			/* FALLTHROUGH */
		case 2:
			off0 = (mlib_s32)*in;
			in--;
			e0 = VIS_LD_U16_I(table1, off0 << 1);
			acc_lo = vis_faligndata(e0, acc_lo);
			/* FALLTHROUGH */
		case 1:
			off0 = (mlib_s32)*in;
			in--;
			e0 = VIS_LD_U16_I(table0, off0 << 1);
			acc_lo = vis_faligndata(e0, acc_lo);
			break;
		}

/* partial store limited to the elements up to `last` */
		vis_pst_16(acc_lo, out, vis_edge16(out, last));
	}
}
/*
 * Multiply an interleaved two-channel F32 signal by a symmetric ramp:
 * the weight starts at 0 at both ends of the signal and grows by
 * 2 / (n - 1) per sample pair towards the middle.
 *
 *   dst - output, 2 * n floats
 *   src - input, 2 * n floats
 *   n   - number of sample pairs
 *
 * Returns MLIB_FAILURE when dst or src is NULL or n <= 1,
 * MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_SignalMulBartlett_F32S_F32S(
    mlib_f32 *dst,
    const mlib_f32 *src,
    mlib_s32 n)
{
/* one weighted sample pair taken from the front / from the back */
	mlib_union64 front_pair, back_pair;

/* weight increment and current weight */
	mlib_d64 delta, weight;
	mlib_d64 front_cur, back_cur, front_prev, back_prev;
	mlib_s32 k, half;

/* last float of the output and of the input */
	mlib_f32 *dst_back = dst + 2 * n - 1;
	const mlib_f32 *src_back = src + 2 * n - 1;

/* 8-byte store cursors moving inwards from both ends */
	mlib_d64 *out_front, *out_back;

	if ((src == NULL) || (dst == NULL) || (n <= 1))
		return (MLIB_FAILURE);

	delta = 2. / (n - 1);
	weight = 0.;

	if ((mlib_addr)dst & 7) {

/*
 * dst is not 8-byte aligned: every 8-byte store is assembled with
 * vis_faligndata from the pair of the previous iteration and the
 * current one.
 */
		out_front = vis_alignaddr((void *)dst, 0);
		out_back = (mlib_d64 *)dst_back;

		front_prev = vis_to_double_dup(0);
		back_prev = vis_to_double_dup(0);

/* the outermost pairs have weight 0; skip them on the input side */
		src += 2;
		src_back -= 2;

/* partial stores of zero into the first and the last 8-byte word */
		vis_pst_16(front_prev, out_front, 3);
		vis_pst_16(back_prev, out_back, 12);

		out_front++;
		out_back--;
		weight += delta;
		n = n - 1;
		half = n / 2;

#pragma pipeloop(0)
		for (k = 0; k < half; k++) {

			front_pair.f32x2.i0 = weight * src[0];
			front_pair.f32x2.i1 = weight * src[1];
			src += 2;

			back_pair.f32x2.i1 = weight * src_back[0];
			back_pair.f32x2.i0 = weight * src_back[-1];
			src_back -= 2;

			front_cur = front_pair.d64;
			back_cur = back_pair.d64;

			*out_front++ = vis_faligndata(front_prev, front_cur);
			*out_back-- = vis_faligndata(back_cur, back_prev);

			front_prev = front_cur;
			back_prev = back_cur;

			weight += delta;
		}

		if (n & 1) {
/* two floats in the middle are still missing; they use the last weight */
			dst[2 * k + 1] = src[-1] * (weight - delta);
			dst[2 * k + 2] = src[0] * (weight - delta);
		}
	} else {

/* dst is 8-byte aligned: every weighted pair is stored directly */
		out_front = (mlib_d64 *)dst;
		out_back = (mlib_d64 *)(dst_back - 1);
		half = n / 2;

#pragma pipeloop(0)
		for (k = 0; k < half; k++) {

			front_pair.f32x2.i0 = weight * src[0];
			front_pair.f32x2.i1 = weight * src[1];

			back_pair.f32x2.i1 = weight * src_back[0];
			back_pair.f32x2.i0 = weight * src_back[-1];

			*out_front++ = front_pair.d64;
			*out_back-- = back_pair.d64;

			src += 2;
			src_back -= 2;
			weight += delta;
		}

		if (n & 1) {
/* odd n: the middle sample pair is handled on its own */
			dst[2 * k] = src[0] * weight;
			dst[2 * k + 1] = src[1] * weight;
		}
	}

	return (MLIB_SUCCESS);
}
/*
 * In-place variant: multiply an interleaved two-channel F32 signal by a
 * symmetric ramp whose weight starts at 0 at both ends and grows by
 * 2 / (n - 1) per sample pair towards the middle.
 *
 *   data - 2 * n floats, read and overwritten
 *   n    - number of sample pairs
 *
 * Returns MLIB_FAILURE when data is NULL or n <= 1,
 * MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_SignalMulBartlett_F32S(
    mlib_f32 *data,
    mlib_s32 n)
{
	mlib_union64 tmp_1, tmp_2;
	mlib_d64 step, base, tmp1, tmp2;
	mlib_d64 tmp1_new, tmp2_new, tmp1_last, tmp2_last;
	mlib_s32 n1, i;

/* last float of the signal */
	mlib_f32 *data2 = data + 2 * n - 1;
	mlib_d64 *pdata, *pdata2;
	mlib_u8 emask1, emask2;

	if ((data == NULL) || (n <= 1))
		return (MLIB_FAILURE);

	n1 = n - 1;
	step = 2. / n1;
	base = 0.;

	if ((mlib_addr)data & 7) {

/*
 * data is not 8-byte aligned: every 8-byte store is assembled with
 * vis_faligndata from the pair of the previous iteration and the
 * current one.
 */
		pdata = vis_alignaddr((void *)data, 0);
		pdata2 = (mlib_d64 *)data2;
		emask1 = 3;
		emask2 = 12;

		tmp1_last = vis_to_double_dup(0);
		data += 2;
		tmp2_last = vis_to_double_dup(0);
		data2 -= 2;

/* partial stores of zero into the first and the last 8-byte word */
		vis_pst_16(tmp1_last, pdata, emask1);
		vis_pst_16(tmp2_last, pdata2, emask2);

		pdata++;
		pdata2--;
		base += step;
		n = n - 1;

#pragma pipeloop(0)
		for (i = 0; i < n / 2; i++) {

			tmp_1.f32x2.i0 = base * data[0];
			tmp_1.f32x2.i1 = base * data[1];
			tmp1 = tmp_1.d64;
			data += 2;

			tmp_2.f32x2.i1 = base * data2[0];
			data2--;
			tmp_2.f32x2.i0 = base * data2[0];
			tmp2 = tmp_2.d64;
			data2--;

			tmp1_new = vis_faligndata(tmp1_last, tmp1);
			tmp1_last = tmp1;

			tmp2_new = vis_faligndata(tmp2, tmp2_last);
			tmp2_last = tmp2;

			pdata[0] = tmp1_new;
			pdata++;
			pdata2[0] = tmp2_new;
			pdata2--;

			base += step;
		}

		if (n & 1) {
/*
 * Two floats in the middle are still unscaled: the second channel of
 * the last pair taken from the front and the first channel of the next
 * one.  Both use the weight of the last iteration.
 */
			data--;
			data[0] = data[0] * (base - step);
			data[1] = data[1] * (base - step);
		}
	} else {

/* data is 8-byte aligned: every weighted pair is stored directly */
		pdata = (mlib_d64 *)data;
		pdata2 = (mlib_d64 *)(data2 - 1);

#pragma pipeloop(0)
		for (i = 0; i < n / 2; i++) {

			tmp_1.f32x2.i0 = base * data[0];
			tmp_1.f32x2.i1 = base * data[1];
			tmp1 = tmp_1.d64;

			tmp_2.f32x2.i1 = base * data2[0];
			data2--;
			tmp_2.f32x2.i0 = base * data2[0];
			tmp2 = tmp_2.d64;

			pdata[0] = tmp1;
			pdata++;
			pdata2[0] = tmp2;
			pdata2--;
			data2--;
			data += 2;

			base += step;
		}

		if (n & 1) {
/*
 * Odd n: `data` already points at the middle sample pair, so it is
 * scaled in place.  (Stepping back by one float here, as the unaligned
 * branch has to, would scale the wrong two elements.)
 */
			data[0] = data[0] * base;
			data[1] = data[1] * base;
		}
	}

	return (MLIB_SUCCESS);
}
/* Example #15 */
/* 0 */
/*
 * Row-by-row absolute value of an S16 image.
 *
 *   dst, src - first element of the destination / source image
 *   dlb, slb - destination / source line strides; they are halved
 *              (>> 1) to step the S16 pointers from row to row
 *   wid      - number of elements per row
 *   hgt      - number of rows
 *
 * Always returns MLIB_SUCCESS.
 *
 * The per-word work is done by the READ_PXLS_UNALIGN, READ_PXLS_ALIGN,
 * CALC_ABS_S16, CALC_ABS_S16_UNROLL and STORE_ABS_VALUES macros, which
 * are defined elsewhere and use the locals of this function by name
 * (NOTE(review): presumably sp, dp, prev, curr0, adat0, dabs, dtwo and
 * mask - confirm against the macro definitions before renaming).
 */
mlib_status
mlib_ImageAbs_S16(
    mlib_s16 *dst,
    mlib_s16 *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
{
/* 8-byte aligned src, dst ptrs */
	mlib_d64 *sp, *dp;

/* unaligned data */
	mlib_d64 prev;
	mlib_d64 curr0;
	mlib_d64 curr1, curr2;

/* aligned data */
	mlib_d64 adat0;

/* absolute values of result */
	mlib_d64 dabs;

/* aligned data */
	mlib_d64 adat1, adat2, adat3;

/* absolute values of result */
	mlib_d64 dabs0, dabs1, dabs2, dabs3;

/* constants; not referenced directly below, presumably used by the macros */
	mlib_d64 dtwo = vis_to_double_dup(0x20002);
	mlib_d64 mask = vis_to_double_dup(0x80008000);

/* last pixel of line */
	mlib_s16 *dlast;

/* bit mask results of comp */
	mlib_s32 mask0;

/* pxl count of source line */
	mlib_s32 slpxl = slb >> 1;

/* pxl count of destination line */
	mlib_s32 dlpxl = dlb >> 1;

/* dst offset for address alignment */
	mlib_s32 doffs;
	mlib_s32 row, block;

/* full blocks, each of N d64s */
	mlib_s32 numblocks;

	for (row = 0; row < hgt; row++) {

/* ROW SETUP */

/* last dst pixel in row */
		dlast = dst + wid - 1;
		doffs = (mlib_addr)dst & 7;
/* aligned dest ptr */
		dp = (mlib_d64 *)((mlib_addr)dst & ~7);
/* aligned src ptr */
		sp = (mlib_d64 *)vis_alignaddr(src, -(mlib_s32)doffs);
		prev = *sp;

/* FIRST d64 NEEDS EDGE MASK FOR DESTINATION START POINT */

/* edge mask for start point */
		mask0 = vis_edge16(dst, dlast);
		READ_PXLS_UNALIGN;
		CALC_ABS_S16;
		vis_pst_16(dabs, dp++, mask0);
/* number of whole 8-byte words left between dp and the end of the row */
		numblocks = ((mlib_u8 *)dlast + 1 - (mlib_u8 *)dp) >> 3;

/* DO MOST OF ROW IN BLOCKS OF N d64s */

		if ((((mlib_addr)src ^ (mlib_addr)dst) & 7) == 0) {
/* src and dst share the same offset inside an 8-byte word: direct loads */
#pragma pipeloop(0)
			for (block = 0; block < numblocks - 3; block += 4) {
				adat0 = sp[0];
				adat1 = sp[1];
				adat2 = sp[2];

				CALC_ABS_S16_UNROLL(dabs0, adat0);
				adat3 = sp[3];
				dp[0] = dabs0;
				CALC_ABS_S16_UNROLL(dabs1, adat1);
				dp[1] = dabs1;
				CALC_ABS_S16_UNROLL(dabs2, adat2);
				dp[2] = dabs2;
				CALC_ABS_S16_UNROLL(dabs3, adat3);
				dp[3] = dabs3;

				sp += 4;
				dp += 4;
			}

/* remaining whole words, one at a time */
#pragma pipeloop(0)
			for (; block < numblocks; block++) {
				READ_PXLS_ALIGN;
				CALC_ABS_S16;
				STORE_ABS_VALUES;
			}

/* reload prev for the vis_faligndata in the end-point handling below */
			prev = *sp;
		} else {
/* different offsets: each word is assembled from two neighbouring loads */
#pragma pipeloop(0)
			for (block = 0; block < numblocks - 3; block += 4) {
				curr0 = *(sp + 1);
				curr1 = *(sp + 2);
				curr2 = *(sp + 3);

				adat0 = vis_faligndata(prev, curr0);
				prev = *(sp + 4);
				CALC_ABS_S16_UNROLL(dabs0, adat0);
				dp[0] = dabs0;
				adat1 = vis_faligndata(curr0, curr1);
				CALC_ABS_S16_UNROLL(dabs1, adat1);
				dp[1] = dabs1;
				adat2 = vis_faligndata(curr1, curr2);
				CALC_ABS_S16_UNROLL(dabs2, adat2);
				dp[2] = dabs2;
				adat3 = vis_faligndata(curr2, prev);
				CALC_ABS_S16_UNROLL(dabs3, adat3);
				dp[3] = dabs3;

				sp += 4;
				dp += 4;
			}

/* remaining whole words, one at a time */
#pragma pipeloop(0)
			for (; block < numblocks; block++) {
				READ_PXLS_UNALIGN;
				CALC_ABS_S16;
				STORE_ABS_VALUES;
			}
		}

/* LAST d64 NEEDS EDGE MASK FOR DESTINATION END POINT */

		if ((mlib_addr)dp <= (mlib_addr)dlast) {
			curr0 = *(++sp);
/* edge mask for end point */
			mask0 = vis_edge16(dp, dlast);
			adat0 = vis_faligndata(prev, curr0);
			CALC_ABS_S16;
			vis_pst_16(dabs, dp, mask0);
		}

/* ptrs to next src row */
		src += slpxl;
/* ptrs to next dst row */
		dst += dlpxl;
	}

	return (MLIB_SUCCESS);
}
/*
 * Single-channel U16 source to two-channel U8 destination look-up over
 * one row: every source element s produces the byte pair
 * (table[0][s], table[1][s]).
 *
 *   src   - source elements
 *   dst   - destination; written through an mlib_d64 pointer without any
 *           alignment fix-up
 *   xsize - number of source elements
 *   table - table[0] and table[1] are the two look-up tables
 */
void mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1(const mlib_u16 *src,
                                           mlib_u8        *dst,
                                           mlib_s32       xsize,
                                           const mlib_u8  **table)
{
  const mlib_u8 *lut0 = table[0];      /* table for the first output byte */
  const mlib_u8 *lut1 = table[1];      /* table for the second output byte */
  mlib_u16 *in = (void *)src;          /* read cursor over the source */
  mlib_u16 *first = (mlib_u16 *) dst;  /* destination viewed as byte pairs */
  mlib_u16 *last = first + xsize - 1;  /* last destination byte pair */
  mlib_d64 *out = (mlib_d64 *) first;  /* 8-byte store cursor */
  mlib_s32 p0, p1, p2, p3;             /* four consecutive source values */
  mlib_d64 entry;                      /* one looked-up byte */
  mlib_d64 packed;                     /* bytes shifted in one at a time */
  mlib_s32 k, left;                    /* loop counter / tail length */

  vis_alignaddr((void *)0, 7);

  if (xsize >= 4) {

    /* prefetch the first group of four source values */
    p0 = in[0];
    p1 = in[1];
    p2 = in[2];
    p3 = in[3];
    in += 4;

#pragma pipeloop(0)
    for (k = 0; k <= xsize - 8; k += 4, in += 4) {
      /* bytes are shifted in from the last one to the first one */
      entry = VIS_LD_U8_I(lut1, p3);
      packed = vis_faligndata(entry, packed);
      entry = VIS_LD_U8_I(lut0, p3);
      packed = vis_faligndata(entry, packed);
      entry = VIS_LD_U8_I(lut1, p2);
      packed = vis_faligndata(entry, packed);
      entry = VIS_LD_U8_I(lut0, p2);
      packed = vis_faligndata(entry, packed);
      entry = VIS_LD_U8_I(lut1, p1);
      packed = vis_faligndata(entry, packed);
      entry = VIS_LD_U8_I(lut0, p1);
      packed = vis_faligndata(entry, packed);
      entry = VIS_LD_U8_I(lut1, p0);
      packed = vis_faligndata(entry, packed);
      entry = VIS_LD_U8_I(lut0, p0);
      packed = vis_faligndata(entry, packed);

      /* values for the next group are read before this group is stored */
      p0 = in[0];
      p1 = in[1];
      p2 = in[2];
      p3 = in[3];
      *out = packed;
      out++;
    }

    /* the last prefetched group */
    entry = VIS_LD_U8_I(lut1, p3);
    packed = vis_faligndata(entry, packed);
    entry = VIS_LD_U8_I(lut0, p3);
    packed = vis_faligndata(entry, packed);
    entry = VIS_LD_U8_I(lut1, p2);
    packed = vis_faligndata(entry, packed);
    entry = VIS_LD_U8_I(lut0, p2);
    packed = vis_faligndata(entry, packed);
    entry = VIS_LD_U8_I(lut1, p1);
    packed = vis_faligndata(entry, packed);
    entry = VIS_LD_U8_I(lut0, p1);
    packed = vis_faligndata(entry, packed);
    entry = VIS_LD_U8_I(lut1, p0);
    packed = vis_faligndata(entry, packed);
    entry = VIS_LD_U8_I(lut0, p0);
    packed = vis_faligndata(entry, packed);
    *out = packed;
    out++;
  }

  if ((mlib_addr) out <= (mlib_addr) last) {

    /* the remaining elements are walked from the last to the first */
    left = (mlib_u16 *) last - (mlib_u16 *) out;
    in += left;
    left++;
#pragma pipeloop(0)
    for (k = left; k > 0; k--) {
      p0 = (mlib_s32) * in;
      in--;

      entry = VIS_LD_U8_I(lut1, p0);
      packed = vis_faligndata(entry, packed);

      entry = VIS_LD_U8_I(lut0, p0);
      packed = vis_faligndata(entry, packed);
    }

    /* partial store limited to the byte pairs up to `last` */
    vis_pst_16(packed, out, vis_edge16(out, last));
  }
}
/*
 * Write the complex conjugate of the S16 complex vector xx into zz in
 * reversed element order, saturating the negated imaginary part
 * (MLIB_S16_MIN becomes MLIB_S16_MAX).
 *
 *   zz - output, n complex elements (2 * n mlib_s16 values)
 *   xx - input, n complex elements stored as (re, im) pairs
 *   n  - number of complex elements
 *
 * The source is walked forwards while the destination is filled
 * backwards from &z[2 * n].
 *
 * CHECK, CONJREVC and CONJ16 are macros defined elsewhere.
 * NOTE(review): CHECK presumably validates the arguments and CONJREVC
 * presumably handles short vectors and returns - confirm.  CONJ16 uses
 * locals of this function by name (presumably d3 as input and d4 as
 * output, plus dlog0, dtwo, f_two, dl, dh), so none of them may be
 * renamed.
 */
mlib_status
__mlib_VectorConjRev_S16C_S16C_Sat(
	mlib_s16 *zz,
	const mlib_s16 *xx,
	mlib_s32 n)
{
	mlib_s16 *x = (mlib_s16 *)xx, *z = (mlib_s16 *)zz;
/* src moves forwards from the start of x, dst backwards from the end of z */
	mlib_s16 *src = (mlib_s16 *)x, *dst = (mlib_s16 *)&z[2 * n];
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
	mlib_d64 dlog0 = vis_to_double_dup(0x0000ffff), dtwo =
		vis_to_double(0, 2);
	mlib_f32 f_two = vis_to_float(0x20002);
	mlib_s16 c;
/* length counts mlib_s16 values that are still to be processed */
	mlib_s32 i, rest_64, len_64, even_length, odd = 0, length =
		(mlib_s32)n * 2;
	mlib_s32 re_part;

	CHECK(x, z);

	if ((n < 16)) {
		CONJREVC(mlib_s16,
			MLIB_S16_MAX,
			MLIB_S16_MIN);
	}

/*
 * Scalar loop that runs until dst is 8-byte aligned.  It may stop after
 * an imaginary part has been written but before the matching real part;
 * in that case the real part is kept in re_part and odd is set.
 */
	while (((mlib_addr)dst) & 7) {

		if ((c = src[1]) == MLIB_S16_MIN)
			*--dst = MLIB_S16_MAX;
		else
			*--dst = -c;
		length -= 2;
		src += 2;

		if (((mlib_addr)dst) & 7) {
			*--dst = src[-2];
		} else {
			re_part = src[-2];
			odd = 1;
			break;
		}
	}

	vis_write_gsr(15 << 3);
	ddst = (mlib_d64 *)dst;
/* values left over after the 8-byte loops / number of 8-byte words */
	rest_64 = length & 3;
	len_64 = length >> 2;
	even_length = len_64 << 2;

	if (!odd) {

/*
 * Aligning loop finished with imaginary part. The following processing
 * starts with real part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

			dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ16;
				*--ddst = d4;
			}
		} else {

/*
 * Src address is not 8-byte aligned: each word is assembled from two
 * neighbouring loads.
 */

			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ16;
				*--ddst = d4;
			}
		}
	} else {

/*
 * Aligning loop finished with real part. The following processing
 * starts with imaginary part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

			dsrc = (mlib_d64 *)vis_alignaddr(src, 2);
/* the pending real part is carried into the first stored word */
			d_rest = vis_to_double((re_part << 16), 0);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ16;
				*--ddst = vis_faligndata(d4, d_rest);
				d_rest = d4;
			}

/* the carried remainder goes into one more word through a partial store */
			ddst--;
			d_rest = vis_faligndata(d_rest, d_rest);
			vis_pst_16(d_rest, ddst, 0x1);
		} else {

			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ16;
				*--ddst = d4;
			}

/*
 * Second pass over the words just written, realigning them with
 * vis_faligndata; the pending real part is stored at dst[-1] afterwards.
 */
			vis_write_gsr(2);
			d2 = *ddst;
			d3 = vis_faligndata(d1, d2);
			vis_pst_16(d3, (ddst - 1), 0x1);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = *(ddst + 1);
				(*ddst++) = vis_faligndata(d1, d2);
			}

			dst[-1] = re_part;
		}

		dst--;
	}

	if (!rest_64)
		return (MLIB_SUCCESS);

/* scalar tail: copy the real part, negate the imaginary part with saturation */
	for (i = 0; i < rest_64; i += 2) {
		dst[-even_length - 2 - i] = src[even_length + i];

		if ((c = src[even_length + i + 1]) == MLIB_S16_MIN)
			dst[-even_length - 2 - i + 1] = MLIB_S16_MAX;
		else
			dst[-even_length - 2 - i + 1] = -c;
	}

	return (MLIB_SUCCESS);
}
/*
 * Apply MLIB_V_IMAGESQRSHIFT_S16 to one row of S16 elements.
 *
 *   src   - source row, any alignment
 *   dst   - destination row, any alignment
 *   dsize - number of elements
 *   shift - not referenced in this function
 *           (NOTE(review): the scale factor is presumably set up by the
 *           caller - confirm)
 *
 * The destination is written in 8-byte words; the first and the last
 * word go through partial stores under an edge mask.
 * MLIB_V_IMAGESQRSHIFT_S16 is defined elsewhere and uses the rd*
 * temporaries declared below by name.
 */
void
mlib_v_ImageSqrShift_S16_D1(
    mlib_s16 *src,
    mlib_s16 *dst,
    mlib_s32 dsize,
    mlib_s32 shift)
{
/* last element of the destination row */
	mlib_s16 *dlast = dst + dsize - 1;

/* destination rounded down to an 8-byte boundary */
	mlib_d64 *out = (mlib_d64 *)((mlib_addr)dst & (~7));

/* distance (<= 0) from dst back to that boundary */
	mlib_s32 back = (mlib_addr)out - (mlib_addr)dst;

/* aligned source cursor */
	mlib_d64 *in;

/* two neighbouring source words and the word assembled from them */
	mlib_d64 older, newer, pix;

/* result word */
	mlib_d64 res;

/* temporaries used in macro */
	mlib_d64 rdhh, rdhl;
	mlib_d64 rdlh, rdll;
	mlib_d64 rdh, rdl;

/* edge mask, loop counter, number of whole words */
	mlib_s32 edge, k, words;

/* shift the source by the same amount as the destination */
	in = (mlib_d64 *)vis_alignaddr(src, back);

/* first word: partial store limited to [dst, dlast] */
	edge = vis_edge16(dst, dlast);
	older = vis_ld_d64_nf(in);
	in++;
	newer = vis_ld_d64_nf(in);
	in++;
	pix = vis_faligndata(older, newer);
	MLIB_V_IMAGESQRSHIFT_S16(pix, res);
	vis_pst_16(res, out, edge);
	out++;

/* whole 8-byte words between the first and the last one */
	words = ((mlib_u8 *)(dlast + 1) - (mlib_u8 *)out) / 8;

#pragma pipeloop(0)
	for (k = 0; k < words; k++) {
		older = newer;
		newer = vis_ld_d64_nf(in);
		in++;
		pix = vis_faligndata(older, newer);
		MLIB_V_IMAGESQRSHIFT_S16(pix, res);
		*out = res;
		out++;
	}

/* last word: partial store limited to the elements up to dlast */
	if ((mlib_addr)out <= (mlib_addr)dlast) {
		edge = vis_edge16(out, dlast);
		older = newer;
		newer = vis_ld_d64_nf(in);
		pix = vis_faligndata(older, newer);
		MLIB_V_IMAGESQRSHIFT_S16(pix, res);
		vis_pst_16(res, out, edge);
	}
}