/*
 * Nearest-neighbour 4:2:0 upsampling of one row of S16 samples.
 *
 * Every source sample is emitted twice in a row, and the resulting
 * 2 * n samples are stored to both dst0 and dst1.
 *
 *   dst0, dst1  destination rows, 2 * n samples each
 *   src         source row, n samples
 *   n           number of source samples
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): all three pointers are accessed through mlib_d64 casts,
 * so they are presumably required to be 8-byte aligned -- confirm
 * against the callers.
 */
mlib_status
__mlib_VideoUpSample420_Nearest_S16(
	mlib_s16 *dst0,
	mlib_s16 *dst1,
	const mlib_s16 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out0 = (mlib_d64 *)dst0;
	mlib_d64 *out1 = (mlib_d64 *)dst1;
	mlib_s16 *last;
	mlib_d64 quad, mix, hi_bytes, lo_bytes, pairs;
	mlib_s32 mask, k;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* address of the final destination sample of the row */
	last = dst0 + 2 * n - 1;

	/* main loop: 4 source samples in, 8 destination samples out per row */
#pragma pipeloop(0)
	for (k = 0; k <= (n - 4); k += 4) {
		quad = *in++;

		/* two byte interleaves gather the high bytes, then the low bytes */
		mix = vis_fpmerge(vis_read_hi(quad), vis_read_lo(quad));
		mix = vis_fpmerge(vis_read_hi(mix), vis_read_lo(mix));

		/* duplicate every byte so that each sample appears twice */
		lo_bytes = vis_fpmerge(vis_read_lo(mix), vis_read_lo(mix));
		hi_bytes = vis_fpmerge(vis_read_hi(mix), vis_read_hi(mix));

		pairs = vis_fpmerge(vis_read_hi(hi_bytes),
			vis_read_hi(lo_bytes));
		out1[0] = pairs;
		out0[0] = pairs;

		pairs = vis_fpmerge(vis_read_lo(hi_bytes),
			vis_read_lo(lo_bytes));
		out1[1] = pairs;
		out0[1] = pairs;

		out0 += 2;
		out1 += 2;
	}

	/* nothing left over: n was a multiple of 4 */
	if ((mlib_s16 *)out0 > last)
		return (MLIB_SUCCESS);

	/* tail: expand one more source word, store only the valid lanes */
	quad = in[0];
	mix = vis_fpmerge(vis_read_hi(quad), vis_read_lo(quad));
	mix = vis_fpmerge(vis_read_hi(mix), vis_read_lo(mix));
	lo_bytes = vis_fpmerge(vis_read_lo(mix), vis_read_lo(mix));
	hi_bytes = vis_fpmerge(vis_read_hi(mix), vis_read_hi(mix));

	pairs = vis_fpmerge(vis_read_hi(hi_bytes), vis_read_hi(lo_bytes));
	/* the mask is derived from dst0 only and reused for dst1 */
	mask = vis_edge16(out0, last);
	vis_pst_16(pairs, out0, mask);
	vis_pst_16(pairs, out1, mask);
	out0++;
	out1++;

	if ((mlib_s16 *)out0 <= last) {
		pairs = vis_fpmerge(vis_read_lo(hi_bytes),
			vis_read_lo(lo_bytes));
		mask = vis_edge16(out0, last);
		vis_pst_16(pairs, out0, mask);
		vis_pst_16(pairs, out1, mask);
	}

	return (MLIB_SUCCESS);
}
예제 #2
0
/*
 * Computes a norm of the S16 vector x[0 .. n-1] and stores it in z[0]
 * as the square root of the accumulated sums ds + ds1.
 *
 *   z  output, one mlib_d64
 *   x  input vector
 *   n  number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): NORM16 is defined outside this block.  It presumably
 * accumulates the squares of the four S16 lanes of dx into ds / ds1,
 * using dr / dr1 as scratch -- confirm against the macro definition.
 */
mlib_status
__mlib_VectorNorm_S16_Sat(
	mlib_d64 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	mlib_s16 *px = (mlib_s16 *)x;
	mlib_s16 *pxend;
	mlib_d64 *dpx, *dpxend;
	mlib_d64 dx, ds, ds1;
	/* zero-filled scratch words used to blank out lanes outside the vector */
	mlib_d64 edge[2];
	/* not referenced directly below; presumably used by NORM16 */
	type_union_mlib_d64 dr, dr1;
	mlib_s32 d_left;
	mlib_u8 emask;

	edge[0] = edge[1] = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

	ds = ds1 = 0;
	/* round the start address down to an 8-byte boundary */
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	pxend = px + n - 1;

	/*
	 * Partial-store the first aligned word into the zeroed edge[0] so
	 * that lanes not selected by the start mask stay zero.
	 */
	emask = vis_edge16(px, pxend);
	vis_pst_16(dpx[0], edge, emask);
	dx = edge[0];

	/* number of aligned words between the first and the last one */
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
	d_left = dpxend - dpx;

	for (; d_left > 0; d_left--) {
		NORM16;
		dpx++;
		dx = dpx[0];
	}

	/*
	 * Last word: apply the end mask to dx through the zeroed edge[1].
	 * When the whole vector lies in one word, dx is already the
	 * start-masked edge[0], so both masks end up applied.
	 */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge16(dpx, pxend);
		vis_pst_16(dx, edge + 1, emask);
		dx = edge[1];
		NORM16;
	}

	z[0] = mlib_sqrt(ds + ds1);
	return (MLIB_SUCCESS);
}
/*
 * Table look-up for one row of interleaved 3-channel data, S32 -> S16.
 *
 *   src     source samples; each value is used as a (signed) index
 *   dst     destination samples
 *   xsize   number of samples (not pixels) to convert
 *   table0  table for the first sample of the row
 *   table1  table for the second sample
 *   table2  table for the third sample; the pattern then repeats
 *
 * Four samples are produced per 8-byte store.  Since 4 is not a multiple
 * of 3, the three table pointers are rotated after every group so that
 * table0 always belongs to the first sample of the next group.
 *
 * NOTE(review): dst is used directly as an mlib_d64 pointer, so it is
 * presumably required to be 8-byte aligned -- confirm against callers.
 */
void mlib_v_ImageLookUp_S32_S16_3_D1(mlib_s32 *src,
                                     mlib_s16 *dst,
                                     mlib_s32 xsize,
                                     mlib_s16 *table0,
                                     mlib_s16 *table1,
                                     mlib_s16 *table2)
{
  mlib_s32 *sp;              /* pointer to source data */
  mlib_s32 s0, s1, s2, s3;   /* source data */
  mlib_s16 *dl;              /* pointer to start of destination */
  mlib_s16 *dend;            /* pointer to end of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3;   /* destination data */
  /*
   * Accumulator that the looked-up values are shifted into.  It is
   * initialized because the first vis_faligndata() reads it; the stale
   * lanes are always shifted out or masked off before being stored.
   */
  mlib_d64 acc0 = 0;
  mlib_s32 emask;            /* edge mask */
  mlib_s32 i, num;           /* loop variable */
  mlib_s16 *table;           /* scratch for rotating the table pointers */

  dl   = dst;
  sp   = src;
  dp   = (mlib_d64 *) dl;
  dend = dl + xsize - 1;

  /* align offset 6: each vis_faligndata(t, acc) shifts one S16 into acc */
  vis_alignaddr((void *) 0, 6);

  if (xsize >= 4) {

    /* preload the first group of four indices */
    s0 = sp[0];
    s1 = sp[1];
    s2 = sp[2];
    s3 = sp[3];
    sp += 4;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, sp += 4) {
      /* sample 3 of the group wraps around to the first table again */
      t3 = vis_ld_u16_i(table0, ((mlib_addr)2*s3));
      t2 = vis_ld_u16_i(table2, ((mlib_addr)2*s2));
      t1 = vis_ld_u16_i(table1, ((mlib_addr)2*s1));
      t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
      /* shift in last-to-first so that t0 ends up in the leading lane */
      acc0 = vis_faligndata(t3, acc0);
      acc0 = vis_faligndata(t2, acc0);
      acc0 = vis_faligndata(t1, acc0);
      acc0 = vis_faligndata(t0, acc0);
      /* fetch the indices for the next group */
      s0 = sp[0];
      s1 = sp[1];
      s2 = sp[2];
      s3 = sp[3];
      *dp++ = acc0;
      /* rotate tables: the next group starts one channel later */
      table = table0; table0 = table1;
      table1 = table2; table2 = table;
    }
    /* last full group: its indices were already loaded */
    t3 = vis_ld_u16_i(table0, ((mlib_addr)2*s3));
    t2 = vis_ld_u16_i(table2, ((mlib_addr)2*s2));
    t1 = vis_ld_u16_i(table1, ((mlib_addr)2*s1));
    t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
    acc0 = vis_faligndata(t3, acc0);
    acc0 = vis_faligndata(t2, acc0);
    acc0 = vis_faligndata(t1, acc0);
    acc0 = vis_faligndata(t0, acc0);
    *dp++ = acc0;
    table = table0; table0 = table1;
    table1 = table2; table2 = table;
  }

  /* tail: 1 to 3 samples remain; build them back to front */
  if ((mlib_addr) dp <= (mlib_addr) dend) {

    num = (mlib_s32)((mlib_s16*) dend - (mlib_s16*) dp);
    sp  += num;              /* sp now points at the last source sample */
    num ++;                  /* number of remaining samples */
    if (num == 1) {
      s0 = *sp;

      t0  = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
      acc0 = vis_faligndata(t0, acc0);
    } else if (num  == 2) {
      s0 = *sp;
      sp --;

      t0  = vis_ld_u16_i(table1, ((mlib_addr)2*s0));
      acc0 = vis_faligndata(t0, acc0);

      s0 = *sp;

      t0  = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
      acc0 = vis_faligndata(t0, acc0);
    } else if (num == 3) {
      s0 = *sp;
      sp --;

      t0  = vis_ld_u16_i(table2, ((mlib_addr)2*s0));
      acc0 = vis_faligndata(t0, acc0);

      s0 = *sp;
      sp --;

      t0  = vis_ld_u16_i(table1, ((mlib_addr)2*s0));
      acc0 = vis_faligndata(t0, acc0);

      s0 = *sp;

      t0  = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
      acc0 = vis_faligndata(t0, acc0);
    }
    /* store only the lanes that lie inside the destination row */
    emask = vis_edge16(dp, dend);
    vis_pst_16(acc0, dp, emask);
  }
}
예제 #4
0
/*
 * Element-wise addition of two S16 vectors: z = x + y for n elements.
 *
 *   z  destination vector
 *   x  first source vector
 *   y  second source vector
 *   n  number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The destination is processed in aligned 8-byte words; the first and
 * the last word are written under an edge mask.  The body is selected
 * according to how x and y are aligned once z is aligned.
 *
 * NOTE(review): ADD16_MOD is defined outside this block.  It presumably
 * adds dx and dy lane-wise into dz and partial-stores dz at dpz under
 * emask -- confirm against the macro definition.
 */
mlib_status
__mlib_VectorAdd_S16_S16_Mod(
	mlib_s16 *z,
	const mlib_s16 *x,
	const mlib_s16 *y,
	mlib_s32 n)
{
	mlib_d64 *dpz, *dpx, *dpy;
	mlib_d64 dx, dy, dz, dx0, dx1, dy0, dy1;
	mlib_s16 *pz, *px, *py, *pzend;

/* offset of address alignment in destination */
	mlib_s32 off;

/* edge masks */
	mlib_s32 emask;
	mlib_s32 len = n, i;

/* remainder and length in terms of 8-byte words */
	mlib_s32 rest_8, even_8;

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s16 *)x;
	py = (mlib_s16 *)y;
	pz = (mlib_s16 *)z;

/* round z down to an 8-byte boundary; off is zero or negative */
	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
	off = (mlib_addr)dpz - (mlib_addr)z;
	pzend = pz + n - 1;
/*
 * generate edge mask for the start point
 */
	emask = vis_edge16(pz, pzend);

/*
 * prepare the destination address
 */

/*
 * Unaligned destination: handle the first, partial word separately.
 * (8 + off) >> 1 is the number of elements it covers.
 */
	if (off) {
		dpy = (mlib_d64 *)vis_alignaddr(py, off);
		dy0 = vis_ld_d64_nf(dpy);
		dy1 = vis_ld_d64_nf(dpy + 1);
		dy = vis_faligndata(dy0, dy1);
		dpx = (mlib_d64 *)vis_alignaddr(px, off);
		dx0 = vis_ld_d64_nf(dpx);
		dx1 = vis_ld_d64_nf(dpx + 1);
		dx = vis_faligndata(dx0, dx1);
		ADD16_MOD;
		px += (8 + off) >> 1;
		py += (8 + off) >> 1;
		len -= (8 + off) >> 1;
		dpz++;
	}

	if (len <= 0)
		return (MLIB_SUCCESS);

/* full words, leftover elements, and an all-lanes mask for the main loops */
	even_8 = len >> 2;
	rest_8 = len & 0x3;
	emask = 0xf;

/*
 * Now try to analyze source "x" and "y" addresses.
 */

	if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned. No  vis_alignaddr
 * and  vis_faligndata at all.
 */

		dpx = (mlib_d64 *)px;
		dpy = (mlib_d64 *)py;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx = (*dpx++);
			dy = (*dpy++);
			ADD16_MOD;
			dpz++;
		}

/* preload the words that the tail code below consumes */
		dx1 = vis_ld_d64_nf(dpx); dpx++;
		dy1 = vis_ld_d64_nf(dpy); dpy++;

	} else
	if (!((mlib_addr)px & 7)) {

/*
 * First ("x") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "y".
 */

		dpx = (mlib_d64 *)px;
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy); dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx = (*dpx++);
			dy1 = vis_ld_d64_nf(dpy); dpy++;
			dy = vis_faligndata(dy0, dy1);
			ADD16_MOD;
			dy0 = dy1;
			dpz++;
		}

		dx1 = vis_ld_d64_nf(dpx); dpx++;
		dy1 = dy0;

	} else
	if (!((mlib_addr)py & 7)) {

/*
 * Second ("y") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "x".
 */

		dpy = (mlib_d64 *)py;
		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx); dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx); dpx++;
			dx = vis_faligndata(dx0, dx1);
			dy = (*dpy++);
			ADD16_MOD;
			dx0 = dx1;
			dpz++;
		}

		dx1 = dx0;
		dy1 = vis_ld_d64_nf(dpy); dpy++;

	} else
	if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") addresses are identically aligned.
 * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop.
 */

		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx); dpx++;
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy); dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx); dpx++;
			dx = vis_faligndata(dx0, dx1);
			dy1 = vis_ld_d64_nf(dpy); dpy++;
			dy = vis_faligndata(dy0, dy1);
			ADD16_MOD;
			dpz++;
			dx0 = dx1;
			dy0 = dy1;
		}

		dx1 = dx0;
		dy1 = dy0;
	} else {

/*
 * Both ("x" and "y") addresses are arbitrarily aligned.
 * "x" is realigned with vis_bshuffle (byte mask derived from its offset),
 * "y" with vis_faligndata, so the two offsets do not interfere.
 */

		off = (mlib_addr)px & 7;
		dpx = (mlib_d64 *)((mlib_u8 *)px - off);
		vis_write_bmask(off * 0x11111111, 0x01234567);
		dx0 = vis_ld_d64_nf(dpx); dpx++;
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy); dpy++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx); dpx++;
			dx = vis_bshuffle(dx0, dx1);
			dy1 = vis_ld_d64_nf(dpy); dpy++;
			dy = vis_faligndata(dy0, dy1);
			ADD16_MOD;
			dx0 = dx1;
			dy0 = dy1;
			dpz++;
		}

		dx1 = dx0;
		dy1 = dy0;
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

/*
 * prepare edge mask for the last bytes
 */

	emask = ~(vis_edge16((void *)(rest_8 << 1), pzend));

/*
 * Common tail for all five cases: every branch above leaves the last
 * loaded words in dx1 / dy1 and dpx / dpy pointing at the next ones.
 */
	off = (mlib_addr)px & 7;
	vis_write_bmask(off * 0x11111111, 0x01234567);
	dx0 = dx1;
	dx1 = vis_ld_d64_nf(dpx);
	dx = vis_bshuffle(dx0, dx1);
	vis_alignaddr(py, 0);
	dy0 = dy1;
	dy1 = vis_ld_d64_nf(dpy);
	dy = vis_faligndata(dy0, dy1);

	ADD16_MOD;

	return (MLIB_SUCCESS);
}
예제 #5
0
/*
 * Table look-up for one row of U16 samples, U16 -> U16, cycling through
 * four tables.
 *
 *   src     source samples; each value is used as an index
 *   dst     destination samples
 *   xsize   number of samples to convert
 *   table0  table for samples 0, 4, 8, ...
 *   table1  table for samples 1, 5, 9, ...
 *   table2  table for samples 2, 6, 10, ...
 *   table3  table for samples 3, 7, 11, ...
 *
 * Four samples are produced per 8-byte store.
 *
 * NOTE(review): dst is used directly as an mlib_d64 pointer, so it is
 * presumably required to be 8-byte aligned -- confirm against callers.
 */
void mlib_v_ImageLookUp_U16_U16_124_D1(const mlib_u16 *src,
                                       mlib_u16       *dst,
                                       mlib_s32       xsize,
                                       const mlib_u16 *table0,
                                       const mlib_u16 *table1,
                                       const mlib_u16 *table2,
                                       const mlib_u16 *table3)
{
  mlib_u16 *sp;            /* pointer to source data */
  mlib_s32 s0, s1, s2, s3; /* source data */
  mlib_u16 *dl;            /* pointer to start of destination */
  mlib_u16 *dend;          /* pointer to end of destination */
  mlib_d64 *dp;            /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;     /* destination data */
  mlib_d64 t3;             /* destination data */
  /*
   * Accumulator that the looked-up values are shifted into.  It is
   * initialized because the first vis_faligndata() reads it; the stale
   * lanes are always shifted out or masked off before being stored.
   */
  mlib_d64 acc0 = 0;
  mlib_s32 emask;          /* edge mask */
  mlib_s32 i, num;         /* loop variable */

  dl   = dst;
  sp   = (void *)src;
  dp   = (mlib_d64 *) dl;
  dend = dl + xsize - 1;

  /* align offset 6: each vis_faligndata(t, acc) shifts one U16 into acc */
  vis_alignaddr((void *) 0, 6);

  i = 0;

  if (xsize >= 4) {

    /* preload the first group of four indices */
    s0 = sp[0];
    s1 = sp[1];
    s2 = sp[2];
    s3 = sp[3];
    sp += 4;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, sp += 4) {
      t3 = VIS_LD_U16_I(table3, 2*s3);
      t2 = VIS_LD_U16_I(table2, 2*s2);
      t1 = VIS_LD_U16_I(table1, 2*s1);
      t0 = VIS_LD_U16_I(table0, 2*s0);
      /* shift in last-to-first so that t0 ends up in the leading lane */
      acc0 = vis_faligndata(t3, acc0);
      acc0 = vis_faligndata(t2, acc0);
      acc0 = vis_faligndata(t1, acc0);
      acc0 = vis_faligndata(t0, acc0);
      /* fetch the indices for the next group */
      s0 = sp[0];
      s1 = sp[1];
      s2 = sp[2];
      s3 = sp[3];
      *dp++ = acc0;
    }

    /* last full group: its indices were already loaded */
    t3 = VIS_LD_U16_I(table3, 2*s3);
    t2 = VIS_LD_U16_I(table2, 2*s2);
    t1 = VIS_LD_U16_I(table1, 2*s1);
    t0 = VIS_LD_U16_I(table0, 2*s0);
    acc0 = vis_faligndata(t3, acc0);
    acc0 = vis_faligndata(t2, acc0);
    acc0 = vis_faligndata(t1, acc0);
    acc0 = vis_faligndata(t0, acc0);
    *dp++ = acc0;
  }

  /* tail: 1 to 3 samples remain; build them back to front */
  if ((mlib_addr) dp <= (mlib_addr) dend) {

    num = (mlib_u16*) dend - (mlib_u16*) dp;
    sp  += num;            /* sp now points at the last source sample */
    num ++;                /* number of remaining samples */

    if (num == 1) {
      s0 = (mlib_s32) *sp;
      sp --;

      t0  = VIS_LD_U16_I(table0, 2*s0);
      acc0 = vis_faligndata(t0, acc0);
    } else if (num  == 2) {
      s0 = (mlib_s32) *sp;
      sp --;

      t0  = VIS_LD_U16_I(table1, 2*s0);
      acc0 = vis_faligndata(t0, acc0);

      s0 = (mlib_s32) *sp;
      sp --;

      t0  = VIS_LD_U16_I(table0, 2*s0);
      acc0 = vis_faligndata(t0, acc0);
    } else if (num == 3) {
      s0 = (mlib_s32) *sp;
      sp --;

      t0  = VIS_LD_U16_I(table2, 2*s0);
      acc0 = vis_faligndata(t0, acc0);

      s0 = (mlib_s32) *sp;
      sp --;

      t0  = VIS_LD_U16_I(table1, 2*s0);
      acc0 = vis_faligndata(t0, acc0);

      s0 = (mlib_s32) *sp;
      sp --;

      t0  = VIS_LD_U16_I(table0, 2*s0);
      acc0 = vis_faligndata(t0, acc0);
    }

    /* store only the lanes that lie inside the destination row */
    emask = vis_edge16(dp, dend);
    vis_pst_16(acc0, dp, emask);
  }
}
예제 #6
0
/*
 * Applies MLIB_V_IMAGESQUARE_S16 to every S16 sample of a 2-D image,
 * row by row.
 *
 *   src    first source row
 *   slb    source row stride in bytes
 *   dst    first destination row
 *   dlb    destination row stride in bytes
 *   xsize  number of samples per row
 *   ysize  number of rows
 *
 * Each row is written in aligned 8-byte words; the first and last word
 * of a row are stored under an edge mask.
 *
 * NOTE(review): MLIB_V_IMAGESQUARE_S16(sd, dd) is defined outside this
 * block; judging by the function name it presumably squares the four
 * S16 lanes of sd into dd, using rdh / rdl as scratch -- confirm.
 */
void
mlib_v_ImageSquare_S16(
    mlib_s16 *src,
    mlib_s32 slb,
    mlib_s16 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize)
{
/* aligned pointer to source */
	mlib_d64 *sp;

/* pointer to a line in source */
	mlib_s16 *sl;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* pointer to a line in destination */
	mlib_s16 *dl;

/* pointer to end of a line in dst */
	mlib_s16 *dend;

/* offset of address alignment in dst */
	mlib_s32 off;

/* edge masks */
	mlib_s32 emask;

/* two consecutive aligned source words */
	mlib_d64 s0, s1;

/* source data realigned to the destination */
	mlib_d64 sd;

/* destination data */
	mlib_d64 dd;

/* temporaries used in macro */
	mlib_d64 rdh, rdl;

/* loop variable */
	mlib_s32 i, j, n;

	sl = src;
	dl = dst;

/* row loop */
	for (j = 0; j < ysize; j++) {

/* prepare the destination address (off is zero or negative) */
		dp = (mlib_d64 *)((mlib_addr)dl & (~7));
		off = (mlib_addr)dp - (mlib_addr)dl;
		dend = dl + xsize - 1;

/* prepare the source address, shifted by the same offset as dst */
		sp = (mlib_d64 *)vis_alignaddr(sl, off);

/* generate edge mask for the start point */
		emask = vis_edge16(dl, dend);

/* first 4 pixels */
		s0 = vis_ld_d64_nf(sp); sp++;
		s1 = vis_ld_d64_nf(sp); sp++;
		sd = vis_faligndata(s0, s1);
		MLIB_V_IMAGESQUARE_S16(sd, dd);
		vis_pst_16(dd, dp++, emask);

/* number of whole 8-byte words left in this row */
		n = ((mlib_u8 *)(dend + 1) - (mlib_u8 *)dp) / 8;

/* 4-pixel column loop */
#pragma pipeloop(0)
		for (i = 0; i < n; i++) {
			s0 = s1;
			s1 = vis_ld_d64_nf(sp); sp++;
			sd = vis_faligndata(s0, s1);
			MLIB_V_IMAGESQUARE_S16(sd, dd);
			(*dp++) = dd;
		}

/* end point handling */

		if ((mlib_addr)dp <= (mlib_addr)dend) {
			emask = vis_edge16(dp, dend);
			s0 = s1;
			s1 = vis_ld_d64_nf(sp); sp++;
			sd = vis_faligndata(s0, s1);
			MLIB_V_IMAGESQUARE_S16(sd, dd);
			vis_pst_16(dd, dp++, emask);
		}

/* advance to the next row; strides are in bytes */
		sl = (mlib_s16 *)((mlib_u8 *)sl + slb);
		dl = (mlib_s16 *)((mlib_u8 *)dl + dlb);
	}
}
예제 #7
0
/*
 * Applies MLIB_V_IMAGESQRSHIFT_S16 to a contiguous run of S16 samples.
 *
 *   src    source samples
 *   dst    destination samples
 *   dsize  number of samples
 *   shift  not referenced directly in this body; presumably consumed by
 *          MLIB_V_IMAGESQRSHIFT_S16 -- confirm against the macro
 *
 * The destination is written in aligned 8-byte words; the first and the
 * last word are stored under an edge mask.
 *
 * NOTE(review): MLIB_V_IMAGESQRSHIFT_S16(sd, dd) is defined outside this
 * block; judging by the function name it presumably squares each S16
 * lane of sd and applies a shift, using the rd* temporaries as scratch
 * -- confirm.
 */
void
mlib_v_ImageSqrShift_S16_D1(
    mlib_s16 *src,
    mlib_s16 *dst,
    mlib_s32 dsize,
    mlib_s32 shift)
{
/* aligned pointer to source */
	mlib_d64 *sp;

/* pointer to source */
	mlib_s16 *sa;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* pointer to destination */
	mlib_s16 *da;

/* pointer to end of dst */
	mlib_s16 *dend;

/* offset of address alignment in dst */
	mlib_s32 off;

/* edge masks */
	mlib_s32 emask;

/* two consecutive aligned source words */
	mlib_d64 s0, s1;

/* source data realigned to the destination */
	mlib_d64 sd;

/* destination data */
	mlib_d64 dd;

/* temporaries used in macro */
	mlib_d64 rdhh, rdhl;

/* temporaries used in macro */
	mlib_d64 rdlh, rdll;

/* temporaries used in macro */
	mlib_d64 rdh, rdl;

/* loop variable */
	mlib_s32 i, n;

	sa = src;
	da = dst;

/* prepare the destination address (off is zero or negative) */
	dp = (mlib_d64 *)((mlib_addr)da & (~7));
	off = (mlib_addr)dp - (mlib_addr)da;
	dend = da + dsize - 1;

/* prepare the source address, shifted by the same offset as dst */
	sp = (mlib_d64 *)vis_alignaddr(sa, off);

/* generate edge mask for the start point */
	emask = vis_edge16(da, dend);

/* first 4 pixels */
	s0 = vis_ld_d64_nf(sp); sp++;
	s1 = vis_ld_d64_nf(sp); sp++;
	sd = vis_faligndata(s0, s1);
	MLIB_V_IMAGESQRSHIFT_S16(sd, dd);
	vis_pst_16(dd, dp++, emask);

/* number of whole 8-byte words left */
	n = ((mlib_u8 *)(dend + 1) - (mlib_u8 *)dp) / 8;

/* 4-pixel column loop */
#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		s0 = s1;
		s1 = vis_ld_d64_nf(sp); sp++;
		sd = vis_faligndata(s0, s1);
		MLIB_V_IMAGESQRSHIFT_S16(sd, dd);
		(*dp++) = dd;
	}

/* end point handling */

	if ((mlib_addr)dp <= (mlib_addr)dend) {
		emask = vis_edge16(dp, dend);
		s0 = s1;
		s1 = vis_ld_d64_nf(sp); sp++;
		sd = vis_faligndata(s0, s1);
		MLIB_V_IMAGESQRSHIFT_S16(sd, dd);
		vis_pst_16(dd, dp++, emask);
	}
}
/*
 * Interpolating 4:2:0 upsampling of S16 data: from three adjacent source
 * rows, produce two destination rows of twice the width.
 *
 *   dst0  upper destination row, 2 * n samples
 *   dst1  lower destination row, 2 * n samples
 *   src0  source row above the current one, n samples
 *   src1  current source row, n samples
 *   src2  source row below the current one, n samples
 *   n     number of source samples per row
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * As read from the constants and the scalar edge fix-ups at the end
 * (confirm against the VIS manual), the interior samples are
 *
 *   dst0[2i]   = (9*src1[i] + 3*src1[i-1] + 3*src0[i] + src0[i-1] + 8) >> 4
 *   dst0[2i+1] = (9*src1[i] + 3*src1[i+1] + 3*src0[i] + src0[i+1] + 7) >> 4
 *
 * and dst1 is the same with src2 in place of src0.  The first and last
 * sample of each row have no horizontal neighbour and are recomputed in
 * scalar code after the vector passes.
 *
 * The vector work is split into two passes over the row: the first
 * writes the even 8-byte words of dst0 / dst1, the second the odd ones.
 *
 * NOTE(review): all pointers are accessed through mlib_d64 casts, so
 * they are presumably required to be 8-byte aligned -- confirm.
 */
mlib_status
__mlib_VideoUpSample420_S16(
	mlib_s16 *dst0,
	mlib_s16 *dst1,
	const mlib_s16 *src0,
	const mlib_s16 *src1,
	const mlib_s16 *src2,
	mlib_s32 n)
{
	mlib_s16 *dend = dst0 + 2 * n - 1;
	mlib_d64 *dp0 = (mlib_d64 *)dst0;
	mlib_d64 *dp1 = (mlib_d64 *)dst1;
	mlib_d64 *sp0 = (mlib_d64 *)src0;
	mlib_d64 *sp1 = (mlib_d64 *)src1;
	mlib_d64 *sp2 = (mlib_d64 *)src2;
	/*
	 * dX0: previous-neighbour window, dX1: current word,
	 * dX2: next-neighbour window, dX3: following word (X = source row).
	 * dX0 is read by the priming vis_faligndata() and dX3 by the tail
	 * when n < 4, before either is loaded, so both start out defined.
	 * Those stale lanes only reach samples that are masked off or
	 * overwritten by the scalar edge fix-ups.
	 */
	mlib_d64 d00 = 0, d01, d02, d03 = 0;
	mlib_d64 d10 = 0, d11, d12, d13 = 0;
	mlib_d64 d20 = 0, d21, d22, d23 = 0;
	mlib_d64 ac00, ac01, ac02, ac03, ac04, ac05, ac06, ac07;
	mlib_d64 ac10, ac11, ac12, ac13, ac14, ac15, ac16, ac17;
	mlib_d64 ac20, ac21, ac22, ac23, ac24, ac25, ac26, ac27;
	/* pairs of 16-bit filter weights: (1,3), (3,1), (3,9), (9,3) */
	mlib_f32 f13 = vis_to_float(0x10003);
	mlib_f32 f31 = vis_to_float(0x30001);
	mlib_f32 f39 = vis_to_float(0x30009);
	mlib_f32 f93 = vis_to_float(0x90003);
	/* rounding terms: 8 for even outputs, 7 for odd outputs */
	mlib_d64 d87 = vis_to_double(8, 7);
	mlib_s32 i, emask;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* GSR: scale factor 12 for vis_fpackfix, align offset 2 (one S16) */
	vis_write_gsr((12 << 3) + 2);

	/* prime the windows with the first word of every source row */
	d01 = vis_ld_d64_nf(sp0);
	d11 = vis_ld_d64_nf(sp1);
	d21 = vis_ld_d64_nf(sp2);
	sp0++;
	sp1++;
	sp2++;
	d00 = vis_faligndata(d00, d01);
	d10 = vis_faligndata(d10, d11);
	d20 = vis_faligndata(d20, d21);

	/* pass 1: even destination words (outputs of source samples 0, 1) */
#pragma pipeloop(0)
	for (i = 0; i <= n - 4; i += 4) {
		d03 = vis_ld_d64_nf(sp0);
		d13 = vis_ld_d64_nf(sp1);
		d23 = vis_ld_d64_nf(sp2);
		sp0++;
		sp1++;
		sp2++;
		d02 = vis_faligndata(d01, d03);
		d12 = vis_faligndata(d11, d13);
		d22 = vis_faligndata(d21, d23);

		/* centre row contribution, shared by both destination rows */
		ac10 = vis_fmuld8ulx16(f39, vis_read_lo(d10));
		ac12 = vis_fmuld8ulx16(f39, vis_read_hi(d11));
		ac10 = vis_fpadd32(ac10, d87);
		ac12 = vis_fpadd32(ac12, d87);
		ac11 = vis_fmuld8ulx16(f93, vis_read_hi(d11));
		ac13 = vis_fmuld8ulx16(f93, vis_read_hi(d12));
		ac10 = vis_fpadd32(ac10, ac11);
		ac12 = vis_fpadd32(ac12, ac13);

		/* upper neighbour row -> dst0 */
		ac00 = vis_fmuld8ulx16(f13, vis_read_lo(d00));
		ac01 = vis_fmuld8ulx16(f31, vis_read_hi(d01));
		ac02 = vis_fmuld8ulx16(f13, vis_read_hi(d01));
		ac03 = vis_fmuld8ulx16(f31, vis_read_hi(d02));
		ac00 = vis_fpadd32(ac00, ac01);
		ac02 = vis_fpadd32(ac02, ac03);
		ac00 = vis_fpadd32(ac10, ac00);
		ac02 = vis_fpadd32(ac12, ac02);

		/* lower neighbour row -> dst1 */
		ac20 = vis_fmuld8ulx16(f13, vis_read_lo(d20));
		ac21 = vis_fmuld8ulx16(f31, vis_read_hi(d21));
		ac22 = vis_fmuld8ulx16(f13, vis_read_hi(d21));
		ac23 = vis_fmuld8ulx16(f31, vis_read_hi(d22));
		ac20 = vis_fpadd32(ac20, ac21);
		ac22 = vis_fpadd32(ac22, ac23);
		ac20 = vis_fpadd32(ac10, ac20);
		ac22 = vis_fpadd32(ac12, ac22);

		dp0[0] = vis_fpackfix_pair(ac00, ac02);
		dp1[0] = vis_fpackfix_pair(ac20, ac22);

		dp0 += 2;
		dp1 += 2;
		/* slide the windows forward by one source word */
		d00 = d02;
		d01 = d03;
		d10 = d12;
		d11 = d13;
		d20 = d22;
		d21 = d23;
	}

	/* rewind everything for the second pass */
	dp0 = (mlib_d64 *)dst0;
	dp1 = (mlib_d64 *)dst1;
	sp0 = (mlib_d64 *)src0;
	sp1 = (mlib_d64 *)src1;
	sp2 = (mlib_d64 *)src2;
	d01 = vis_ld_d64_nf(sp0);
	d11 = vis_ld_d64_nf(sp1);
	d21 = vis_ld_d64_nf(sp2);
	sp0++;
	sp1++;
	sp2++;
	d00 = vis_faligndata(d00, d01);
	d10 = vis_faligndata(d10, d11);
	d20 = vis_faligndata(d20, d21);

	/* pass 2: odd destination words (outputs of source samples 2, 3) */
#pragma pipeloop(0)
	for (i = 0; i <= n - 4; i += 4) {
		d03 = vis_ld_d64_nf(sp0);
		d13 = vis_ld_d64_nf(sp1);
		d23 = vis_ld_d64_nf(sp2);
		sp0++;
		sp1++;
		sp2++;
		d02 = vis_faligndata(d01, d03);
		d12 = vis_faligndata(d11, d13);
		d22 = vis_faligndata(d21, d23);

		ac14 = vis_fmuld8ulx16(f39, vis_read_hi(d12));
		ac16 = vis_fmuld8ulx16(f39, vis_read_lo(d11));
		ac14 = vis_fpadd32(ac14, d87);
		ac16 = vis_fpadd32(ac16, d87);
		ac15 = vis_fmuld8ulx16(f93, vis_read_lo(d11));
		ac17 = vis_fmuld8ulx16(f93, vis_read_lo(d12));
		ac14 = vis_fpadd32(ac14, ac15);
		ac16 = vis_fpadd32(ac16, ac17);

		ac04 = vis_fmuld8ulx16(f13, vis_read_hi(d02));
		ac05 = vis_fmuld8ulx16(f31, vis_read_lo(d01));
		ac06 = vis_fmuld8ulx16(f13, vis_read_lo(d01));
		ac07 = vis_fmuld8ulx16(f31, vis_read_lo(d02));
		ac04 = vis_fpadd32(ac04, ac05);
		ac06 = vis_fpadd32(ac06, ac07);
		ac04 = vis_fpadd32(ac14, ac04);
		ac06 = vis_fpadd32(ac16, ac06);

		ac24 = vis_fmuld8ulx16(f13, vis_read_hi(d22));
		ac25 = vis_fmuld8ulx16(f31, vis_read_lo(d21));
		ac26 = vis_fmuld8ulx16(f13, vis_read_lo(d21));
		ac27 = vis_fmuld8ulx16(f31, vis_read_lo(d22));
		ac24 = vis_fpadd32(ac24, ac25);
		ac26 = vis_fpadd32(ac26, ac27);
		ac24 = vis_fpadd32(ac14, ac24);
		ac26 = vis_fpadd32(ac16, ac26);

		dp0[1] = vis_fpackfix_pair(ac04, ac06);
		dp1[1] = vis_fpackfix_pair(ac24, ac26);

		dp0 += 2;
		dp1 += 2;
		d00 = d02;
		d01 = d03;
		d10 = d12;
		d11 = d13;
		d20 = d22;
		d21 = d23;
	}

	/* tail: n is not a multiple of 4; store the rest under edge masks */
	if ((mlib_s16 *)dp0 <= dend) {
		d02 = vis_faligndata(d01, d03);
		d12 = vis_faligndata(d11, d13);
		d22 = vis_faligndata(d21, d23);

		ac10 = vis_fmuld8ulx16(f39, vis_read_lo(d10));
		ac12 = vis_fmuld8ulx16(f39, vis_read_hi(d11));
		ac10 = vis_fpadd32(ac10, d87);
		ac12 = vis_fpadd32(ac12, d87);
		ac11 = vis_fmuld8ulx16(f93, vis_read_hi(d11));
		ac13 = vis_fmuld8ulx16(f93, vis_read_hi(d12));
		ac10 = vis_fpadd32(ac10, ac11);
		ac12 = vis_fpadd32(ac12, ac13);

		ac00 = vis_fmuld8ulx16(f13, vis_read_lo(d00));
		ac01 = vis_fmuld8ulx16(f31, vis_read_hi(d01));
		ac02 = vis_fmuld8ulx16(f13, vis_read_hi(d01));
		ac03 = vis_fmuld8ulx16(f31, vis_read_hi(d02));
		ac00 = vis_fpadd32(ac00, ac01);
		ac02 = vis_fpadd32(ac02, ac03);
		ac00 = vis_fpadd32(ac10, ac00);
		ac02 = vis_fpadd32(ac12, ac02);

		ac20 = vis_fmuld8ulx16(f13, vis_read_lo(d20));
		ac21 = vis_fmuld8ulx16(f31, vis_read_hi(d21));
		ac22 = vis_fmuld8ulx16(f13, vis_read_hi(d21));
		ac23 = vis_fmuld8ulx16(f31, vis_read_hi(d22));
		ac20 = vis_fpadd32(ac20, ac21);
		ac22 = vis_fpadd32(ac22, ac23);
		ac20 = vis_fpadd32(ac10, ac20);
		ac22 = vis_fpadd32(ac12, ac22);

		ac00 = vis_fpackfix_pair(ac00, ac02);
		ac20 = vis_fpackfix_pair(ac20, ac22);
		/* the mask is derived from dst0 only and reused for dst1 */
		emask = vis_edge16(dp0, dend);
		vis_pst_16(ac00, dp0, emask);
		vis_pst_16(ac20, dp1, emask);
		dp0++;
		dp1++;

		if ((mlib_s16 *)dp0 <= dend) {
			ac14 = vis_fmuld8ulx16(f39, vis_read_hi(d12));
			ac16 = vis_fmuld8ulx16(f39, vis_read_lo(d11));
			ac14 = vis_fpadd32(ac14, d87);
			ac16 = vis_fpadd32(ac16, d87);
			ac15 = vis_fmuld8ulx16(f93, vis_read_lo(d11));
			ac17 = vis_fmuld8ulx16(f93, vis_read_lo(d12));
			ac14 = vis_fpadd32(ac14, ac15);
			ac16 = vis_fpadd32(ac16, ac17);

			ac04 = vis_fmuld8ulx16(f13, vis_read_hi(d02));
			ac05 = vis_fmuld8ulx16(f31, vis_read_lo(d01));
			ac06 = vis_fmuld8ulx16(f13, vis_read_lo(d01));
			ac07 = vis_fmuld8ulx16(f31, vis_read_lo(d02));
			ac04 = vis_fpadd32(ac04, ac05);
			ac06 = vis_fpadd32(ac06, ac07);
			ac04 = vis_fpadd32(ac14, ac04);
			ac06 = vis_fpadd32(ac16, ac06);

			ac24 = vis_fmuld8ulx16(f13, vis_read_hi(d22));
			ac25 = vis_fmuld8ulx16(f31, vis_read_lo(d21));
			ac26 = vis_fmuld8ulx16(f13, vis_read_lo(d21));
			ac27 = vis_fmuld8ulx16(f31, vis_read_lo(d22));
			ac24 = vis_fpadd32(ac24, ac25);
			ac26 = vis_fpadd32(ac26, ac27);
			ac24 = vis_fpadd32(ac14, ac24);
			ac26 = vis_fpadd32(ac16, ac26);

			ac04 = vis_fpackfix_pair(ac04, ac06);
			ac24 = vis_fpackfix_pair(ac24, ac26);
			emask = vis_edge16(dp0, dend);
			vis_pst_16(ac04, dp0, emask);
			vis_pst_16(ac24, dp1, emask);
		}
	}

	/*
	 * Row edges: the vector code used a missing horizontal neighbour
	 * for the first and the last output sample, so recompute them with
	 * the vertical filter only.
	 */
	dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4;
	dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4;
	dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4;
	dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4;

	return (MLIB_SUCCESS);
}
예제 #9
0
/*
 * Operation between a complex S16 vector and a complex S16 scalar:
 * z = x (op) c for n complex elements, stored as interleaved S16 pairs.
 *
 *   z  destination vector, 2 * n S16 values
 *   x  source vector, 2 * n S16 values
 *   c  scalar: c[0] and c[1] are its two components
 *   n  number of complex elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The destination is processed in aligned 8-byte words; the first and
 * the last word are written under an edge mask.
 *
 * NOTE(review): SUBS16_MOD is defined outside this block.  Judging by
 * the function name it presumably combines dx and dc lane-wise by
 * subtraction into dz and partial-stores dz at dpz under emask; the
 * operand order is not visible here -- confirm against the macro.
 */
mlib_status
__mlib_VectorSubS_S16C_S16C_Mod(
	mlib_s16 *z,
	const mlib_s16 *x,
	const mlib_s16 *c,
	mlib_s32 n)
{
	mlib_d64 *dpz, *dpx;
	mlib_d64 dx, dz, dx0, dx1;
	mlib_s16 *pz, *px, *pzend;

/* offset of address alignment in destination */
	mlib_s32 off;

/* edge masks */
	mlib_s32 emask;
	mlib_s32 len = n + n, i;

/* remainder and length in terms of 8-byte words */
	mlib_s32 rest_8, even_8;
	mlib_u16 uc0 = *((mlib_s16 *)c);
	mlib_u16 uc1 = *((mlib_s16 *)c + 1);

/*
 * Scalar replicated across an 8-byte word.  When z sits at an odd S16
 * position within its 4-byte pair, the aligned word starts with the
 * second component, so the two halves are swapped.  The shift is done
 * in unsigned arithmetic: a mlib_u16 promotes to int, and shifting a
 * value >= 0x8000 left by 16 would overflow a signed int.
 */
	mlib_d64 dc = ((mlib_addr)z & 2)
		? vis_to_double_dup(((unsigned int)uc1 << 16) | uc0)
		: vis_to_double_dup(((unsigned int)uc0 << 16) | uc1);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s16 *)x;
	pz = (mlib_s16 *)z;

/* round z down to an 8-byte boundary; off is zero or negative */
	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
	off = (mlib_addr)dpz - (mlib_addr)z;
	pzend = pz + n + n - 1;
/*
 * generate edge mask for the start point
 */
	emask = vis_edge16(pz, pzend);

/*
 * prepare the destination address
 */

/*
 * Unaligned destination: handle the first, partial word separately.
 * (8 + off) >> 1 is the number of S16 values it covers.
 */
	if (off) {
		dpx = (mlib_d64 *)vis_alignaddr(px, off);
		dx0 = vis_ld_d64_nf(dpx);
		dx1 = vis_ld_d64_nf(dpx + 1);
		dx = vis_faligndata(dx0, dx1);
		SUBS16_MOD;
		px += (8 + off) >> 1;
		len -= (8 + off) >> 1;
		dpz++;
	}

	if (len <= 0)
		return (MLIB_SUCCESS);

/* full words, leftover values, and an all-lanes mask for the main loops */
	even_8 = len >> 2;
	rest_8 = len & 0x3;
	emask = 0xf;

/*
 * Now analyze the alignment of the source "x" address.
 */

	if (!((mlib_addr)px & 7)) {

/* source is 8-byte aligned: plain loads */
		dpx = (mlib_d64 *)px;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx = (*dpx++);
			SUBS16_MOD;
			dpz++;
		}

/* preload the word that the tail code below consumes */
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;

	} else {

/* source is unaligned: realign with vis_faligndata */
		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx);
		dpx++;

#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUBS16_MOD;
			dx0 = dx1;
			dpz++;
		}

		dx1 = dx0;
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

/*
 * prepare edge mask for the last bytes
 */

	emask = ~(vis_edge16((void *)(rest_8 << 1), pzend));

/* common tail: both branches leave the last loaded word in dx1 */
	vis_alignaddr(px, 0);
	dx0 = dx1;
	dx1 = vis_ld_d64_nf(dpx);
	dx = vis_faligndata(dx0, dx1);

	SUBS16_MOD;

	return (MLIB_SUCCESS);
}
예제 #10
0
mlib_status
mlib_v_ImageAdd_U16(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
	mlib_s32 i, j, k;
	mlib_s32 offdst, offsrc1, offsrc2, emask, mask;
	mlib_s32 amount;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr, tmp;
	mlib_d64 sd10, sd11, sd20, sd21;
	mlib_d64 ones = vis_to_double_dup(0x7fff7fff);
	mlib_d64 max_u16 = vis_to_double_dup(0xffffffff);
	mlib_u16 *dend;

	VALIDATE(mlib_u16);

/* initialize GSR scale factor */
	vis_write_gsr(15 << 3);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

	amount = width * channels;

	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 3) == 0) &&
	    (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp,
				    emask);
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U16(sd10, sd20, dpp)
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp,
				    emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {
예제 #11
0
mlib_status
mlib_v_ImageAdd_S16(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
	mlib_s32 i, j, k;
	mlib_s32 offdst, offsrc1, offsrc2, emask;
	mlib_s32 amount;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
	mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
	mlib_s16 *dend;

	VALIDATE(mlib_s16);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

	amount = width * channels;

	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 3) == 0) &&
	    (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd20 = vis_ld_d64_nf(spp2);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd20 = sd21;
				spp2++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd10 = vis_ld_d64_nf(spp1);

			if (emask != 0xf) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd10 = sd11;
				spp1++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd20 = (*spp2++);
				sd11 = vis_ld_d64_nf(spp1 + 1);
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd10 = sd11;
				spp1++;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd20 = (*spp2++);
				sd10 = vis_faligndata(sd10,
					vis_ld_d64_nf(spp1 + 1));
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the source addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the destination addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd10 = vis_ld_d64_nf(spp1); spp1++;
			sd20 = vis_ld_d64_nf(spp2); spp2++;
			MLIB_V_ADDIMAGE_S16(sd10, sd20, dd0);

			if (emask != 0xf) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp++, emask);
				dd0 = dd1;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
				(*dpp++) = vis_faligndata(dd0, dd1);
				dd0 = dd1;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd10 = vis_ld_d64_nf(spp1); spp1++;
				sd20 = vis_ld_d64_nf(spp2); spp2++;
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	} else {
/* common case */

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			if (emask != 0xf) {
				spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
				sd10 = vis_faligndata(vis_ld_d64_nf(spp1),
					vis_ld_d64_nf(spp1 + 1));
				spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
				sd20 = vis_faligndata(vis_ld_d64_nf(spp2),
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}

/* copy src1 to dst */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			sd11 = vis_ld_d64_nf(spp1);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (k = i; k <= (amount - 4); k += 4) {
				sd10 = sd11;
				sd11 = vis_ld_d64_nf(spp1 + 1);
				(*tmp_ptr++) = vis_faligndata(sd10, sd11);
				spp1++;
			}

			sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1));

			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
			sd20 = vis_ld_d64_nf(spp2);
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
				sd10 = (*tmp_ptr++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < amount) {
				emask = vis_edge16(dpp, dend);
				sd20 = vis_faligndata(sd20,
					vis_ld_d64_nf(spp2 + 1));
				MLIB_V_ADDIMAGE_S16(sd11, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	}

	return (MLIB_SUCCESS);
}
예제 #12
0
		}
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_u16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + amount - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd20 = vis_ld_d64_nf(spp2);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd21 = vis_ld_d64_nf(spp2 + 1);
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp,
				    emask);
				sd20 = sd21;
				spp2++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= amount - 4; i += 4) {
예제 #13
0
/*
 * Apply the emphasize filter described by `filter` to an interleaved
 * two-channel S16 signal.
 *
 *   dst    - output buffer, 2 * n samples are written
 *   src    - input buffer, 2 * n samples are read
 *   filter - pointer to a mlib_emphasize_struct; its v16_last0 / v16_last1
 *            fields carry the last sample of each channel between calls
 *            and are updated on success
 *   n      - number of sample pairs
 *
 * Returns MLIB_FAILURE when filter, src or dst is NULL, when n <= 0 or when
 * the filter structure is not tagged MLIB_EMPH; MLIB_SUCCESS otherwise.
 *
 * The arithmetic itself lives in the MLIB_MUL8 and MLIB_MIX macros, which
 * are defined elsewhere in the file; they presumably read w_src, w_lsrc,
 * v_alpha and v_mask and leave their results in dr0..dr7 (confirm against
 * the macro definitions).
 *
 * The function overwrites the VIS GSR (scale factor and align offset).
 */
mlib_status
__mlib_SignalEmphasize_S16S_S16S_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
	mlib_emphasize_struct *fist = filter;
	mlib_d64 w_maskand0 = vis_to_double(0xFFFFFFFF, 0xFFFF);
	mlib_d64 w_maskor0;
	mlib_d64 w_maskand1 = vis_to_double(0xFFFFFFFF, 0xFFFF0000);
	mlib_d64 w_maskor1;
	mlib_f32 v_mask = vis_to_float(0x80008000);
	mlib_f32 v_alpha;
	mlib_s16 *fdst;
	mlib_d64 *dpd, *dps, *dsrct1;
	mlib_d64 w_dst, w_src, w_src0, w_src1, w_src2, w_lsrc;
	mlib_d64 dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7;
	mlib_s32 i, times, t1, t2;

/* check for obvious errors */

	if ((fist == NULL) || (n <= 0) || (src == 0) || (dst == 0) ||
	    (fist->type != MLIB_EMPH)) {
		return (MLIB_FAILURE);
	}

	/*
	 * The filter state and the end-of-output pointer are only touched
	 * once the arguments are known to be valid, so that a NULL filter
	 * or NULL dst yields MLIB_FAILURE instead of a fault.
	 */
	w_maskor0 = vis_freg_pair(0.f, fist->v16_last0);
	w_maskor1 = vis_freg_pair(0.f, fist->v16_last1);
	v_alpha = fist->v_alpha;
	/* last output sample (2 * n samples in total) */
	fdst = dst + n + n - 1;

	vis_write_gsr(1 << 3);
	w_maskor0 = vis_fand(w_maskor0, w_maskand1);
	w_maskor1 = vis_fand(w_maskor1, w_maskand0);

	/*
	 * Rotate the and/or masks by the source misalignment so they line up
	 * with the position of the two "previous" samples inside the first
	 * loaded words.
	 */
	vis_alignaddr((void *)(-(mlib_addr)src), 0);
	w_maskand0 = vis_faligndata(w_maskand0, w_maskand0);
	w_maskor0 = vis_faligndata(w_maskor0, w_maskor0);
	w_maskand1 = vis_faligndata(w_maskand1, w_maskand1);
	w_maskor1 = vis_faligndata(w_maskor1, w_maskor1);

	dpd = vis_alignaddr(dst, 0);
	/* number of 8-byte destination words between first and last sample */
	times = (mlib_d64 *)vis_alignaddr(fdst, 0) - dpd;
	/* t1 aligns src to dst; t2 additionally steps back one sample pair */
	t1 = -((mlib_addr)(dst) & 7);
	t2 = t1 - 4;
	dps = vis_alignaddr((void *)src, t2);
	w_src0 = vis_ld_d64_nf(dps);
	dps++;
	w_src1 = vis_ld_d64_nf(dps);
	dps++;

	if ((((mlib_addr)dst ^ (mlib_addr)src) & 7)) {
		/* src and dst have different 8-byte alignment */

		/* splice the saved last samples in front of the input */
		if (((mlib_addr)dps - (mlib_addr)src) >= 6) {
			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand0, w_src1);
			w_src1 = vis_for(w_maskor0, w_src1);
		}

		if (((mlib_addr)dps - (mlib_addr)src) >= 8) {
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
		}

		w_lsrc = vis_faligndata(w_src0, w_src1);
		dsrct1 = vis_alignaddr((void *)src, t1);

		if (dps - 2 != dsrct1) {
			/* current and lagged streams start in different words */
			w_src2 = *dps;
			dps++;
			w_src = vis_faligndata(w_src1, w_src2);

			MLIB_MUL8;

			if ((mlib_addr)dst & 7) {
				/* unaligned dst: masked store of first word */
				times--;
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = *dps;
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dps++;

				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);

			MLIB_MIX;

			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);
			dps++;

			/* pipelined: store previous result, compute next */
			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dpd++;
				dps++;
			}
		} else {
			/* current and lagged streams share the same words */
			w_src = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;

			if ((mlib_addr)dst & 7) {
				/* unaligned dst: masked store of first word */
				times--;
				w_src0 = w_src1;
				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;

				MLIB_MIX;

				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			w_src0 = w_src1;

			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			MLIB_MIX;
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			dps++;

			/* pipelined: store previous result, compute next */
			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;
				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;

				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;
				dpd++;
			}
		}
	} else {
		/* src and dst share the same 8-byte alignment */
		w_src = w_src1;

		if ((mlib_addr)src & 7) {
			/* unaligned start: first word needs a masked store */
			times--;

			if (((mlib_addr)src & 7) == 2) {
				w_src0 = vis_fand(w_maskand0, w_src0);
				w_src0 = vis_for(w_maskor0, w_src0);
			} else {
				w_src1 = vis_fand(w_maskand0, w_src1);
				w_src1 = vis_for(w_maskor0, w_src1);
			}

			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
			w_lsrc = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;

			w_src0 = w_src1;
			w_src1 = *dps;
			w_src = w_src1;
			w_lsrc = vis_faligndata(w_src0, w_src1);
			dps++;

			MLIB_MIX;

			w_dst = vis_fpackfix_pair(dr2, dr3);
			vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
			dpd++;
		} else {
			/* fully aligned: both saved samples sit in w_src0 */
			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
			w_lsrc = vis_faligndata(w_src0, w_src1);

			MLIB_MUL8;
		}

		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);

		MLIB_MIX;

		w_src1 = w_src;
		w_dst = vis_fpackfix_pair(dr2, dr3);
		dps++;
		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);
		dps++;

		/* pipelined: store previous result, compute next */
		for (i = 0; i < times; i++) {
			*dpd = w_dst;

			MLIB_MIX;

			w_src1 = w_src;
			w_src = vis_ld_d64_nf(dps);
			w_lsrc = vis_faligndata(w_src1, w_src);
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			dpd++;

		}
	}

	/* final (possibly partial) destination word */
	if (times >= 0) {
		vis_pst_16(w_dst, dpd, vis_edge16(dpd, fdst));
	}
	/* remember the last sample of each channel for the next call */
	((mlib_s16 *)&fist->v16_last0)[0] = src[2 * n - 2];
	((mlib_s16 *)&fist->v16_last1)[1] = src[2 * n - 1];

	return (MLIB_SUCCESS);
}
/*
 * Table look-up of one run of S16 samples through three tables used in
 * rotation (table0, table1, table2, table0, ...), producing S16 output.
 *
 *   src    - xsize source samples
 *   dst    - xsize destination samples; it is cast straight to mlib_d64 *
 *            and written with full 8-byte stores, so it is presumably
 *            required to be 8-byte aligned (confirm against callers)
 *   xsize  - number of samples
 *   table0..table2 - look-up tables applied to samples 0, 1, 2 (mod 3)
 *
 * Four results are produced per iteration.  The function overwrites the
 * GSR align offset and the bmask register.
 */
void
mlib_v_ImageLookUp_S16_S16_3_D1(
    const mlib_s16 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 *table0,
    const mlib_s16 *table1,
    const mlib_s16 *table2)
{
/* pointer to source data */
	mlib_s16 *sp;

/* source data */
	mlib_s32 s0, s1, s2, s3;

/* pointer to start of destination */
	mlib_s16 *dl;

/* pointer to end of destination */
	mlib_s16 *dend;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2, t3;

/* destination data */
	mlib_d64 acc0, acc1;

/* edge mask */
	mlib_s32 emask;

/* loop variable */
	mlib_s32 i, num;
	const mlib_s16 *table;

	dl = dst;
	sp = (void *)src;
	dp = (mlib_d64 *)dl;
	dend = dl + xsize - 1;

/*
 * align offset 6: each vis_faligndata(t, acc) pushes the last two bytes
 * of t in front of acc and drops the last two bytes of acc
 */
	vis_alignaddr((void *)0, 6);

	i = 0;

	if (xsize >= 4) {

/* sample value * 2 = byte offset into a table of 16-bit entries */
		s0 = sp[0] << 1;
		s1 = sp[1] << 1;
		s2 = sp[2] << 1;
		s3 = sp[3] << 1;
		sp += 4;

/* bshuffle takes the first 4 bytes of acc0 then the first 4 of acc1 */
		vis_write_bmask(0x012389ab, 0);

/*
 * software-pipelined: indices for the next group are loaded while the
 * current group is looked up, so the loop stops one group early
 */
#pragma pipeloop(0)
		for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
			t3 = VIS_LD_U16_I(table0, s3);
			t2 = VIS_LD_U16_I(table2, s2);
			t1 = VIS_LD_U16_I(table1, s1);
			t0 = VIS_LD_U16_I(table0, s0);
/* NOTE(review): acc0/acc1 are read before first being written; */
/* the stale bytes are never selected by the bshuffle below */
			acc1 = vis_faligndata(t3, acc1);
			acc1 = vis_faligndata(t2, acc1);
			acc0 = vis_faligndata(t1, acc0);
			acc0 = vis_faligndata(t0, acc0);
			s0 = sp[0] << 1;
			s1 = sp[1] << 1;
			s2 = sp[2] << 1;
			s3 = sp[3] << 1;
			(*dp++) = vis_bshuffle(acc0, acc1);
/* 4 samples consumed = one step of the 3-table cycle: rotate tables */
			table = table0;
			table0 = table1;
			table1 = table2;
			table2 = table;
		}

/* drain the pipeline: look up the last pre-loaded group */
		t3 = VIS_LD_U16_I(table0, s3);
		t2 = VIS_LD_U16_I(table2, s2);
		t1 = VIS_LD_U16_I(table1, s1);
		t0 = VIS_LD_U16_I(table0, s0);
		acc1 = vis_faligndata(t3, acc1);
		acc1 = vis_faligndata(t2, acc1);
		acc0 = vis_faligndata(t1, acc0);
		acc0 = vis_faligndata(t0, acc0);
		(*dp++) = vis_bshuffle(acc0, acc1);
		table = table0;
		table0 = table1;
		table1 = table2;
		table2 = table;
		i += 4;
	}

/* 1..3 samples left: build them back to front and store under a mask */
	if ((mlib_addr)dp <= (mlib_addr)dend) {

		num = (mlib_s16 *)dend - (mlib_s16 *)dp;
/* point at the last remaining source sample */
		sp += num;
		num++;

		if (num == 1) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U16_I(table0, s0 << 1);
			acc0 = vis_faligndata(t0, acc0);
		} else if (num == 2) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U16_I(table1, s0 << 1);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U16_I(table0, s0 << 1);
			acc0 = vis_faligndata(t0, acc0);
		} else if (num == 3) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U16_I(table2, s0 << 1);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U16_I(table1, s0 << 1);
			acc0 = vis_faligndata(t0, acc0);

			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U16_I(table0, s0 << 1);
			acc0 = vis_faligndata(t0, acc0);
		}

/* only the lanes from dp up to dend are written */
		emask = vis_edge16(dp, dend);
		vis_pst_16(acc0, dp, emask);
	}
}
예제 #15
0
/*
 * Row-by-row absolute value of an S16 image.
 *
 *   dst, src - first pixel of the destination / source image
 *   dlb, slb - destination / source line stride in bytes (halved below to
 *              get a stride in pixels)
 *   wid      - pixels per row
 *   hgt      - number of rows
 *
 * Always returns MLIB_SUCCESS.
 *
 * The per-word work is done by the READ_PXLS_ALIGN, READ_PXLS_UNALIGN,
 * CALC_ABS_S16, CALC_ABS_S16_UNROLL and STORE_ABS_VALUES macros, which are
 * defined elsewhere; they presumably operate on the locals sp, dp, prev,
 * curr0, adat0, dabs, dtwo, mask and mask0 declared here (confirm against
 * the macro definitions).
 */
mlib_status
mlib_ImageAbs_S16(
    mlib_s16 *dst,
    mlib_s16 *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
{
/* 8-byte aligned src, dst ptrs */
	mlib_d64 *sp, *dp;

/* unaligned data */
	mlib_d64 prev;
	mlib_d64 curr0;
	mlib_d64 curr1, curr2;

/* aligned data */
	mlib_d64 adat0;

/* absolute values of result */
	mlib_d64 dabs;

/* aligned data */
	mlib_d64 adat1, adat2, adat3;

/* absolute values of result */
	mlib_d64 dabs0, dabs1, dabs2, dabs3;
/* constants; not referenced directly here, presumably used by the macros */
	mlib_d64 dtwo = vis_to_double_dup(0x20002);
	mlib_d64 mask = vis_to_double_dup(0x80008000);

/* last pixel of line */
	mlib_s16 *dlast;

/* bit mask results of comp */
	mlib_s32 mask0;

/* pxl count of source line */
	mlib_s32 slpxl = slb >> 1;

/* pxl count of destination line */
	mlib_s32 dlpxl = dlb >> 1;

/* dst offset for address alignment */
	mlib_s32 doffs;
	mlib_s32 row, block;

/* full blocks, each of N d64s */
	mlib_s32 numblocks;

	for (row = 0; row < hgt; row++) {

/* ROW SETUP */

/* last dst pixel in row */
		dlast = dst + wid - 1;
		doffs = (mlib_addr)dst & 7;
/* aligned dest ptr */
		dp = (mlib_d64 *)((mlib_addr)dst & ~7);
/* aligned src ptr */
/* src is backed up by the dst misalignment so both streams stay in step */
		sp = (mlib_d64 *)vis_alignaddr(src, -(mlib_s32)doffs);
		prev = *sp;

/* FIRST d64 NEEDS EDGE MASK FOR DESTINATION START POINT */

/* edge mask for start point */
		mask0 = vis_edge16(dst, dlast);
		READ_PXLS_UNALIGN;
		CALC_ABS_S16;
		vis_pst_16(dabs, dp++, mask0);
/*
 * whole d64 words that lie strictly before the word holding the last
 * pixel; that last word is always left to the masked store at the end
 */
		numblocks = ((mlib_u8 *)dlast + 1 - (mlib_u8 *)dp) >> 3;

/* DO MOST OF ROW IN BLOCKS OF N d64s */

		if ((((mlib_addr)src ^ (mlib_addr)dst) & 7) == 0) {
/* src and dst equally aligned: plain loads, no faligndata needed */
#pragma pipeloop(0)
			for (block = 0; block < numblocks - 3; block += 4) {
				adat0 = sp[0];
				adat1 = sp[1];
				adat2 = sp[2];

				CALC_ABS_S16_UNROLL(dabs0, adat0);
				adat3 = sp[3];
				dp[0] = dabs0;
				CALC_ABS_S16_UNROLL(dabs1, adat1);
				dp[1] = dabs1;
				CALC_ABS_S16_UNROLL(dabs2, adat2);
				dp[2] = dabs2;
				CALC_ABS_S16_UNROLL(dabs3, adat3);
				dp[3] = dabs3;

				sp += 4;
				dp += 4;
			}

/* remaining 0..3 whole words, one at a time */
#pragma pipeloop(0)
			for (; block < numblocks; block++) {
				READ_PXLS_ALIGN;
				CALC_ABS_S16;
				STORE_ABS_VALUES;
			}

/* reload so the shared tail code below sees the current word in prev */
			prev = *sp;
		} else {
/* different alignment: every output word is spliced from two loads */
#pragma pipeloop(0)
			for (block = 0; block < numblocks - 3; block += 4) {
				curr0 = *(sp + 1);
				curr1 = *(sp + 2);
				curr2 = *(sp + 3);

				adat0 = vis_faligndata(prev, curr0);
				prev = *(sp + 4);
				CALC_ABS_S16_UNROLL(dabs0, adat0);
				dp[0] = dabs0;
				adat1 = vis_faligndata(curr0, curr1);
				CALC_ABS_S16_UNROLL(dabs1, adat1);
				dp[1] = dabs1;
				adat2 = vis_faligndata(curr1, curr2);
				CALC_ABS_S16_UNROLL(dabs2, adat2);
				dp[2] = dabs2;
				adat3 = vis_faligndata(curr2, prev);
				CALC_ABS_S16_UNROLL(dabs3, adat3);
				dp[3] = dabs3;

				sp += 4;
				dp += 4;
			}

/* remaining 0..3 whole words, one at a time */
#pragma pipeloop(0)
			for (; block < numblocks; block++) {
				READ_PXLS_UNALIGN;
				CALC_ABS_S16;
				STORE_ABS_VALUES;
			}
		}

/* LAST d64 NEEDS EDGE MASK FOR DESTINATION END POINT */

		if ((mlib_addr)dp <= (mlib_addr)dlast) {
/*
 * NOTE(review): this is an ordinary load of the word after the current
 * one; it may touch memory past the last source pixel of the row
 */
			curr0 = *(++sp);
/* edge mask for end point */
			mask0 = vis_edge16(dp, dlast);
			adat0 = vis_faligndata(prev, curr0);
			CALC_ABS_S16;
			vis_pst_16(dabs, dp, mask0);
		}

/* ptrs to next src row */
		src += slpxl;
/* ptrs to next dst row */
		dst += dlpxl;
	}

	return (MLIB_SUCCESS);
}
예제 #16
0
/*
 * Element-wise product of two S16 images, scaled by `shift`, written to a
 * third S16 image.  The arithmetic is done by the MLIB_V_IMAGEMULSHIFT_S16
 * macro (defined elsewhere), which works on four samples per 8-byte word
 * and presumably uses the rdhh/rdhl/rdlh/rdll/rdh/rdl temporaries declared
 * below (confirm against the macro definition).
 *
 *   sp1, sp2 - first sample of source 1 / source 2
 *   dp       - first sample of the destination
 *   stride1, stride2, strided - line strides, in S16 elements (they are
 *              added directly to mlib_s16 pointers)
 *   width    - samples per line
 *   height   - number of lines
 *   shift    - scale; the GSR scale factor is set to (16 - shift) & 0x1f
 *
 * The destination is always written with aligned 8-byte stores (edge-
 * masked at both ends of a line).  Five code paths handle the possible
 * relative alignments of the three pointers.  A path that relies on two
 * pointers sharing an alignment also requires their strides to agree
 * modulo 4 elements (8 bytes), so the relation holds on every line.
 *
 * NOTE(review): the unaligned paths read spp1[1] / spp2[1] with ordinary
 * loads, which may touch the 8-byte word following the last sample of a
 * line; confirm that callers guarantee this memory is readable.
 */
void
mlib_v_ImageMulShift_S16(
    mlib_s16 *sp1,
    mlib_s32 stride1,
    mlib_s16 *sp2,
    mlib_s32 stride2,
    mlib_s16 *dp,
    mlib_s32 strided,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 shift)
{
/* pointers for line of source1 */
	mlib_s16 *sl1;

/* pointers for line of source2 */
	mlib_s16 *sl2;

/* pointers for line of dst */
	mlib_s16 *dl;
	mlib_s32 offdst, offsrc1, offsrc2, emask;
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
	mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
	mlib_s16 *dend;
	mlib_d64 rdhh, rdhl;
	mlib_d64 rdlh, rdll;
	mlib_d64 rdh, rdl;
	mlib_s32 i, j, k;

/* all three images are contiguous: treat them as one long line */
	if (width == stride1 && width == stride2 && width == strided) {
		width *= height;
		height = 1;
	}

/* initialize GSR scale factor */
	vis_write_gsr(((16 - shift) & 0x1f) << 3);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

/* byte offsets of each pointer within its 8-byte word */
	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

/* case 1: dst, src1 and src2 all share the same alignment */
	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 3) == 0) &&
	    (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/* i <= 0: minus the number of samples between dpp and dp */
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* partial first word (unaligned start, or line shorter than a word) */
			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}
/* whole words */
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				(*dpp++) = dd;
			}

/* partial last word */
			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/* case 2: dst and src1 share alignment; src2 goes through faligndata */
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
/* back src2 up by the dst misalignment; this call leaves the GSR offset */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd20 = spp2[0];

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd21 = spp2[1];
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd20 = sd21;
				spp2++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*spp1++);
				sd21 = spp2[1];
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = vis_faligndata(sd20, spp2[1]);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/* case 3: dst and src2 share alignment; src1 goes through faligndata */
	} else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
/* back src1 up by the dst misalignment; this call leaves the GSR offset */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

			sd10 = spp1[0];

			if (emask != 0xf) {
				sd20 = (*spp2++);
				sd11 = spp1[1];
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd10 = sd11;
				spp1++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd20 = (*spp2++);
				sd11 = spp1[1];
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd10 = sd11;
				spp1++;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd20 = (*spp2++);
				sd10 = vis_faligndata(sd10, spp1[1]);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * case 4: src1 and src2 share alignment but dst differs; multiply the
 * source words as loaded and realign the products instead
 */
	} else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* prime the pipeline with the product of the first word pair */
			sd10 = (*spp1++);
			sd20 = (*spp2++);
			MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd0);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp++, emask);
				dd0 = dd1;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
				(*dpp++) = vis_faligndata(dd0, dd1);
				dd0 = dd1;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * case 5: nothing lines up.  The GSR holds a single align offset, so each
 * line is done in two passes: realigned src1 is first copied into dst,
 * then read back and multiplied with realigned src2.
 */
	} else {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* partial first word: realign both sources individually */
			if (emask != 0xf) {
				spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
				sd10 = vis_faligndata(spp1[0], spp1[1]);
				spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
				sd20 = vis_faligndata(spp2[0], spp2[1]);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}

/* copy src1 to dst */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			sd11 = spp1[0];
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (k = i; k <= (width - 4); k += 4) {
				sd10 = sd11;
				sd11 = spp1[1];
				(*tmp_ptr++) = vis_faligndata(sd10, sd11);
				spp1++;
			}

/* realigned src1 data for the partial last word, kept for the tail */
			sd11 = vis_faligndata(sd11, spp1[1]);

/* second pass: switch the GSR offset to src2 */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
			sd20 = spp2[0];
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
/* src1 data stored in dst by the first pass */
				sd10 = (*tmp_ptr++);
				sd21 = spp2[1];
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd20 = vis_faligndata(sd20, spp2[1]);
				MLIB_V_IMAGEMULSHIFT_S16(sd11, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	}
}
/*
 * Single-input table look-up: every U16 source sample indexes two U8
 * tables and yields two output bytes (table[0] entry first, table[1]
 * entry second).
 *
 *   src   - xsize source samples
 *   dst   - 2 * xsize output bytes; cast straight to mlib_d64 * and written
 *           with full 8-byte stores, so presumably 8-byte aligned as the
 *           "DstA8" suffix suggests (confirm against callers)
 *   xsize - number of source samples
 *   table - table[0] and table[1] are the two look-up tables
 *
 * Four samples (eight bytes) are produced per iteration.  The function
 * overwrites the GSR align offset.
 */
void mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1(const mlib_u16 *src,
                                           mlib_u8        *dst,
                                           mlib_s32       xsize,
                                           const mlib_u8  **table)
{
  mlib_u16 *sp;                        /* pointer to source data */
  mlib_s32 s0, s1, s2, s3;             /* source data */
  mlib_u16 *dl;                        /* pointer to start of destination */
  mlib_u16 *dend;                      /* pointer to end of destination */
  mlib_d64 *dp;                        /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;                 /* destination data */
  mlib_d64 t3, t4, t5;                 /* destination data */
  mlib_d64 t6, t7, acc;                /* destination data */
  mlib_s32 emask;                      /* edge mask */
  mlib_s32 i, num;                     /* loop variable */
  const mlib_u8 *tab0 = &table[0][0];
  const mlib_u8 *tab1 = &table[1][0];

  /* the destination is addressed in 16-bit units: one unit per sample */
  sp = (void *)src;
  dl = (mlib_u16 *) dst;
  dp = (mlib_d64 *) dl;
  dend = dl + xsize - 1;

  /* align offset 7: each vis_faligndata(t, acc) pushes the last byte of t */
  /* in front of acc and drops the last byte of acc */
  vis_alignaddr((void *)0, 7);

  if (xsize >= 4) {

    s0 = sp[0];
    s1 = sp[1];
    s2 = sp[2];
    s3 = sp[3];
    sp += 4;

    /* software-pipelined: indices for the next group are loaded while the */
    /* current group is looked up, so the loop stops one group early */
#pragma pipeloop(0)
    for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
      t7 = VIS_LD_U8_I(tab1, s3);
      t6 = VIS_LD_U8_I(tab0, s3);
      t5 = VIS_LD_U8_I(tab1, s2);
      t4 = VIS_LD_U8_I(tab0, s2);
      t3 = VIS_LD_U8_I(tab1, s1);
      t2 = VIS_LD_U8_I(tab0, s1);
      t1 = VIS_LD_U8_I(tab1, s0);
      t0 = VIS_LD_U8_I(tab0, s0);
      /* bytes are pushed last-to-first so t0 ends up as byte 0 of acc; */
      /* after eight pushes no stale byte of acc remains */
      acc = vis_faligndata(t7, acc);
      acc = vis_faligndata(t6, acc);
      acc = vis_faligndata(t5, acc);
      acc = vis_faligndata(t4, acc);
      acc = vis_faligndata(t3, acc);
      acc = vis_faligndata(t2, acc);
      acc = vis_faligndata(t1, acc);
      acc = vis_faligndata(t0, acc);
      s0 = sp[0];
      s1 = sp[1];
      s2 = sp[2];
      s3 = sp[3];
      *dp++ = acc;
    }

    /* drain the pipeline: look up the last pre-loaded group */
    t7 = VIS_LD_U8_I(tab1, s3);
    t6 = VIS_LD_U8_I(tab0, s3);
    t5 = VIS_LD_U8_I(tab1, s2);
    t4 = VIS_LD_U8_I(tab0, s2);
    t3 = VIS_LD_U8_I(tab1, s1);
    t2 = VIS_LD_U8_I(tab0, s1);
    t1 = VIS_LD_U8_I(tab1, s0);
    t0 = VIS_LD_U8_I(tab0, s0);
    acc = vis_faligndata(t7, acc);
    acc = vis_faligndata(t6, acc);
    acc = vis_faligndata(t5, acc);
    acc = vis_faligndata(t4, acc);
    acc = vis_faligndata(t3, acc);
    acc = vis_faligndata(t2, acc);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    *dp++ = acc;
  }

  /* 1..3 samples left: build them back to front and store under a mask */
  if ((mlib_addr) dp <= (mlib_addr) dend) {

    num = (mlib_u16 *) dend - (mlib_u16 *) dp;
    /* point at the last remaining source sample */
    sp += num;
    num++;
#pragma pipeloop(0)
    for (i = 0; i < num; i++) {
      s0 = (mlib_s32) * sp;
      sp--;

      t0 = VIS_LD_U8_I(tab1, s0);
      acc = vis_faligndata(t0, acc);

      t0 = VIS_LD_U8_I(tab0, s0);
      acc = vis_faligndata(t0, acc);
    }

    /* only the 16-bit lanes from dp up to dend are written */
    emask = vis_edge16(dp, dend);
    vis_pst_16(acc, dp, emask);
  }
}
예제 #18
0
/*
 * Saturating element-wise addition of two S16 vectors: z = x + y.
 *
 *   z - destination vector, n elements
 *   x - first source vector, n elements
 *   y - second source vector, n elements
 *   n - number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The add, the saturation and the store are all done by the ADD16_SAT
 * macro, which is defined elsewhere; it presumably reads dx and dy, writes
 * through dpz under emask, and uses dz, dr0..dr2, mask1, mask2, ovl, und,
 * fzero, const_ovl and const_und as scratch (confirm against the macro
 * definition).  Everything below only arranges for dx, dy, dpz and emask
 * to hold the right values before each ADD16_SAT.
 */
mlib_status
__mlib_VectorAdd_S16_S16_Sat(
    mlib_s16 *z,
    const mlib_s16 *x,
    const mlib_s16 *y,
    mlib_s32 n)
{
    mlib_d64 *dpz, *dpx, *dpy;
    mlib_d64 dx, dy, dz, dx0, dx1, dy0, dy1, dr0, dr1, dr2;
    mlib_s16 *pz, *px, *py, *pzend;

    /* offset of address alignment in destination */
    mlib_s32 off;

    /* edge masks */
    mlib_s32 emask;
    mlib_s32 mask1, mask2;
    mlib_s32 ovl, und;
    mlib_d64 fzero = vis_fzero();
    /* 0x7fff in every lane, and its complement 0x8000 */
    mlib_d64 const_ovl = vis_to_double_dup(0x7fff7fff);
    mlib_d64 const_und = vis_fnot(const_ovl);
    mlib_s32 len = n, i;

    /* rest and leng in terms of 8 bytes. */
    mlib_s32 rest_8, even_8;

    if (n <= 0)
        return (MLIB_FAILURE);

    px = (mlib_s16 *)x;
    py = (mlib_s16 *)y;
    pz = (mlib_s16 *)z;

    dpz = (mlib_d64 *)((mlib_addr)z & (~7));
    /* off is 0 or negative: minus the byte misalignment of z */
    off = (long)dpz - (long)z;
    pzend = pz + n - 1;
    /*
     * generate edge mask for the start point
     */
    emask = vis_edge16(pz, pzend);

    /*
     * prepare the destination address
     */

    /*
     * z is not 8-byte aligned: handle the first, partial destination word
     * on its own, with both sources backed up by the same byte offset.
     */
    if (off) {
        dpy = (mlib_d64 *)vis_alignaddr(py, off);
        dy0 = vis_ld_d64_nf(dpy);
        dy1 = vis_ld_d64_nf(dpy + 1);
        dy = vis_faligndata(dy0, dy1);
        dpx = (mlib_d64 *)vis_alignaddr(px, off);
        dx0 = vis_ld_d64_nf(dpx);
        dx1 = vis_ld_d64_nf(dpx + 1);
        dx = vis_faligndata(dx0, dx1);
        ADD16_SAT;

        /* (8 + off) >> 1 = elements covered by that first word */
        px += (8 + off) >> 1;
        py += (8 + off) >> 1;
        len -= (8 + off) >> 1;
        dpz++;
    }

    if (len <= 0)
        return (MLIB_SUCCESS);

    /* from here on dpz is aligned: whole words first, then a remainder */
    even_8 = len >> 2;
    rest_8 = len & 0x3;
    emask = 0xf;

    /*
     * Now try to analyze source "x" and "y" addresses.
     */

    if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

        /*
         * Both addresses are 8-byte aligned. No  vis_alignaddr
         * and  vis_faligndata at all.
         */

        dpx = (mlib_d64 *)px;
        dpy = (mlib_d64 *)py;

        dx = vis_ld_d64_nf(dpx);
        dpx++;
        dy = vis_ld_d64_nf(dpy);
        dpy++;
        dx1 = vis_ld_d64_nf(dpx);
        dy1 = vis_ld_d64_nf(dpy);

        /* one word of look-ahead is kept in dx1 / dy1 */
        for (i = 0; i < even_8; i++) {
            ADD16_SAT;
            dx = dx1;
            dy = dy1;
            dpx++;
            dpy++;
            dpz++;
            dx1 = vis_ld_d64_nf(dpx);
            dy1 = vis_ld_d64_nf(dpy);
        }

        /* hand the not-yet-processed word to the shared tail code */
        dx1 = dx;
        dy1 = dy;
    } else if ((!((mlib_addr)px & 7))) {

        /*
         * First ("x") address is 8-byte aligned. vis_alignaddr
         * and vis_faligndata only for "y".
         */

        /* NOTE(review): this pragma is not followed by a loop statement */
#pragma unroll(1)
        /*
         * 11111
         */
        dpx = (mlib_d64 *)px;
        dx = vis_ld_d64_nf(dpx);
        dpx++;

        dpy = vis_alignaddr(py, 0);
        dy0 = vis_ld_d64_nf(dpy);
        dpy++;
        dy1 = vis_ld_d64_nf(dpy);
        dy = vis_faligndata(dy0, dy1);

        for (i = 0; i < even_8; i++) {
            ADD16_SAT;
            dx = vis_ld_d64_nf(dpx);
            dy0 = dy1;
            dy1 = vis_ld_d64_nf(dpy + 1);
            dy = vis_faligndata(dy0, dy1);
            dpz++;
            dpx++;
            dpy++;
        }

        /* hand the not-yet-processed words to the shared tail code */
        dx1 = dx;
        dy1 = dy0;
    } else if ((!((mlib_addr)py & 7))) {

        /*
         * Second ("y") address is 8-byte aligned. vis_alignaddr
         * and vis_faligndata only for "x".
         */

        dpy = (mlib_d64 *)py;

        dpx = vis_alignaddr(px, 0);
        dx1 = vis_ld_d64_nf(dpx);
        dpx++;

        for (i = 0; i < even_8; i++) {
            dy = *dpy;
            dx0 = dx1;
            dx1 = vis_ld_d64_nf(dpx);
            dx = vis_faligndata(dx0, dx1);
            ADD16_SAT;
            dpx++;
            dpy++;
            dpz++;
        }

        /* preload the y word the shared tail code will use */
        dy1 = vis_ld_d64_nf(dpy);
        dpy++;

        /* NOTE(review): this pragma sits before "} else if", not a loop */
#pragma unroll(8)
    } else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

        /*
         * Both ("x" and "y") address are identically aligned.
         * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop.
         */

        dpx = vis_alignaddr(px, 0);
        dx1 = vis_ld_d64_nf(dpx);
        dpx++;
        dpy = vis_alignaddr(py, 0);
        dy1 = vis_ld_d64_nf(dpy);
        dpy++;

        for (i = 0; i < even_8; i++) {
            dy0 = dy1;
            dy1 = vis_ld_d64_nf(dpy);
            dpy++;
            dy = vis_faligndata(dy0, dy1);
            dx0 = dx1;
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;
            dx = vis_faligndata(dx0, dx1);
            ADD16_SAT;
            dpz++;
        }
    } else {

        /*
         * Both ("x" and "y") address are arbitrary aligned.
         * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
         */

        dpy = vis_alignaddr(py, 0);
        dy0 = vis_ld_d64_nf(dpy);
        dpy++;
        dy1 = vis_ld_d64_nf(dpy);
        dy = vis_faligndata(dy0, dy1);
        dy0 = dy1;
        dy1 = vis_ld_d64_nf(dpy + 1);

        dpx = vis_alignaddr(px, 0);
        dx0 = vis_ld_d64_nf(dpx);
        dpx++;
        dx1 = vis_ld_d64_nf(dpx);
        dx = vis_faligndata(dx0, dx1);
        dx0 = dx1;
        dx1 = vis_ld_d64_nf(dpx + 1);

        for (i = 0; i < even_8; i++) {
            ADD16_SAT;
            /*
             * dpy / dpx are 8-byte aligned, so adding them as the offset
             * leaves the low three bits of py / px unchanged; these calls
             * only switch the GSR align offset between y and x.
             */
            vis_alignaddr(py, (mlib_addr)dpy);
            dy = vis_faligndata(dy0, dy1);
            vis_alignaddr(px, (mlib_addr)dpx);
            dx = vis_faligndata(dx0, dx1);
            dpz++;
            dpy++;
            dpx++;
            dy0 = dy1;
            dy1 = vis_ld_d64_nf(dpy + 1);
            dx0 = dx1;
            dx1 = vis_ld_d64_nf(dpx + 1);
        }

        /* reload the word preceding dpx / dpy for the shared tail code */
        dx1 = dpx[-1];
        dy1 = dpy[-1];
    }

    if (!rest_8)
        return (MLIB_SUCCESS);

    /*
     * prepare edge mask for the last bytes
     */

    /*
     * The first argument is not a real address: only its low bits
     * (rest_8 * 2) matter.  The complemented mask presumably selects the
     * first rest_8 lanes of the last destination word (confirm against the
     * vis_edge16 definition).
     */
    emask = ~(vis_edge16((void *)(rest_8 << 1), pzend));

    vis_alignaddr(px, 0);
    dx0 = dx1;
    dx1 = vis_ld_d64_nf(dpx);
    dx = vis_faligndata(dx0, dx1);
    vis_alignaddr(py, 0);
    dy0 = dy1;
    dy1 = vis_ld_d64_nf(dpy);
    dy = vis_faligndata(dy0, dy1);

    ADD16_SAT;

    return (MLIB_SUCCESS);
}