Exemple #1
0
/*
 * Stores the bitwise complement of `size` bytes read from `sa` into `da`.
 * Neither pointer needs any particular alignment: the destination is
 * written in 8-byte units starting at the aligned address at or below `da`,
 * with masked partial stores covering the first and last unit, and the
 * source is re-aligned through vis_alignaddr/vis_faligndata.
 */
void
mlib_v_ImageNot_na(
    mlib_u8 *sa,
    mlib_u8 *da,
    mlib_s32 size)
{
/* address of the last destination byte */
	mlib_u8 *dst_last;

/* destination cursor, always on an 8-byte boundary */
	mlib_d64 *dst64;

/* source cursor as returned by vis_alignaddr */
	mlib_d64 *src64;

/* two neighbouring source doublewords handed to vis_faligndata */
	mlib_d64 lo, hi;

/* byte position of dst64 relative to da (starts at zero or below) */
	mlib_s32 pos;

/* byte-enable mask for partial stores */
	mlib_s32 mask;

	dst64 = (mlib_d64 *)((mlib_addr)da & (~7));
	pos = (mlib_addr)dst64 - (mlib_addr)da;
	dst_last = da + size - 1;

/* move the source back by the same distance as the destination */
	src64 = (mlib_d64 *)vis_alignaddr(sa, pos);
	mask = vis_edge8(da, dst_last);

	hi = vis_ld_d64_nf(src64);

/* leading unit is incomplete: write only the bytes enabled by mask */
	if (mask != 0xff) {
		lo = hi;
		hi = vis_ld_d64_nf(src64 + 1);
		vis_pst_8(vis_fnot(vis_faligndata(lo, hi)), dst64, mask);
		dst64++;
		src64++;
		pos += 8;
	}

/* complete 8-byte units */
#pragma pipeloop(0)
	for (; pos <= (size - 8); pos += 8) {
		lo = hi;
		hi = vis_ld_d64_nf(src64 + 1);
		*dst64 = vis_fnot(vis_faligndata(lo, hi));
		dst64++;
		src64++;
	}

/* trailing incomplete unit */
	if (pos < size) {
		lo = hi;
		hi = vis_ld_d64_nf(src64 + 1);
		mask = vis_edge8(dst64, dst_last);
		vis_pst_8(vis_fnot(vis_faligndata(lo, hi)), dst64, mask);
	}
}
/*
 * Copies `size` bytes from `src` to `dst`, XOR-ing the data with the
 * repeating 16-bit pattern 0x8000.  The destination is written in aligned
 * 8-byte units (masked stores on the first and last unit); the source is
 * re-aligned with VIS_ALIGNADDR/vis_faligndata.
 */
void
mlib_ImageLineXor8000(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 size)
{
/* 0x8000 replicated into all four 16-bit lanes */
	mlib_d64 flip = vis_to_double_dup(0x80008000);

/* address of the last destination byte */
	mlib_u8 *dst_last;

/* aligned destination and source cursors */
	mlib_d64 *out64;
	mlib_d64 *in64;

/* two neighbouring source doublewords */
	mlib_d64 lo, hi;

/* byte position of out64 relative to dst (starts at zero or below) */
	mlib_s32 pos;

/* byte-enable mask for partial stores */
	mlib_s32 mask;

	out64 = (mlib_d64 *)((mlib_addr)dst & (~7));
	pos = (mlib_addr)out64 - (mlib_addr)dst;
	dst_last = (mlib_u8 *)dst + size - 1;

	in64 = (mlib_d64 *)VIS_ALIGNADDR(src, pos);
	mask = vis_edge8(dst, dst_last);

	hi = vis_ld_d64_nf(in64);

/* leading incomplete unit */
	if (mask != 0xff) {
		lo = hi;
		hi = vis_ld_d64_nf(in64 + 1);
		vis_pst_8(vis_fxor(vis_faligndata(lo, hi), flip), out64, mask);
		out64++;
		in64++;
		pos += 8;
	}

/*
 * Complete units whose look-ahead source doubleword is fetched with an
 * ordinary load; the loop stops one unit early.
 */
#pragma pipeloop(0)
	for (; pos <= (size - 16); pos += 8) {
		lo = hi;
		hi = in64[1];
		*out64 = vis_fxor(vis_faligndata(lo, hi), flip);
		out64++;
		in64++;
	}

/* last complete unit: the look-ahead uses the non-faulting load */
	if (pos <= (size - 8)) {
		lo = hi;
		hi = vis_ld_d64_nf(in64 + 1);
		*out64 = vis_fxor(vis_faligndata(lo, hi), flip);
		out64++;
		in64++;
		pos += 8;
	}

/* trailing incomplete unit */
	if (pos < size) {
		lo = hi;
		hi = vis_ld_d64_nf(in64 + 1);
		mask = vis_edge8(out64, dst_last);
		vis_pst_8(vis_fxor(vis_faligndata(lo, hi), flip), out64, mask);
	}
}
/*
 * Reads 16-bit samples from `colors` three doublewords (twelve samples) at
 * a time and writes one doubleword (four samples) to each of color1, color2
 * and color3 per step; `n` is the number of samples produced per output.
 * Which input sample lands in which output lane is determined by the GSR
 * alignment (4) and the bmask (0x02CE13DF) programmed below.
 * NOTE(review): all four pointers are accessed through mlib_d64 pointers,
 * so they are presumably expected to be 8-byte aligned -- confirm.
 * NOTE(review): this listing ends after the tail block; the return
 * statement and closing brace are not visible here.
 */
mlib_status
__mlib_VideoColorSplit3_S16(
	mlib_s16 *color1,
	mlib_s16 *color2,
	mlib_s16 *color3,
	const mlib_s16 *colors,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 *dp2 = (mlib_d64 *)color3;
	mlib_d64 sd0, sd1, sd2, dd0, dd1, dd2, dd3;
	mlib_s32 i;

/* alignment offset used by vis_faligndata, byte mask used by vis_bshuffle */
	vis_write_gsr(4);
	vis_write_bmask(0x02CE13DF, 0);
/* full groups of four samples per output */
#pragma pipeloop(0)
#pragma unroll(4)
	for (i = 0; i <= (n - 4); i += 4) {
		sd0 = sp[0];
		sd1 = sp[1];
		sd2 = sp[2];
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		(*dp0++) = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		(*dp1++) = vis_bshuffle(dd3, dd3);
		(*dp2++) = vis_bshuffle(dd1, dd2);
		sp += 3;
	}

/*
 * Remaining 1..3 samples per output: same shuffle as above, but the second
 * and third source doublewords use non-faulting loads and the results are
 * written with masked 16-bit partial stores.
 */

	if (i < n) {
/*
 * 0x78, 0x3C or 0x1E for n & 3 == 1, 2, 3; presumably vis_pst_16 uses the
 * low four bits, enabling the first (n & 3) lanes -- confirm.
 */
		mlib_s32 emask = 0xF0 >> (n & 3);
		mlib_d64 st0, st1, st2;

		sd0 = sp[0];
		sd1 = vis_ld_d64_nf(sp + 1);
		sd2 = vis_ld_d64_nf(sp + 2);
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		st0 = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		st1 = vis_bshuffle(dd3, dd3);
		st2 = vis_bshuffle(dd1, dd2);
		vis_pst_16(st0, dp0, emask);
		vis_pst_16(st1, dp1, emask);
		vis_pst_16(st2, dp2, emask);
	}
/*
 * Single-index look-up from S32 source pixels into four S16 channel tables,
 * for a destination that starts two channels into a pixel.
 *
 *   src    source indices; xsize + 1 of them are read when xsize >= 1,
 *          one otherwise
 *   dst    destination, written through an mlib_d64 pointer
 *   xsize  number of doublewords written before the two trailing samples
 *   table  four tables, each indexed with a bias of 2^31 elements
 *
 * Each stored doubleword is assembled from tab2[s0], tab3[s0], tab0[s1],
 * tab1[s1] (s0 = current index, s1 = next index): values are shifted into
 * the accumulator two bytes at a time by vis_faligndata with an alignment
 * offset of 6.  The final index contributes only tab2/tab3 via scalar
 * stores.
 */
void mlib_v_ImageLookUpSI_S32_S16_4_DstOff2_D1(mlib_s32 *src,
                                               mlib_s16 *dst,
                                               mlib_s32 xsize,
                                               mlib_s16 **table)
{
  mlib_s32 *sp;              /* pointer to source data */
  mlib_s32 s0, s1;           /* current and next source index */
  mlib_s16 *dl;              /* pointer to start of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3;   /* looked-up table entries */
  /*
   * Shift accumulator.  It is passed to vis_faligndata before it is ever
   * assigned, so give it a defined value; the initial contents are shifted
   * out completely by the first four vis_faligndata calls.
   */
  mlib_d64 acc = 0;
  mlib_s32 i;                /* loop variable */
  mlib_s16 *tab0 = &table[0][(mlib_u32)2147483648];
  mlib_s16 *tab1 = &table[1][(mlib_u32)2147483648];
  mlib_s16 *tab2 = &table[2][(mlib_u32)2147483648];
  mlib_s16 *tab3 = &table[3][(mlib_u32)2147483648];

  sp   = src;
  dl   = dst;
  dp   = (mlib_d64 *) dl;

  /* alignment offset 6: each vis_faligndata below shifts in two bytes */
  vis_alignaddr((void *) 0, 6);

  s0 = *sp++;

  if (xsize >= 1) {

    s1 = *sp++;

    /* xsize - 1 doublewords, each pre-fetching the next source index */
#pragma pipeloop(0)
    for(i = 0; i <= xsize - 2; i++) {
      t3 = vis_ld_u16_i(tab1, ((mlib_addr)2*s1));
      t2 = vis_ld_u16_i(tab0, ((mlib_addr)2*s1));
      t1 = vis_ld_u16_i(tab3, ((mlib_addr)2*s0));
      t0 = vis_ld_u16_i(tab2, ((mlib_addr)2*s0));
      acc = vis_faligndata(t3, acc);
      acc = vis_faligndata(t2, acc);
      acc = vis_faligndata(t1, acc);
      acc = vis_faligndata(t0, acc);
      s0 = s1;
      s1 = *sp++;
      *dp++ = acc;
    }
    /* last doubleword: same assembly, but no further source read */
    t3 = vis_ld_u16_i(tab1, ((mlib_addr)2*s1));
    t2 = vis_ld_u16_i(tab0, ((mlib_addr)2*s1));
    t1 = vis_ld_u16_i(tab3, ((mlib_addr)2*s0));
    t0 = vis_ld_u16_i(tab2, ((mlib_addr)2*s0));
    acc = vis_faligndata(t3, acc);
    acc = vis_faligndata(t2, acc);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    s0 = s1;
    *dp++ = acc;
  }

  dl = (mlib_s16*)dp;

  /* remaining two channels of the final source index */
  dl[0] = tab2[s0];
  dl[1] = tab3[s0];
}
/*
 * Reads 16-bit samples from `colors` two doublewords (eight samples) at a
 * time and writes one doubleword (four samples) to each of color1 and
 * color2 per step; `n` is the number of samples produced per output.
 * The loop is software-pipelined: results computed in one step are stored
 * in the next, and the loads run two doublewords ahead using non-faulting
 * loads.
 * NOTE(review): the pointers are accessed through mlib_d64 pointers, so
 * they are presumably expected to be 8-byte aligned -- confirm.
 * NOTE(review): this listing ends after the tail block; the return
 * statement and closing brace are not visible here.
 */
mlib_status
__mlib_VideoColorSplit2_S16(
	mlib_s16 *color1,
	mlib_s16 *color2,
	const mlib_s16 *colors,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 sd0, sd1, dd0, dd1, dd2, dd3;
	mlib_s32 i;

/*
 * One GSR write sets both fields: 0x014589cd in the upper word (presumably
 * the vis_bshuffle mask) and 2 in the low bits (vis_faligndata offset).
 */
	vis_write_gsr64(((mlib_u64)0x014589cd << 32) | 2);
/*
 * Pipeline prologue: compute the first pair of results (dd2, dd3) and
 * pre-shift the second pair of inputs.
 */
	sd0 = sp[0];
	sd1 = vis_ld_d64_nf(sp + 1);
	dd0 = vis_faligndata(sd0, sd1);
	dd1 = vis_faligndata(sd1, sd0);
	dd2 = vis_bshuffle(sd0, sd1);
	dd3 = vis_bshuffle(dd0, dd1);
	sd0 = vis_ld_d64_nf(sp + 2);
	sd1 = vis_ld_d64_nf(sp + 3);
	dd0 = vis_faligndata(sd0, sd1);
	dd1 = vis_faligndata(sd1, sd0);
/* steady state: store the previous results, shuffle current, load next */
#pragma pipeloop(0)
	for (i = 0; i < (n / 4); i++) {
		(*dp0++) = dd2;
		(*dp1++) = dd3;
		dd2 = vis_bshuffle(sd0, sd1);
		dd3 = vis_bshuffle(dd0, dd1);
		sd0 = vis_ld_d64_nf(sp + 4);
		sd1 = vis_ld_d64_nf(sp + 5);
		dd0 = vis_faligndata(sd0, sd1);
		dd1 = vis_faligndata(sd1, sd0);
		sp += 2;
	}

/*
 * Remaining 1..3 samples per output: recompute from the current source
 * position and write with masked 16-bit partial stores.
 */

	if (n & 3) {
/*
 * 0x78, 0x3C or 0x1E for n & 3 == 1, 2, 3; presumably vis_pst_16 uses the
 * low four bits, enabling the first (n & 3) lanes -- confirm.
 */
		mlib_s32 emask = 0xF0 >> (n & 3);

		sd0 = sp[0];
		sd1 = vis_ld_d64_nf(sp + 1);
		dd0 = vis_faligndata(sd0, sd1);
		dd1 = vis_faligndata(sd1, sd0);
		dd2 = vis_bshuffle(sd0, sd1);
		dd3 = vis_bshuffle(dd0, dd1);
		vis_pst_16(dd2, (mlib_f32 *)dp0, emask);
		vis_pst_16(dd3, (mlib_f32 *)dp1, emask);
	}
/*
 * Converts a 2-D image row by row: every step consumes three source
 * doublewords (24 bytes) and stores one destination doubleword (8 bytes).
 *
 *   src, slb   source pointer and source line stride in bytes
 *   dst, dlb   destination pointer and destination line stride in bytes
 *   xsize      row width; xsize / 8 steps are executed per row
 *   ysize      number of rows
 *   weight     three weights, each scaled by 8192 before use
 *
 * NOTE(review): the channel separation and weighting are done by the
 * CHANNELSEPARATE_U8_AL and CHANNELWEIGHT_U8 macros, which are not visible
 * here.  The otherwise unused locals (d32, e32, ddt, alpha, beta, gamma,
 * mask0, mask1) are presumably referenced by name inside those macros --
 * do not rename or remove them without checking.
 * NOTE(review): src and dst are accessed through mlib_d64 pointers, so
 * 8-byte alignment of both pointers and strides is presumably required.
 */
void
mlib_v_ImageColorRGB2Mono_U8_A8D2X8(
    const mlib_u8 *src,
    mlib_s32 slb,
    mlib_u8 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize,
    const mlib_d64 *weight)
{
    /* running and line-start pointers for source and destination */
    mlib_d64 *sp, *dp;
    mlib_d64 *sl, *dl;
    mlib_d64 sd0, sd1, sd2, sd3;
    mlib_d64 dd;
    mlib_d64 rgdd0, bdd0, rgdd1, bdd1, ddt;
    mlib_f32 d32, e32, alpha, gamma, beta;
    mlib_s32 i, j;
    mlib_s32 mask0 = 0x0369147a;
    mlib_s32 mask1 = 0x258b258b;

    sp = sl = (mlib_d64 *)src;
    dp = dl = (mlib_d64 *)dst;

    /* prepare the weight */
    alpha = vis_to_float(weight[0] * 8192);
    beta = vis_to_float(weight[1] * 8192);
    gamma = vis_to_float(weight[2] * 8192);
    /* GSR value (2 << 3) + 4; the 4 is the vis_faligndata byte offset */
    vis_write_gsr((2 << 3) + 4);
    for (j = 0; j < ysize; j++) {
#pragma pipeloop(0)
        for (i = 0; i < xsize / 8; i++) {
            sd0 = (*sp++);
            sd1 = (*sp++);
            sd2 = (*sp++);
            /* derive a fourth operand by re-aligning the loaded data */
            sd3 = vis_faligndata(sd2, sd2);
            sd2 = vis_faligndata(sd1, sd2);
            CHANNELSEPARATE_U8_AL(sd0, sd1, sd2, sd3, rgdd0, bdd0,
                                  rgdd1, bdd1);
            CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
            (*dp++) = dd;
        }

        /* advance both line pointers by their byte strides */
        sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
        dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
    }
}
Exemple #7
0
/*
 * Look-up of U8 source bytes into U16 results, four bytes per step.
 * The source is read as 32-bit words; the most significant byte of each
 * word indexes table0, the next table1, then table2, and the least
 * significant byte table3.  The four 16-bit results are shifted into acc0
 * by vis_faligndata (alignment offset 6) and stored as one doubleword.
 * NOTE(review): src is read through an mlib_u32 pointer and dst written
 * through an mlib_d64 pointer, so 4-byte / 8-byte alignment is presumably
 * required -- confirm.
 * NOTE(review): this listing ends inside the function; the handling of the
 * remaining pixels and the closing brace are not visible here (sp, dend,
 * emask and num are unused in the visible part).
 */
void mlib_v_ImageLookUp_U8_U16_124_SrcOff0_D1(const mlib_u8  *src,
                                              mlib_u16       *dst,
                                              mlib_s32       xsize,
                                              const mlib_u16 *table0,
                                              const mlib_u16 *table1,
                                              const mlib_u16 *table2,
                                              const mlib_u16 *table3)
{
  mlib_u32 *sa;          /* aligned pointer to source data */
  mlib_u8  *sp;          /* pointer to source data */
  mlib_u32 s0;           /* source data */
  mlib_u16 *dl;          /* pointer to start of destination */
  mlib_u16 *dend;        /* pointer to end of destination */
  mlib_d64 *dp;          /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;   /* destination data */
  mlib_d64 t3, acc0;     /* destination data */
  mlib_s32 emask;        /* edge mask */
  mlib_s32 i, num;       /* loop variable */

  sa   = (mlib_u32*)src;
  dl   = dst;
  dp   = (mlib_d64 *) dl;
  dend = dl + xsize - 1;

  /* alignment offset 6: each vis_faligndata below shifts in two bytes */
  vis_alignaddr((void *) 0, 6);

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;

    /*
     * Each byte of s0 is turned into a byte offset (index * 2, masked with
     * 0x1FE) into its table.  The next source word is pre-fetched inside
     * the loop, hence the bound of xsize - 8.
     */
#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4) {
      t3 = VIS_LD_U16_I(table3, (s0 << 1) & 0x1FE);
      t2 = VIS_LD_U16_I(table2, (s0 >> 7) & 0x1FE);
      t1 = VIS_LD_U16_I(table1, (s0 >> 15) & 0x1FE);
      t0 = VIS_LD_U16_I(table0, (s0 >> 23) & 0x1FE);
      acc0 = vis_faligndata(t3, acc0);
      acc0 = vis_faligndata(t2, acc0);
      acc0 = vis_faligndata(t1, acc0);
      acc0 = vis_faligndata(t0, acc0);
      s0 = *sa++;
      *dp++ = acc0;
    }

    /* last pre-fetched word: same look-up without a further source read */
    t3 = VIS_LD_U16_I(table3, (s0 << 1) & 0x1FE);
    t2 = VIS_LD_U16_I(table2, (s0 >> 7) & 0x1FE);
    t1 = VIS_LD_U16_I(table1, (s0 >> 15) & 0x1FE);
    t0 = VIS_LD_U16_I(table0, (s0 >> 23) & 0x1FE);
    acc0 = vis_faligndata(t3, acc0);
    acc0 = vis_faligndata(t2, acc0);
    acc0 = vis_faligndata(t1, acc0);
    acc0 = vis_faligndata(t0, acc0);
    *dp++ = acc0;
  }
/*
 * Single-index look-up from S16 source pixels into four U16 channel tables.
 *
 *   src    xsize source indices
 *   dst    destination, written through an mlib_d64 pointer; one
 *          doubleword (four channels) is stored per source pixel
 *   xsize  number of source pixels; nothing is done when xsize < 1
 *   table  four tables, each indexed with a bias of 32768 elements so that
 *          negative source values are valid indices
 *
 * For every pixel the doubleword is assembled from tab0..tab3 at the same
 * index: values are shifted into the accumulator two bytes at a time by
 * vis_faligndata with an alignment offset of 6.
 */
void mlib_v_ImageLookUpSI_S16_U16_4_DstOff0_D1(const mlib_s16 *src,
                                               mlib_u16       *dst,
                                               mlib_s32       xsize,
                                               const mlib_u16 **table)
{
  mlib_s16 *sp;              /* pointer to source data */
  mlib_s32 s0;               /* source index as a byte offset (index * 2) */
  mlib_u16 *dl;              /* pointer to start of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3;   /* looked-up table entries */
  /*
   * Shift accumulator.  It is passed to vis_faligndata before it is ever
   * assigned, so give it a defined value; the initial contents are shifted
   * out completely by the first four vis_faligndata calls.
   */
  mlib_d64 acc = 0;
  mlib_s32 i;                /* loop variable */
  const mlib_u16 *tab0 = &table[0][32768];
  const mlib_u16 *tab1 = &table[1][32768];
  const mlib_u16 *tab2 = &table[2][32768];
  const mlib_u16 *tab3 = &table[3][32768];

  sp   = (void *)src;
  dl   = dst;
  dp   = (mlib_d64 *) dl;

  /* alignment offset 6: each vis_faligndata below shifts in two bytes */
  vis_alignaddr((void *) 0, 6);

  if (xsize >= 1) {

    /*
     * Multiply instead of shifting: the source value may be negative and
     * left-shifting a negative int is undefined behaviour in C.
     */
    s0 = (*sp++) * 2;

    /* xsize - 1 pixels, each pre-fetching the next source index */
#pragma pipeloop(0)
    for(i = 0; i <= xsize - 2; i++) {
      t3 = VIS_LD_U16_I(tab3, s0);
      t2 = VIS_LD_U16_I(tab2, s0);
      t1 = VIS_LD_U16_I(tab1, s0);
      t0 = VIS_LD_U16_I(tab0, s0);
      acc = vis_faligndata(t3, acc);
      acc = vis_faligndata(t2, acc);
      acc = vis_faligndata(t1, acc);
      acc = vis_faligndata(t0, acc);
      s0 = (*sp++) * 2;
      *dp++ = acc;
    }

    /* last pixel: same assembly, but no further source read */
    t3 = VIS_LD_U16_I(tab3, s0);
    t2 = VIS_LD_U16_I(tab2, s0);
    t1 = VIS_LD_U16_I(tab1, s0);
    t0 = VIS_LD_U16_I(tab0, s0);
    acc = vis_faligndata(t3, acc);
    acc = vis_faligndata(t2, acc);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    *dp++ = acc;
  }
}
/*
 * Three-coefficient accumulation step: for each of `sw` doublewords, adds
 * k0*a0 + k1*a1 + k2*a2 to buffd[i], where a0 is the current doubleword of
 * the error line and a1, a2 are successively shifted windows built from a0
 * and the following doubleword with vis_faligndata.
 * NOTE(review): the function name and parameter list come from the FUNC and
 * FUNC_M_ARG macros, which are not visible here; pkern, perror1, buffd and
 * sw are presumably supplied by FUNC_M_ARG.  The window shift depends on
 * the low bits of perror1 as captured by vis_alignaddr(perror1, 0).
 */
static void FUNC(
    m3) (
    FUNC_M_ARG)
{
	mlib_s32 i;
	mlib_d64 k0 = pkern[0];
	mlib_d64 k1 = pkern[1];
	mlib_d64 k2 = pkern[2];
	mlib_d64 a0, a1, a2, aa, sum;
/* aligned base of the error line; also sets the vis_faligndata offset */
	mlib_d64 *perror = vis_alignaddr(perror1, 0);

	a0 = (*perror++);

	for (i = 0; i < sw; i++) {
		aa = (*perror++);
/* a1: window over (a0, aa); a2: window over (a1, re-aligned aa) */
		a1 = vis_faligndata(a0, aa);
		a2 = vis_faligndata(a1, vis_faligndata(aa, aa));
		sum = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0));
		sum = vis_fpadd16(sum, FMUL_16x16(k1, a1));
		buffd[i] = vis_fpadd16(sum, FMUL_16x16(k2, a2));
		a0 = aa;
	}
}
/*
 * One-coefficient accumulation step: for each of (sw + 3) / 4 doublewords,
 * adds k0 * a0 to buffd[i], where a0 is the window that vis_faligndata
 * extracts from two consecutive doublewords of the error line.
 * NOTE(review): the function name and parameter list come from the FUNC and
 * FUNC_M_ARG macros, which are not visible here; pkern, perror1, buffd and
 * sw are presumably supplied by FUNC_M_ARG.  Unlike the m3 variant the loop
 * count here is (sw + 3) / 4, so sw appears to count 16-bit elements in
 * this function -- confirm against the callers.
 */
static void FUNC(
    m1) (
    FUNC_M_ARG)
{
	mlib_s32 i;
	mlib_d64 k0 = pkern[0];
	mlib_d64 a0, e0, e1;
/* aligned base of the error line; also sets the vis_faligndata offset */
	mlib_d64 *perror = vis_alignaddr(perror1, 0);

	e0 = (*perror++);

	for (i = 0; i < (sw + 3) / 4; i++) {
		e1 = (*perror++);
		a0 = vis_faligndata(e0, e1);
		buffd[i] = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0));
		e0 = e1;
	}
}
Exemple #11
0
/*
 * Converts 32-bit pixels by shifting each one right by eight bits and
 * forcing the top byte to 0xFF (see the scalar form
 * 0xff000000 | (src[i] >> 8)).  Pixel pairs are handled with
 * vis_faligndata at alignment offset 7 followed by an OR with 0xFF000000
 * in both words; single pixels at an unaligned start or odd end use the
 * scalar form.
 * NOTE(review): the parameters (srcBase, dstBase, width, height, pSrcInfo,
 * pDstInfo) come from the BLIT_PARAMS macro, which is not visible here.
 * NOTE(review): this listing ends inside the row loop; the per-row pointer
 * advance and the closing braces are not visible here.
 */
void ADD_SUFF(IntRgbxToIntArgbConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dd, mask;
    mlib_s32 i, i0, j;

    /* rows are contiguous in both images: treat the image as one long row */
    if (dstScan == 4*width && srcScan == 4*width) {
	width *= height;
	height = 1;
    }

    mask = vis_to_double_dup(0xFF000000);
    vis_alignaddr(NULL, 7);

    for (j = 0; j < height; j++) {
	mlib_u32 *src = srcBase;
	mlib_u32 *dst = dstBase;

	i = i0 = 0;

	/* destination not 8-byte aligned: convert one pixel with scalar code */
	if ((mlib_s32)dst & 7) {
	    dst[i] = 0xff000000 | (src[i] >> 8);
	    i0 = 1;
	}

	/* two pixels per step, stored as one doubleword */
#pragma pipeloop(0)
	for (i = i0; i <= (mlib_s32)width - 2; i += 2) {
	    dd = vis_freg_pair(((mlib_f32*)src)[i], ((mlib_f32*)src)[i + 1]);
	    dd = vis_faligndata(dd, dd);
	    *(mlib_d64*)(dst + i) = vis_for(dd, mask);
	}

	/* odd pixel left over at the end of the row */
	if (i < width) {
	    dst[i] = 0xff000000 | (src[i] >> 8);
	}
/*
 * Look-up of U8 source bytes into U8 results, eight bytes per step.
 * The source is read as pairs of 32-bit words; within each word the most
 * significant byte indexes table0, the next table1, then table2, and the
 * least significant byte table3.  Results for each word are shifted into
 * acc0 / acc1 one byte at a time by vis_faligndata (alignment offset 7),
 * and vis_bshuffle with mask 0x012389ab combines the two accumulators into
 * the stored doubleword.
 * NOTE(review): src is read through an mlib_u32 pointer and dst written
 * through an mlib_d64 pointer, so 4-byte / 8-byte alignment is presumably
 * required -- confirm.
 * NOTE(review): this listing ends inside the function; the handling of the
 * remaining pixels and the closing brace are not visible here (sp, dend,
 * emask and num are unused in the visible part).
 */
void
mlib_v_ImageLookUp_U8_U8_124_SrcOff0_D1(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 *table0,
    const mlib_u8 *table1,
    const mlib_u8 *table2,
    const mlib_u8 *table3)
{
/* aligned pointer to source data */
	mlib_u32 *sa;

/* pointer to source data */
	mlib_u8 *sp;

/* source data */
	mlib_u32 s0, s1;

/* pointer to start of destination */
	mlib_u8 *dl;

/* pointer to end of destination */
	mlib_u8 *dend;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2;

/* destination data */
	mlib_d64 t3, t4, t5;

/* destination data */
	mlib_d64 t6, t7, acc0, acc1;

/* edge mask */
	mlib_s32 emask;

/* loop variable */
	mlib_s32 i, num;

	sa = (mlib_u32 *)src;
	dl = dst;
	dp = (mlib_d64 *)dl;
	dend = dl + xsize - 1;

/* alignment offset 7: each vis_faligndata below shifts in one byte */
	vis_alignaddr((void *)0, 7);

	if (xsize >= 8) {

		s0 = sa[0];
		s1 = sa[1];
		sa += 2;

		vis_write_bmask(0x012389ab, 0);

/*
 * The next two source words are pre-fetched inside the loop, hence the
 * bound of xsize - 16.
 */
#pragma pipeloop(0)
		for (i = 0; i <= xsize - 16; i += 8, sa += 2) {
			t7 = VIS_LD_U8_I(table3, s1 & 0xFF);
			t6 = VIS_LD_U8_I(table2, (s1 >> 8) & 0xFF);
			t5 = VIS_LD_U8_I(table1, (s1 >> 16) & 0xFF);
			t4 = VIS_LD_U8_I(table0, s1 >> 24);
			t3 = VIS_LD_U8_I(table3, s0 & 0xFF);
			t2 = VIS_LD_U8_I(table2, (s0 >> 8) & 0xFF);
			t1 = VIS_LD_U8_I(table1, (s0 >> 16) & 0xFF);
			t0 = VIS_LD_U8_I(table0, s0 >> 24);
			acc1 = vis_faligndata(t7, acc1);
			acc1 = vis_faligndata(t6, acc1);
			acc1 = vis_faligndata(t5, acc1);
			acc1 = vis_faligndata(t4, acc1);
			acc0 = vis_faligndata(t3, acc0);
			acc0 = vis_faligndata(t2, acc0);
			acc0 = vis_faligndata(t1, acc0);
			acc0 = vis_faligndata(t0, acc0);
			s0 = sa[0];
			s1 = sa[1];
			(*dp++) = vis_bshuffle(acc0, acc1);
		}

/* last pre-fetched pair: same look-up without a further source read */
		t7 = VIS_LD_U8_I(table3, s1 & 0xFF);
		t6 = VIS_LD_U8_I(table2, (s1 >> 8) & 0xFF);
		t5 = VIS_LD_U8_I(table1, (s1 >> 16) & 0xFF);
		t4 = VIS_LD_U8_I(table0, s1 >> 24);
		t3 = VIS_LD_U8_I(table3, s0 & 0xFF);
		t2 = VIS_LD_U8_I(table2, (s0 >> 8) & 0xFF);
		t1 = VIS_LD_U8_I(table1, (s0 >> 16) & 0xFF);
		t0 = VIS_LD_U8_I(table0, s0 >> 24);
		acc1 = vis_faligndata(t7, acc1);
		acc1 = vis_faligndata(t6, acc1);
		acc1 = vis_faligndata(t5, acc1);
		acc1 = vis_faligndata(t4, acc1);
		acc0 = vis_faligndata(t3, acc0);
		acc0 = vis_faligndata(t2, acc0);
		acc0 = vis_faligndata(t1, acc0);
		acc0 = vis_faligndata(t0, acc0);
		(*dp++) = vis_bshuffle(acc0, acc1);
	}
/*
 * Converts n signed 16-bit elements of x into signed 32-bit elements of z.
 *
 * Vectors shorter than 16 elements are handed to the EXPAND macro.
 * NOTE(review): EXPAND is not visible here; since execution would otherwise
 * fall through into the vector path, it presumably returns from the
 * function, and it may reference the locals by name -- confirm before
 * renaming anything.
 *
 * Longer vectors: scalar copies until the destination is 8-byte aligned,
 * then four elements per source doubleword (two destination doublewords)
 * using vis_fmuld8ulx16 with a constant of 0x0001 in both 16-bit halves,
 * then a scalar loop for the last (length & 3) elements.
 * Returns MLIB_SUCCESS on this path.
 */
mlib_status
__mlib_VectorConvert_S32_S16_Mod(
	mlib_s32 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_s16 *src = x;
	mlib_s32 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	mlib_s32 len_64, even_length, rest_64, length = n;
	mlib_d64 dd1, dd2, dd3, dd4;
	mlib_f32 two_16_ones = vis_to_float(0x10001);

	if (length < 16) {

		EXPAND(mlib_s16, mlib_s32);
	}

/* scalar copies until dst reaches an 8-byte boundary */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

/*
 * len_64: number of source doublewords (4 elements each);
 * even_length: elements covered by them; rest_64: leftover elements.
 */
	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
	rest_64 = length & 3;
	len_64 = length >> 2;
	even_length = len_64 << 2;
	dd2 = ddsrc[0];

	if (!((mlib_addr)(src) & 7)) {

/*
 * Source vector is 64-bit aligned. We can process it without
 * vis_faligndata.
 * Peeling of 1 iteration.
 */

/* i becomes 1 when len_64 is odd, so the loop below runs in pairs */
		if (i = (len_64 & 1)) {
			dd3 = (*ddsrc++);
/*
 * Now obtaining of the 4*32 - signed objects
 */
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_hi(dd3));
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_lo(dd3));
		}
#pragma pipeloop(1)
#pragma unroll(1)
		for (; i < len_64; i += 2) {
			dd3 = (*ddsrc++);
			dd4 = (*ddsrc++);
/*
 * Now obtaining of the 4*32 - signed objects
 */
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_hi(dd3));
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_lo(dd3));
/*
 * Now obtaining of the 4*32 - signed objects
 */
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_hi(dd4));
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_lo(dd4));
		}
	} else {

/*
 * Source vector is not 64-bit aligned. Use vis_faligndata.
 * Peeling of 1 iteration.
 */

/*
 * Here i is the index (relative to ddsrc) of the next doubleword to load;
 * ddsrc itself is not advanced in this branch.
 */
		i = 1;

		if ((len_64 & 1)) {
			i++;
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1);
			dd3 = vis_faligndata(dd1, dd2);
/*
 * Now obtaining of the 4*32 - signed objects
 */
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_hi(dd3));
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_lo(dd3));
		}

/*
 * Now loop with step == 2.
 */

#pragma pipeloop(1)
#pragma unroll(1)
		for (; i <= len_64; i += 2) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i);
			dd3 = vis_faligndata(dd1, dd2);
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_faligndata(dd1, dd2);
/*
 * Now obtaining of the 4*32 - signed objects
 */
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_hi(dd3));
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_lo(dd3));
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_hi(dd4));
			(*ddst++) =
				vis_fmuld8ulx16(two_16_ones, vis_read_lo(dd4));
		}
	}

/* leftover elements; src and dst still point at the start of the vector part */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = src[even_length + i];

	return (MLIB_SUCCESS);
}
/*
 * Converts n signed 8-bit elements of x into signed 32-bit elements of z.
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * One element is copied with scalar code if z is not 8-byte aligned; then
 * eight elements (one source doubleword, four destination doublewords) are
 * converted per step; whatever is left is copied with scalar code.
 */
mlib_status
__mlib_VectorConvert_S32_S8_Mod(
	mlib_s32 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *in = (mlib_s8 *)x;
	mlib_s32 *out = (mlib_s32 *)z;
/* 0x0001 in both 16-bit halves: multiplier for vis_fmuld8ulx16 */
	mlib_f32 unit16 = vis_to_float(0x10001);
/* 0x0100 in all four 16-bit lanes: multiplier for vis_fmul8sux16 */
	mlib_d64 unit_hi8 = vis_to_double_dup(0x1000100);
	mlib_d64 *in64, *out64;
/* current eight source bytes and the raw doublewords around them */
	mlib_d64 cur, prev, next;
/* 16-bit intermediate and the four result doublewords */
	mlib_d64 w16, r0, r1, r2, r3;
	mlib_s32 k = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

/* bring the destination onto an 8-byte boundary */
	if ((mlib_addr)out & 7) {
		*out = *in;
		out++;
		in++;
		k = 1;
	}

	in64 = (mlib_d64 *)vis_alignaddr(in, 0);
	cur = vis_ld_d64_nf(in64);
/* byte selection used by vis_bshuffle for the first pair of each group */
	vis_write_bmask(0x00012223, 0);

	if ((mlib_addr)in & 7) {
/* unaligned source: every group is extracted with vis_faligndata */
		next = vis_ld_d64_nf(in64 + 1);
		cur = vis_faligndata(cur, next);
#pragma pipeloop(1)
#pragma unroll(1)
		for (; k <= (n - 8); k += 8) {
			w16 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(cur),
				vis_read_hi(cur)), unit_hi8);
			r0 = vis_bshuffle(w16, w16);
			r1 = vis_fmuld8ulx16(unit16, vis_read_lo(w16));
			w16 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(cur),
				vis_read_lo(cur)), unit_hi8);
			r2 = vis_fmuld8ulx16(unit16, vis_read_hi(w16));
			r3 = vis_fmuld8ulx16(unit16, vis_read_lo(w16));

/* prepare the next group before storing this one */
			prev = next;
			next = vis_ld_d64_nf(in64 + 2);
			cur = vis_faligndata(prev, next);

			out64 = (mlib_d64 *)out;
			out64[0] = r0;
			out64[1] = r1;
			out64[2] = r2;
			out64[3] = r3;
			out += 8;
			in += 8;
			in64++;
		}
	} else {
/* aligned source: groups are taken straight from memory */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; k <= (n - 8); k += 8) {
			w16 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(cur),
				vis_read_hi(cur)), unit_hi8);
			r0 = vis_bshuffle(w16, w16);
			r1 = vis_fmuld8ulx16(unit16, vis_read_lo(w16));
			w16 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(cur),
				vis_read_lo(cur)), unit_hi8);
			r2 = vis_bshuffle(w16, w16);
			r3 = vis_fmuld8ulx16(unit16, vis_read_lo(w16));

			cur = vis_ld_d64_nf(in64 + 1);

			out64 = (mlib_d64 *)out;
			out64[0] = r0;
			out64[1] = r1;
			out64[2] = r2;
			out64[3] = r3;
			out += 8;
			in += 8;
			in64++;
		}
	}

/* scalar tail */
	for (; k < n; k++) {
		*out = *in;
		out++;
		in++;
	}

	return (MLIB_SUCCESS);
}
/*
 * Converts n signed 8-bit elements of x into signed 16-bit elements of z.
 *
 * Vectors shorter than 16 elements are handed to the EXPAND macro.
 * NOTE(review): EXPAND is not visible here; since execution would otherwise
 * fall through into the vector path, it presumably returns from the
 * function, and it may reference the locals by name -- confirm before
 * renaming anything.
 *
 * Longer vectors: scalar copies until the destination is 8-byte aligned,
 * then eight elements per source doubleword (two destination doublewords),
 * then a scalar loop for the last (length & 7) elements.
 * Returns MLIB_SUCCESS on this path.
 */
mlib_status
__mlib_VectorConvert_S16_S8_Mod(
	mlib_s16 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_s8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
/* 0x0100 in all four 16-bit lanes: multiplier for vis_fmul8sux16 */
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 fzero = vis_fzeros();
	mlib_s32 len_64, even_length, rest_64, length = n, off;
	mlib_d64 dd0, dd1, dd2, dd4, dd5, dd6, dd7;

	if (length < 16) {
		EXPAND(mlib_s8, mlib_s16);
	}

/* scalar copies until dst reaches an 8-byte boundary */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

/*
 * len_64: number of source doublewords (8 elements each);
 * even_length: elements covered by them; rest_64: leftover elements;
 * off: byte offset of src within its doubleword.
 */
	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	dd2 = ddsrc[0];
	off = (mlib_addr)src & 7;

	if (!off) {

/*
 * Both vectors are 64-bit aligned.
 */

/*
 * Peeling of 1 iteration.
 */

/* i becomes 1 when len_64 is odd, so the loop below runs in pairs */
		if (i = (len_64 & 1)) {
			dd1 = (*ddsrc++);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
				fzero), four_16_ones);
		}
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			dd1 = (*ddsrc++);
			dd2 = (*ddsrc++);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd2),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd2),
				fzero), four_16_ones);
		}
	} else {

/*
 * Source vector is not 64-bit aligned.
 * Peeling of 1 iteration. Then loop with step==2.
 */

/*
 * vis_faligndata offset 1; the vis_bshuffle mask is the base pattern
 * 0x04152637 with `off` added to every nibble (0x11111111 * off).
 * Here i is the index (relative to ddsrc) of the next doubleword to load;
 * ddsrc itself is not advanced in this branch.
 */
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1); i++;
			dd4 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
		}
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i <= len_64; i += 2) {
			dd0 = dd2;
			dd1 = vis_ld_d64_nf(ddsrc + i);
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_bshuffle(dd0, dd1);
			dd6 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			dd7 = vis_faligndata(dd6, dd6);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd6, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd7, four_16_ones);
		}
	}

/* leftover elements; src and dst still point at the start of the vector part */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = src[even_length + i];

	return (MLIB_SUCCESS);
}
/*
 * Converts n unsigned 8-bit elements of x into 32-bit elements of z.
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * One element is copied with scalar code if z is not 8-byte aligned; then
 * eight elements (one source doubleword, four destination doublewords) are
 * converted per step; whatever is left is copied with scalar code.
 */
mlib_status
__mlib_VectorConvert_S32_U8_Mod(
	mlib_s32 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *in = (mlib_u8 *)x;
	mlib_s32 *out = (mlib_s32 *)z;
	mlib_f32 zero32 = vis_fzero();
/* multiplier for vis_fmul8x16al */
	mlib_f32 scale_0100 = vis_to_float(0x100);
/* 0x0001 in both 16-bit halves: operand for vis_fmuld8ulx16 */
	mlib_f32 unit16 = vis_to_float(0x10001);
	mlib_d64 *in64, *out64;
/* current eight source bytes and the raw doublewords around them */
	mlib_d64 cur, prev, next;
/* 16-bit intermediate and the four result doublewords */
	mlib_d64 w16, r0, r1, r2, r3;
	mlib_s32 k = 0;
/* byte offset of the source within its doubleword */
	mlib_s32 shift;

	if (n <= 0)
		return (MLIB_FAILURE);

/* bring the destination onto an 8-byte boundary */
	if ((mlib_addr)out & 7) {
		*out = *in;
		out++;
		in++;
		k = 1;
	}

	in64 = (mlib_d64 *)vis_alignaddr(in, 0);
	cur = in64[0];

	shift = (mlib_addr)in & 7;

	if (shift) {
/*
 * Unaligned source: vis_bshuffle both extracts and reorders the bytes.
 * Its mask is the base pattern 0x40516273 with `shift` added to every
 * nibble (0x11111111 * shift); vis_faligndata uses offset 7.
 */
		next = cur;
		vis_alignaddr((void *)0, 7);
		vis_write_bmask(0x11111111 * shift, 0x40516273);
#pragma pipeloop(0)
#pragma unroll(2)
		for (; k <= (n - 8); k += 8) {
			prev = next;
			next = vis_ld_d64_nf(in64 + 1);
			cur = vis_bshuffle(prev, next);
			r0 = vis_fmuld8ulx16(vis_read_hi(cur), unit16);
			r1 = vis_fmuld8ulx16(vis_read_lo(cur), unit16);
			cur = vis_faligndata(cur, cur);
			r2 = vis_fmuld8ulx16(vis_read_hi(cur), unit16);
			r3 = vis_fmuld8ulx16(vis_read_lo(cur), unit16);

			out64 = (mlib_d64 *)out;
			out64[0] = r0;
			out64[1] = r1;
			out64[2] = r2;
			out64[3] = r3;
			out += 8;
			in += 8;
			in64++;
		}
	} else {
/* aligned source: groups are taken straight from memory */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; k <= (n - 8); k += 8) {
			w16 = vis_fmul8x16al(vis_read_hi(cur), scale_0100);
			r0 = vis_fpmerge(zero32, vis_read_hi(w16));
			r1 = vis_fpmerge(zero32, vis_read_lo(w16));
			w16 = vis_fpmerge(vis_read_lo(cur),
				vis_read_lo(cur));
			r2 = vis_fmuld8ulx16(vis_read_hi(w16), unit16);
			r3 = vis_fmuld8ulx16(vis_read_lo(w16), unit16);

			cur = vis_ld_d64_nf(in64 + 1);

			out64 = (mlib_d64 *)out;
			out64[0] = r0;
			out64[1] = r1;
			out64[2] = r2;
			out64[3] = r3;
			out += 8;
			in += 8;
			in64++;
		}
	}

/* scalar tail */
	for (; k < n; k++) {
		*out = *in;
		out++;
		in++;
	}

	return (MLIB_SUCCESS);
}
/*
 * Single-index look-up from S16 source pixels into three U16 channel
 * tables.
 *
 *   src    xsize source indices
 *   dst    destination, written through an mlib_d64 pointer while four or
 *          more pixels remain and with scalar stores afterwards; three
 *          samples are produced per source pixel
 *   xsize  number of source pixels
 *   table  three tables, each indexed with a bias of 32768 elements so that
 *          negative source values are valid indices
 *
 * Four pixels (twelve samples, three doublewords) are handled per step.
 * With p0..p3 the four indices, the doublewords hold
 *   acc0: tab0[p0] tab1[p0] tab2[p0] tab0[p1]
 *   acc1: tab1[p1] tab2[p1] tab0[p2] tab1[p2]
 *   acc2: tab2[p2] tab0[p3] tab1[p3] tab2[p3]
 * each built by shifting values in two bytes at a time with vis_faligndata
 * (alignment offset 6).
 */
void mlib_v_ImageLookUpSI_S16_U16_3_D1(const mlib_s16 *src,
                                       mlib_u16       *dst,
                                       mlib_s32       xsize,
                                       const mlib_u16 **table)
{
  mlib_s16 *sp;              /* pointer to source data */
  mlib_u16 *dl;              /* pointer to start of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3;   /* looked-up table entries */
  /*
   * Shift accumulators.  They are passed to vis_faligndata before they are
   * ever assigned, so give them a defined value; the initial contents are
   * shifted out completely by the first four vis_faligndata calls on each.
   */
  mlib_d64 acc0 = 0, acc1 = 0, acc2 = 0;
  mlib_s32 i;                /* loop variable */
  const mlib_u16 *tab0 = &table[0][32768];
  const mlib_u16 *tab1 = &table[1][32768];
  const mlib_u16 *tab2 = &table[2][32768];
  mlib_s32 s00, s01, s02, s03; /* source indices as byte offsets (index * 2) */

  sp   = (void *)src;
  dl   = dst;
  dp   = (mlib_d64 *) dl;

  /* alignment offset 6: each vis_faligndata below shifts in two bytes */
  vis_alignaddr((void *) 0, 6);

  i = 0;

  if (xsize >= 4) {

    /*
     * Multiply instead of shifting: the source values may be negative and
     * left-shifting a negative int is undefined behaviour in C.
     */
    s00 = (sp[0] * 2);
    s01 = (sp[1] * 2);
    s02 = (sp[2] * 2);
    s03 = (sp[3] * 2);
    sp += 4;

    /*
     * The next four indices are pre-fetched inside the loop, hence the
     * bound of xsize - 8.
     */
#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, sp+=4) {
      t3 = VIS_LD_U16_I(tab0, s01);
      t2 = VIS_LD_U16_I(tab2, s00);
      t1 = VIS_LD_U16_I(tab1, s00);
      t0 = VIS_LD_U16_I(tab0, s00);
      acc0 = vis_faligndata(t3, acc0);
      acc0 = vis_faligndata(t2, acc0);
      acc0 = vis_faligndata(t1, acc0);
      acc0 = vis_faligndata(t0, acc0);
      t3 = VIS_LD_U16_I(tab1, s02);
      t2 = VIS_LD_U16_I(tab0, s02);
      t1 = VIS_LD_U16_I(tab2, s01);
      t0 = VIS_LD_U16_I(tab1, s01);
      acc1 = vis_faligndata(t3, acc1);
      acc1 = vis_faligndata(t2, acc1);
      acc1 = vis_faligndata(t1, acc1);
      acc1 = vis_faligndata(t0, acc1);
      t3 = VIS_LD_U16_I(tab2, s03);
      t2 = VIS_LD_U16_I(tab1, s03);
      t1 = VIS_LD_U16_I(tab0, s03);
      t0 = VIS_LD_U16_I(tab2, s02);
      acc2 = vis_faligndata(t3, acc2);
      acc2 = vis_faligndata(t2, acc2);
      acc2 = vis_faligndata(t1, acc2);
      acc2 = vis_faligndata(t0, acc2);
      s00 = (sp[0] * 2);
      s01 = (sp[1] * 2);
      s02 = (sp[2] * 2);
      s03 = (sp[3] * 2);
      *dp++ = acc0;
      *dp++ = acc1;
      *dp++ = acc2;
    }

    /* last pre-fetched group: same assembly without a further source read */
    t3 = VIS_LD_U16_I(tab0, s01);
    t2 = VIS_LD_U16_I(tab2, s00);
    t1 = VIS_LD_U16_I(tab1, s00);
    t0 = VIS_LD_U16_I(tab0, s00);
    acc0 = vis_faligndata(t3, acc0);
    acc0 = vis_faligndata(t2, acc0);
    acc0 = vis_faligndata(t1, acc0);
    acc0 = vis_faligndata(t0, acc0);
    t3 = VIS_LD_U16_I(tab1, s02);
    t2 = VIS_LD_U16_I(tab0, s02);
    t1 = VIS_LD_U16_I(tab2, s01);
    t0 = VIS_LD_U16_I(tab1, s01);
    acc1 = vis_faligndata(t3, acc1);
    acc1 = vis_faligndata(t2, acc1);
    acc1 = vis_faligndata(t1, acc1);
    acc1 = vis_faligndata(t0, acc1);
    t3 = VIS_LD_U16_I(tab2, s03);
    t2 = VIS_LD_U16_I(tab1, s03);
    t1 = VIS_LD_U16_I(tab0, s03);
    t0 = VIS_LD_U16_I(tab2, s02);
    acc2 = vis_faligndata(t3, acc2);
    acc2 = vis_faligndata(t2, acc2);
    acc2 = vis_faligndata(t1, acc2);
    acc2 = vis_faligndata(t0, acc2);
    *dp++ = acc0;
    *dp++ = acc1;
    *dp++ = acc2;
    i += 4;
  }

  dl = (mlib_u16*)dp;

  /* up to three leftover pixels: plain element indexing, no byte offsets */
#pragma pipeloop(0)
  for (; i < xsize; i++) {
    s00 = sp[0];
    dl[0] = tab0[s00];
    dl[1] = tab1[s00];
    dl[2] = tab2[s00];
    dl += 3; sp ++;
  }
}
/*
 * Converts `size` pixels from three separate 8-bit planes (y, u, v) into
 * interleaved 3-byte pixels at `rgb`.
 *
 * The planes are read four pixels at a time through mlib_f32 pointers and
 * the output is written as three mlib_f32 stores (twelve bytes) per group;
 * a final group of fewer than four pixels is written with masked byte
 * stores.  Work is done in chunks of at most 2 * BUFF_SIZE pixels.
 *
 * NOTE(review): this function does not program the GSR itself; the scale
 * used by vis_fpack16, the offset used by vis_faligndata and the mask used
 * by vis_bshuffle are presumably set up by the caller -- confirm.
 * NOTE(review): the word-sized accesses presumably require 4-byte aligned
 * pointers, as the "all_align" suffix suggests -- confirm.
 *
 * The original declared a stack array pbuff_arr2[BUFF_SIZE + 4] and a
 * cursor buff2 that was advanced but never dereferenced; both were dead
 * and have been removed.
 */
void
mlib_v_VideoColorYUV2RGB444_all_align(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 size)
{
/* last output byte of the current chunk */
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;
/* staging area for the masked stores of the final partial group */
	mlib_d64 tmp_arr64[2];
/* per-lane multipliers for the second (k?1) and third (k?2) plane */
	mlib_d64 k01 = vis_to_double_dup(0x0000f375);
	mlib_d64 k02 = vis_to_double_dup(0x3317e5fa);
	mlib_d64 k11 = vis_to_double_dup(0xf3754097);
	mlib_d64 k12 = vis_to_double_dup(0xe5fa0000);
	mlib_d64 k21 = vis_to_double_dup(0x40970000);
	mlib_d64 k22 = vis_to_double_dup(0x00003317);
/* per-lane additive constants */
	mlib_d64 c_0 = vis_to_double_dup(0xe42010f4);
	mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60);
	mlib_d64 c_2 = vis_to_double_dup(0xdd60e420);
/* multiplier for the first plane, identical in every lane */
	mlib_d64 k_0 = vis_to_double_dup(0x25432543);

	do {
/* loop on buffer size */

		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

/* m complete groups of four pixels in this chunk */
		m = n >> 2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)u;
		sf2 = (mlib_f32 *)v;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

/* scale each plane by its three sets of lane multipliers */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

/* sum the three contributions and the constant for each output word */
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

/* pack to bytes */
			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

/*
 * merge buff values to 3-channel array
 */

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_23xx);
			pfd[2] = vis_read_lo(d_0145);

			pfd += 3;
		}

/*
 * Final partial group: compute a whole group as above, stage it in
 * tmp_arr64 and write only the bytes up to dend with masked stores.
 */
		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			emask = vis_edge8(pfd, dend);

/*
 * If the output position is in the upper half of a doubleword, step both
 * the output pointer and the staging pointer by one word so that the
 * staged data lines up with the aligned doubleword being stored.
 */
			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_23xx);
			tmp_arr32[2] = vis_read_lo(d_0145);

			vis_pst_8(tmp_arr64[0], pfd, emask);

			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

/* advance to the next chunk */
		y += n;
		u += n;
		v += n;
		rgb += 3 * n;
		size -= n;
	} while (size);
}
/*
 * Single-input lookup: U16 source samples -> 2-channel U8 destination.
 *
 * Each source sample selects one byte from table[0] and one byte from
 * table[1]; the pair is written side by side, so every sample produces one
 * 16-bit destination element.  Four samples are packed into one 8-byte word
 * which is stored straight through an mlib_d64 pointer derived from dst
 * (this routine performs no realignment of dst).  The zero to three samples
 * left over are packed the same way and written with a masked partial
 * store so that nothing beyond the last destination element is touched.
 *
 *   src   - source samples; xsize of them are read
 *   dst   - destination buffer.  NOTE(review): the "DstA8" in the name
 *           suggests the caller guarantees 8-byte alignment - confirm
 *   xsize - number of source samples
 *   table - table[0] and table[1] are the per-channel byte tables
 *
 * The packing word is deliberately not cleared: every vis_faligndata()
 * shifts one new byte in at the top, and only bytes that have been shifted
 * in are ever stored.
 */
void mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1(const mlib_u16 *src,
                                           mlib_u8        *dst,
                                           mlib_s32       xsize,
                                           const mlib_u8  **table)
{
  const mlib_u8 *lut0 = &table[0][0];          /* channel 0 table */
  const mlib_u8 *lut1 = &table[1][0];          /* channel 1 table */
  mlib_u16 *in = (void *)src;                  /* source read cursor */
  mlib_u16 *out16 = (mlib_u16 *) dst;          /* destination as 16-bit pairs */
  mlib_u16 *last16 = out16 + xsize - 1;        /* last destination pair */
  mlib_d64 *out64 = (mlib_d64 *) out16;        /* 8-byte store cursor */
  mlib_s32 p0, p1, p2, p3;                     /* four pending samples */
  mlib_d64 b0, b1, b2, b3, b4, b5, b6, b7;     /* looked-up bytes */
  mlib_d64 word;                               /* packing register */
  mlib_s32 edge;                               /* partial-store mask */
  mlib_s32 k, left;

  /* alignment offset 7 for the vis_faligndata() calls below */
  vis_alignaddr((void *)0, 7);

  if (xsize >= 4) {

    /* prime the pipeline with the first group of four samples */
    p0 = in[0];
    p1 = in[1];
    p2 = in[2];
    p3 = in[3];
    in += 4;

#pragma pipeloop(0)
    for (k = 0; k <= xsize - 8; k += 4, in += 4) {
      b7 = VIS_LD_U8_I(lut1, p3);
      b6 = VIS_LD_U8_I(lut0, p3);
      b5 = VIS_LD_U8_I(lut1, p2);
      b4 = VIS_LD_U8_I(lut0, p2);
      b3 = VIS_LD_U8_I(lut1, p1);
      b2 = VIS_LD_U8_I(lut0, p1);
      b1 = VIS_LD_U8_I(lut1, p0);
      b0 = VIS_LD_U8_I(lut0, p0);
      /* shift in from the last byte to the first */
      word = vis_faligndata(b7, word);
      word = vis_faligndata(b6, word);
      word = vis_faligndata(b5, word);
      word = vis_faligndata(b4, word);
      word = vis_faligndata(b3, word);
      word = vis_faligndata(b2, word);
      word = vis_faligndata(b1, word);
      word = vis_faligndata(b0, word);
      /* fetch the next group while the current one is stored */
      p0 = in[0];
      p1 = in[1];
      p2 = in[2];
      p3 = in[3];
      *out64++ = word;
    }

    /* drain: the last group that was loaded but not yet emitted */
    b7 = VIS_LD_U8_I(lut1, p3);
    b6 = VIS_LD_U8_I(lut0, p3);
    b5 = VIS_LD_U8_I(lut1, p2);
    b4 = VIS_LD_U8_I(lut0, p2);
    b3 = VIS_LD_U8_I(lut1, p1);
    b2 = VIS_LD_U8_I(lut0, p1);
    b1 = VIS_LD_U8_I(lut1, p0);
    b0 = VIS_LD_U8_I(lut0, p0);
    word = vis_faligndata(b7, word);
    word = vis_faligndata(b6, word);
    word = vis_faligndata(b5, word);
    word = vis_faligndata(b4, word);
    word = vis_faligndata(b3, word);
    word = vis_faligndata(b2, word);
    word = vis_faligndata(b1, word);
    word = vis_faligndata(b0, word);
    *out64++ = word;
  }

  /* nothing left once the store cursor has passed the last element */
  if ((mlib_addr) out64 > (mlib_addr) last16)
    return;

  /* walk the remaining samples backwards so the first ends up on top */
  left = (mlib_u16 *) last16 - (mlib_u16 *) out64;
  in += left;
  left++;
#pragma pipeloop(0)
  for (k = 0; k < left; k++) {
    p0 = (mlib_s32) * in;
    in--;

    b0 = VIS_LD_U8_I(lut1, p0);
    word = vis_faligndata(b0, word);

    b0 = VIS_LD_U8_I(lut0, p0);
    word = vis_faligndata(b0, word);
  }

  edge = vis_edge16(out64, last16);
  vis_pst_16(word, out64, edge);
}
/*
 * Single-input lookup: S16 source samples -> 2-channel U16 destination.
 *
 * Each source sample selects one 16-bit entry from table[0] and one from
 * table[1]; the two entries are written side by side.  Two samples (four
 * entries) fill one 8-byte word, which is stored directly through an
 * mlib_d64 pointer derived from dst; a final odd sample is written as a
 * single 4-byte store.
 *
 *   src   - source samples; xsize of them are read
 *   dst   - destination buffer.  NOTE(review): the "DstA8" in the name
 *           suggests the caller guarantees 8-byte alignment - confirm
 *   xsize - number of source samples
 *   table - table[0] / table[1] are the per-channel tables; they are
 *           addressed from element 32768 so that negative samples index
 *           backwards from that point
 *
 * The table offsets are byte offsets (sample * 2).  They are formed by
 * multiplication rather than by "<< 1": the samples are signed and may be
 * negative, and left-shifting a negative value is undefined behaviour in C.
 * The product always fits in mlib_s32.
 *
 * acc is deliberately not cleared: each vis_faligndata() shifts a new
 * 16-bit entry in at the top, and only shifted-in data is stored.
 */
void mlib_v_ImageLookUpSI_S16_U16_2_DstA8D1(const mlib_s16 *src,
                                            mlib_u16       *dst,
                                            mlib_s32       xsize,
                                            const mlib_u16 **table)
{
  mlib_s16 *sp;              /* pointer to source data */
  mlib_s32 s0, s1;           /* table byte offsets of two samples */
  mlib_u16 *dl;              /* pointer to start of destination */
  mlib_d64 *dp;              /* 8-byte store pointer into destination */
  mlib_d64 t0, t1, t2;       /* looked-up entries */
  mlib_d64 t3, acc;          /* looked-up entry, packing register */
  mlib_s32 i;                /* loop variable */
  const mlib_u16 *tab0 = &table[0][32768];
  const mlib_u16 *tab1 = &table[1][32768];

  sp   = (void *)src;
  dl   = dst;
  dp   = (mlib_d64 *) dl;

  /* alignment offset 6 for the vis_faligndata() calls below */
  vis_alignaddr((void *) 0, 6);

  if (xsize >= 2) {

    /* prime the pipeline with the first pair of samples */
    s0 = (sp[0] * 2);
    s1 = (sp[1] * 2);
    sp += 2;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 4; i+=2, sp+=2) {
      t3 = VIS_LD_U16_I(tab1, s1);
      t2 = VIS_LD_U16_I(tab0, s1);
      t1 = VIS_LD_U16_I(tab1, s0);
      t0 = VIS_LD_U16_I(tab0, s0);
      /* shift in from the last entry to the first */
      acc = vis_faligndata(t3, acc);
      acc = vis_faligndata(t2, acc);
      acc = vis_faligndata(t1, acc);
      acc = vis_faligndata(t0, acc);
      /* fetch the next pair while the current word is stored */
      s0 = (sp[0] * 2);
      s1 = (sp[1] * 2);
      *dp++ = acc;
    }

    /* drain: the last pair that was loaded but not yet emitted */
    t3 = VIS_LD_U16_I(tab1, s1);
    t2 = VIS_LD_U16_I(tab0, s1);
    t1 = VIS_LD_U16_I(tab1, s0);
    t0 = VIS_LD_U16_I(tab0, s0);
    acc = vis_faligndata(t3, acc);
    acc = vis_faligndata(t2, acc);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    *dp++ = acc;
  }

  /* odd sample count: one more sample, emitted as a 4-byte store */
  if ((xsize & 1) != 0) {
    s0 = (sp[0] * 2);
    t1 = VIS_LD_U16_I(tab1, s0);
    t0 = VIS_LD_U16_I(tab0, s0);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    *(mlib_f32*)dp = vis_read_hi(acc);
  }
}
/*
 * Single-input lookup: S32 source samples -> 2-channel S16 destination,
 * for a destination that starts one 16-bit element before the position
 * used for 8-byte stores.
 *
 * The channel-0 entry of the first sample is written on its own; from
 * there on every 8-byte word holds
 *     table[1][prev], table[0][cur], table[1][cur], table[0][next]
 * so the channel-1 entry of each word's last sample is carried over into
 * the following word.  The channel-1 entry of the very last sample is
 * written on its own at the end.
 *
 *   src   - source samples; xsize of them are read
 *   dst   - destination; dst + 1 is used as the 8-byte store pointer.
 *           NOTE(review): presumably the caller guarantees that address
 *           is 8-byte aligned - confirm
 *   xsize - number of source samples (decremented locally once the first
 *           sample has been consumed)
 *   table - table[0] / table[1], addressed from element 2147483648 so
 *           that signed samples can be used as offsets
 *
 * The packing word is deliberately not cleared: each vis_faligndata()
 * shifts a new 16-bit entry in at the top and only shifted-in data is
 * stored.
 */
void
mlib_v_ImageLookUpSI_S32_S16_2_D1(
    const mlib_s32 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 **table)
{
    /* per-channel tables, biased for signed indexing */
    const mlib_s16 *lut0 = &table[0][(mlib_u32)2147483648u];
    const mlib_s16 *lut1 = &table[1][(mlib_u32)2147483648u];

    /* source read cursor */
    mlib_s32 *in = (void *)src;

    /* element-wise and 8-byte destination cursors */
    mlib_s16 *out = dst;
    mlib_d64 *out64;

    /* sample carried over, and the two samples of the current word */
    mlib_s32 prev, cur, next;

    /* looked-up entries and the packing register */
    mlib_d64 e0, e1, e2, e3, word;

    mlib_s32 k;

    /* alignment offset 6 for the vis_faligndata() calls below */
    vis_alignaddr((void *)0, 6);

    /* first sample: emit its channel-0 entry as a lone element */
    prev = (*in++);
    (*out++) = lut0[prev];
    out64 = (mlib_d64 *)out;
    xsize--;

    if (xsize >= 2) {

        /* prime the pipeline with the first pair */
        cur = in[0];
        next = in[1];
        in += 2;

#pragma pipeloop(0)
        for (k = 0; k <= xsize - 4; k += 2, in += 2) {
            e3 = VIS_LD_U16_I(lut0, ((mlib_addr)2 * next));
            e2 = VIS_LD_U16_I(lut1, ((mlib_addr)2 * cur));
            e1 = VIS_LD_U16_I(lut0, ((mlib_addr)2 * cur));
            e0 = VIS_LD_U16_I(lut1, ((mlib_addr)2 * prev));
            /* shift in from the last entry to the first */
            word = vis_faligndata(e3, word);
            word = vis_faligndata(e2, word);
            word = vis_faligndata(e1, word);
            word = vis_faligndata(e0, word);
            /* carry the last sample over and fetch the next pair */
            prev = next;
            cur = in[0];
            next = in[1];
            (*out64++) = word;
        }

        /* drain: the pair that was loaded but not yet emitted */
        e3 = VIS_LD_U16_I(lut0, ((mlib_addr)2 * next));
        e2 = VIS_LD_U16_I(lut1, ((mlib_addr)2 * cur));
        e1 = VIS_LD_U16_I(lut0, ((mlib_addr)2 * cur));
        e0 = VIS_LD_U16_I(lut1, ((mlib_addr)2 * prev));
        word = vis_faligndata(e3, word);
        word = vis_faligndata(e2, word);
        word = vis_faligndata(e1, word);
        word = vis_faligndata(e0, word);
        prev = next;
        (*out64++) = word;
    }

    out = (mlib_s16 *)out64;

    /* one sample left over: emit two entries as a 4-byte store */
    if ((xsize & 1) != 0) {
        cur = in[0];
        e1 = VIS_LD_U16_I(lut0, ((mlib_addr)2 * cur));
        e0 = VIS_LD_U16_I(lut1, ((mlib_addr)2 * prev));
        word = vis_faligndata(e1, word);
        word = vis_faligndata(e0, word);
        *(mlib_f32 *)out64 = vis_read_hi(word);
        prev = cur;
        out += 2;
    }

    /* channel-1 entry of the final sample closes the line */
    *out = lut1[prev];
}
/*
 * Single-input lookup: U16 source samples -> 4-channel U8 destination,
 * with the output stream starting at channel 3.
 *
 * Every 8-byte word written through the mlib_d64 pointer holds
 *     table[3][prev],
 *     table[0][cur], table[1][cur], table[2][cur], table[3][cur],
 *     table[0][next], table[1][next], table[2][next]
 * i.e. the channel-3 byte of the word's last sample is carried over into
 * the following word.  An odd remaining sample is emitted as a 4-byte
 * store, and the channel-3 byte of the last sample read is written as a
 * single trailing byte.
 *
 *   src   - source samples; xsize + 1 of them are read
 *   dst   - destination, stored to directly through an mlib_d64 pointer.
 *           NOTE(review): presumably the caller guarantees 8-byte
 *           alignment - confirm
 *   xsize - number of samples handled after the first (carried) one
 *   table - table[0..3] are the per-channel byte tables
 *
 * The packing word is deliberately not cleared: each vis_faligndata()
 * shifts one byte in at the top and only shifted-in bytes are stored.
 */
void mlib_v_ImageLookUpSI_U16_U8_4_DstOff3_D1(const mlib_u16 *src,
                                              mlib_u8        *dst,
                                              mlib_s32       xsize,
                                              const mlib_u8  **table)
{
  const mlib_u8 *lut0 = &table[0][0];          /* channel 0 table */
  const mlib_u8 *lut1 = &table[1][0];          /* channel 1 table */
  const mlib_u8 *lut2 = &table[2][0];          /* channel 2 table */
  const mlib_u8 *lut3 = &table[3][0];          /* channel 3 table */
  mlib_u16 *in = (void *)src;                  /* source read cursor */
  mlib_u8 *out8 = dst;                         /* byte-wise destination cursor */
  mlib_d64 *out64 = (mlib_d64 *) out8;         /* 8-byte store cursor */
  mlib_s32 prev, cur, next;                    /* carried sample + current pair */
  mlib_d64 b0, b1, b2, b3, b4, b5, b6, b7;     /* looked-up bytes */
  mlib_d64 word;                               /* packing register */
  mlib_s32 k;

  /* alignment offset 7 for the vis_faligndata() calls below */
  vis_alignaddr((void *)0, 7);

  /* the sample whose channel-3 byte opens the stream */
  prev = *in++;

  if (xsize >= 2) {

    /* prime the pipeline with the first pair */
    cur = in[0];
    next = in[1];
    in += 2;

#pragma pipeloop(0)
    for (k = 0; k <= xsize - 4; k += 2, in += 2) {
      b7 = VIS_LD_U8_I(lut2, next);
      b6 = VIS_LD_U8_I(lut1, next);
      b5 = VIS_LD_U8_I(lut0, next);
      b4 = VIS_LD_U8_I(lut3, cur);
      b3 = VIS_LD_U8_I(lut2, cur);
      b2 = VIS_LD_U8_I(lut1, cur);
      b1 = VIS_LD_U8_I(lut0, cur);
      b0 = VIS_LD_U8_I(lut3, prev);
      /* shift in from the last byte to the first */
      word = vis_faligndata(b7, word);
      word = vis_faligndata(b6, word);
      word = vis_faligndata(b5, word);
      word = vis_faligndata(b4, word);
      word = vis_faligndata(b3, word);
      word = vis_faligndata(b2, word);
      word = vis_faligndata(b1, word);
      word = vis_faligndata(b0, word);
      /* carry the last sample over and fetch the next pair */
      prev = next;
      cur = in[0];
      next = in[1];
      *out64++ = word;
    }

    /* drain: the pair that was loaded but not yet emitted */
    b7 = VIS_LD_U8_I(lut2, next);
    b6 = VIS_LD_U8_I(lut1, next);
    b5 = VIS_LD_U8_I(lut0, next);
    b4 = VIS_LD_U8_I(lut3, cur);
    b3 = VIS_LD_U8_I(lut2, cur);
    b2 = VIS_LD_U8_I(lut1, cur);
    b1 = VIS_LD_U8_I(lut0, cur);
    b0 = VIS_LD_U8_I(lut3, prev);
    word = vis_faligndata(b7, word);
    word = vis_faligndata(b6, word);
    word = vis_faligndata(b5, word);
    word = vis_faligndata(b4, word);
    word = vis_faligndata(b3, word);
    word = vis_faligndata(b2, word);
    word = vis_faligndata(b1, word);
    word = vis_faligndata(b0, word);
    prev = next;
    *out64++ = word;
  }

  out8 = (mlib_u8 *) out64;

  /* one sample left over: emit four bytes as a single 4-byte store */
  if ((xsize & 1) != 0) {
    cur = in[0];
    b7 = VIS_LD_U8_I(lut2, cur);
    b6 = VIS_LD_U8_I(lut1, cur);
    b5 = VIS_LD_U8_I(lut0, cur);
    b4 = VIS_LD_U8_I(lut3, prev);
    word = vis_faligndata(b7, word);
    word = vis_faligndata(b6, word);
    word = vis_faligndata(b5, word);
    word = vis_faligndata(b4, word);
    *(mlib_f32 *) out8 = vis_read_hi(word);
    out8 += 4;
    prev = cur;
  }

  /* channel-3 byte of the last sample closes the line */
  out8[0] = lut3[prev];
}
/*
 * Reversed complex conjugate of a 16-bit complex vector, with saturation.
 *
 * x holds n complex numbers as (re, im) pairs of mlib_s16.  The result is
 * written to z from its end towards its start: for every source pair the
 * negated imaginary part and then the real part are stored at decreasing
 * addresses, so z receives the conjugated elements in reverse element
 * order.  An imaginary part equal to MLIB_S16_MIN is stored as
 * MLIB_S16_MAX instead of being negated.
 *
 *   zz - destination, 2 * n mlib_s16 values
 *   xx - source, 2 * n mlib_s16 values
 *   n  - number of complex elements
 *
 * Returns MLIB_SUCCESS on the paths visible here.  CHECK and CONJREVC are
 * macros defined elsewhere in the file; presumably they validate the
 * arguments and handle (and return from) the short-vector case n < 16 -
 * verify against their definitions.
 *
 * CONJ16 is an argument-less macro defined elsewhere: it is used here as
 * "consume d3, produce d4", and presumably also relies on dl, dh, dlog0,
 * dtwo and f_two by name, which is why those locals must keep their names.
 */
mlib_status
__mlib_VectorConjRev_S16C_S16C_Sat(
	mlib_s16 *zz,
	const mlib_s16 *xx,
	mlib_s32 n)
{
	mlib_s16 *x = (mlib_s16 *)xx, *z = (mlib_s16 *)zz;
	mlib_s16 *src = (mlib_s16 *)x, *dst = (mlib_s16 *)&z[2 * n];
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
	mlib_d64 dlog0 = vis_to_double_dup(0x0000ffff), dtwo =
		vis_to_double(0, 2);
	mlib_f32 f_two = vis_to_float(0x20002);
	mlib_s16 c;
	mlib_s32 i, rest_64, len_64, even_length, odd = 0, length =
		(mlib_s32)n * 2;

/* real part left pending by the aligning loop; only meaningful if odd */
	mlib_s32 re_part = 0;

	CHECK(x, z);

	if ((n < 16)) {
		CONJREVC(mlib_s16,
			MLIB_S16_MAX,
			MLIB_S16_MIN);
	}

/*
 * Scalar loop: move dst down to an 8-byte boundary.  If the boundary is
 * reached between the imaginary and the real part of an element, the real
 * part is kept in re_part and the vector code runs in "odd" mode.
 */

	while (((mlib_addr)dst) & 7) {

		if ((c = src[1]) == MLIB_S16_MIN)
			*--dst = MLIB_S16_MAX;
		else
			*--dst = -c;
		length -= 2;
		src += 2;

		if (((mlib_addr)dst) & 7) {
			*--dst = src[-2];
		} else {
			re_part = src[-2];
			odd = 1;
			break;
		}
	}

	vis_write_gsr(15 << 3);
	ddst = (mlib_d64 *)dst;

/* four mlib_s16 values per 8-byte word */
	rest_64 = length & 3;
	len_64 = length >> 2;
	even_length = len_64 << 2;

	if (!odd) {

/*
 * Aligning loop finished with imaginary part. The following processing
 * starts with real part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

			dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ16;
				*--ddst = d4;
			}
		} else {

/*
 * Src address is not 8-byte aligned: assemble each source word from two
 * aligned loads.
 */

			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ16;
				*--ddst = d4;
			}
		}
	} else {

/*
 * Aligning loop finished with real part. The following processing
 * starts with imaginary part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

			dsrc = (mlib_d64 *)vis_alignaddr(src, 2);

/*
 * Place the pending real part in the top 16 bits of d_rest.  A multiply
 * is used instead of "<< 16" because re_part may be negative and
 * left-shifting a negative value is undefined in C; the product of an
 * mlib_s16 value and 65536 always fits in mlib_s32.
 */
			d_rest = vis_to_double((re_part * 65536), 0);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ16;
				*--ddst = vis_faligndata(d4, d_rest);
				d_rest = d4;
			}

/* flush the one element still held in d_rest with a masked store */
			ddst--;
			d_rest = vis_faligndata(d_rest, d_rest);
			vis_pst_16(d_rest, ddst, 0x1);
		} else {

			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ16;
				*--ddst = d4;
			}

/*
 * Second pass over the words just written, with the alignment offset set
 * to 2: each word is re-stored combined with its successor.
 * NOTE(review): d1 is only assigned inside the loop above, so this relies
 * on len_64 >= 1 - presumably guaranteed by the n < 16 short path.
 */
			vis_write_gsr(2);
			d2 = *ddst;
			d3 = vis_faligndata(d1, d2);
			vis_pst_16(d3, (ddst - 1), 0x1);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = *(ddst + 1);
				(*ddst++) = vis_faligndata(d1, d2);
			}

			dst[-1] = re_part;
		}

/* account for the pending real part in the scalar tail indexing */
		dst--;
	}

	if (!rest_64)
		return (MLIB_SUCCESS);

/* scalar tail: elements that did not fill a whole 8-byte word */
	for (i = 0; i < rest_64; i += 2) {
		dst[-even_length - 2 - i] = src[even_length + i];

		if ((c = src[even_length + i + 1]) == MLIB_S16_MIN)
			dst[-even_length - 2 - i + 1] = MLIB_S16_MAX;
		else
			dst[-even_length - 2 - i + 1] = -c;
	}

	return (MLIB_SUCCESS);
}
/*
 * Reversed complex conjugate of an 8-bit complex vector, with saturation.
 *
 * x holds n complex numbers as (re, im) pairs of mlib_s8.  The result is
 * written to z from its end towards its start: for every source pair the
 * negated imaginary part and then the real part are stored at decreasing
 * addresses, so z receives the conjugated elements in reverse element
 * order.  An imaginary part equal to MLIB_S8_MIN is stored as MLIB_S8_MAX
 * instead of being negated.
 *
 *   zz - destination, 2 * n mlib_s8 values
 *   xx - source, 2 * n mlib_s8 values
 *   n  - number of complex elements
 *
 * Returns MLIB_SUCCESS on the paths visible here.  CHECK and CONJREVC are
 * macros defined elsewhere in the file; presumably they validate the
 * arguments and handle (and return from) the short-vector case n < 8 -
 * verify against their definitions.
 *
 * CONJ8 is an argument-less macro defined elsewhere: it is used here as
 * "consume d3, produce d4", and presumably also relies on dl, dh, dcntr0,
 * dxor0, done and f_null by name, which is why those locals must keep
 * their names.
 */
mlib_status
__mlib_VectorConjRev_S8C_S8C_Sat(
	mlib_s8 *zz,
	const mlib_s8 *xx,
	mlib_s32 n)
{
	const mlib_s8 *x = xx;
	mlib_s8 *z = zz;
	mlib_s8 *src = (mlib_s8 *)x, *dst = z + 2 * (n);
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
	mlib_d64 dcntr0 = vis_to_double_dup(0x00800080);
	mlib_d64 dxor0 = vis_to_double_dup(0x007f007f);
	mlib_d64 done = vis_to_double_dup(1);
	mlib_s8 c;
	mlib_s32 i, rest_64, len_64, even_length, odd = 0, length =
		(mlib_s32)n * 2;

/* real part left pending by the aligning loop; only meaningful if odd */
	mlib_s32 re_part = 0;
	mlib_f32 f_null = vis_to_float(0);

	CHECK(x, z);

	if (n < 8) {
		CONJREVC(mlib_s8,
			MLIB_S8_MAX,
			MLIB_S8_MIN);
	}

/*
 * Scalar loop: move dst down to an 8-byte boundary.  If the boundary is
 * reached between the imaginary and the real part of an element, the real
 * part is kept in re_part and the vector code runs in "odd" mode.
 */

	while (((mlib_addr)dst) & 7) {

		if ((c = src[1]) == MLIB_S8_MIN)
			*--dst = MLIB_S8_MAX;
		else
			*--dst = -c;
		length -= 2;
		src += 2;

		if (((mlib_addr)dst) & 7) {
			*--dst = src[-2];
		} else {
			re_part = src[-2];
			odd = 1;
			break;
		}
	}

	vis_write_gsr(7 << 3);
	ddst = (mlib_d64 *)dst;

/* eight mlib_s8 values per 8-byte word */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;

	if (!odd) {

/*
 * Aligning loop finished with imaginary part. The following processing
 * starts with real part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

			dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ8;
				*--ddst = d4;
			}
		} else {

/*
 * Src address is not 8-byte aligned: assemble each source word from two
 * aligned loads.
 */

			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ8;
				*--ddst = d4;
			}
		}
	} else {

/*
 * Aligning loop finished with real part. The following processing
 * starts with imaginary part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

			dsrc = (mlib_d64 *)vis_alignaddr(src, 1);

/*
 * Place the pending real part in the top 8 bits of d_rest.  A multiply is
 * used instead of "<< 24" because re_part may be negative and
 * left-shifting a negative value is undefined in C; the product of an
 * mlib_s8 value and 16777216 always fits in mlib_s32.
 */
			d_rest = vis_to_double((re_part * 16777216), 0);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ8;
				*--ddst = vis_faligndata(d4, d_rest);
				d_rest = d4;
			}

/* flush the one element still held in d_rest with a masked store */
			ddst--;
			d_rest = vis_faligndata(d_rest, d_rest);
			vis_pst_8(d_rest, ddst, 0x1);
		} else {

			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ8;
				*--ddst = d4;
			}

/*
 * Second pass over the words just written, with the alignment offset set
 * to 1: each word is re-stored combined with its successor.
 * NOTE(review): d1 is only assigned inside the loop above, so this relies
 * on len_64 >= 1 - presumably guaranteed by the n < 8 short path.
 */
			vis_write_gsr(1);
			d2 = *ddst;
			d3 = vis_faligndata(d1, d2);
			vis_pst_8(d3, (ddst - 1), 0x1);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = *(ddst + 1);
				(*ddst++) = vis_faligndata(d1, d2);
			}

			dst[-1] = re_part;
		}

/* account for the pending real part in the scalar tail indexing */
		dst--;
	}

	if (!rest_64)
		return (MLIB_SUCCESS);

/* scalar tail: elements that did not fill a whole 8-byte word */
	for (i = 0; i < rest_64; i += 2) {
		dst[-even_length - 2 - i] = src[even_length + i];

		if ((c = src[even_length + i + 1]) == MLIB_S8_MIN)
			dst[-even_length - 2 - i + 1] = MLIB_S8_MAX;
		else
			dst[-even_length - 2 - i + 1] = -c;
	}

	return (MLIB_SUCCESS);
}
/*
 * Weighted 3-channel -> 1-channel conversion of one line of U8 pixels.
 *
 * Each step consumes 24 source bytes (three 8-byte words, i.e. eight
 * 3-byte pixels) and produces 8 destination bytes.  The destination is
 * processed in 8-byte aligned words: the first and last words are written
 * with an edge mask so that only bytes inside [dst, dst + dsize - 1] are
 * modified; the words in between are stored whole.
 *
 *   src    - source pixels, 3 bytes per pixel, any alignment
 *   dst    - destination, dsize bytes, any alignment
 *   dsize  - number of destination bytes (pixels)
 *   weight - three channel weights; each is multiplied by 8192 before
 *            being handed to vis_to_float()
 *
 * The per-pixel arithmetic is done by the CHANNELSEPARATE_U8 and
 * CHANNELWEIGHT_U8 macros, which are defined elsewhere.  Several locals
 * (alpha, beta, gamma, d32, e32, ddt, mask0..mask3) are not referenced
 * directly in this body; presumably the macros use them by name - verify
 * before renaming or removing any of them.
 */
void
mlib_v_ImageColorRGB2Mono_U8_D1(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 dsize,
    const mlib_d64 *weight)
{
    /* address of the last destination byte */
    mlib_u8 *dst_end;
    mlib_d64 dd, d0, d1, d2, d3;
    mlib_d64 rgdd0, bdd0, rgdd1, bdd1, ddt;
    /* aligned source read pointer and aligned destination write pointer */
    mlib_d64 *src_all, *dp;
    mlib_f32 d32, e32, alpha, gamma, beta;
    /* three realigned source words = eight 3-byte pixels */
    mlib_d64 sd0, sd1, sd2;
    mlib_s32 i, emask;
    /* dp - dst in bytes: zero or negative */
    mlib_s32 off;
    /* NOTE(review): look like byte-shuffle masks for the macros - confirm */
    mlib_s32 mask0 = 0x0369147a;
    mlib_s32 mask1 = 0x258b258b;
    mlib_s32 mask2 = 0x47ad58be;
    mlib_s32 mask3 = 0x69cf69cf;

    /* prepare the weight */
    alpha = vis_to_float(weight[0] * 8192);
    beta = vis_to_float(weight[1] * 8192);
    gamma = vis_to_float(weight[2] * 8192);
    vis_write_gsr(2 << 3);

    /* round dst down to an 8-byte boundary */
    dp = (mlib_d64 *)((mlib_addr)dst & (~7));
    off = (mlib_addr)dp - (mlib_addr)dst;

    dst_end = dst + (dsize - 1);
    emask = vis_edge8(dst, dst_end);
    /* step the source back by 3 bytes for every destination byte of skew */
    src_all = vis_alignaddr((void *)src, (3 * off));

    /* four aligned loads are needed to assemble the first three words */
    d0 = (*src_all++);
    d1 = (*src_all++);
    d2 = (*src_all++);
    d3 = (*src_all++);

    sd0 = vis_faligndata(d0, d1);
    sd1 = vis_faligndata(d1, d2);
    sd2 = vis_faligndata(d2, d3);

    /* first destination word: masked store protects bytes before dst */
    CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
    CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
    vis_pst_8(dd, dp, emask);
    dp++;

    /* full destination words; i counts destination bytes relative to dst */
#pragma pipeloop(0)
    for (i = 8 + off; i <= (dsize - 8); i += 8) {
        /* the last word loaded is the first word of the next group */
        d0 = d3;
        d1 = (*src_all++);
        d2 = (*src_all++);
        d3 = (*src_all++);

        sd0 = vis_faligndata(d0, d1);
        sd1 = vis_faligndata(d1, d2);
        sd2 = vis_faligndata(d2, d3);

        CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
        CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
        (*dp++) = dd;
    }

    /* last, partial destination word: masked store protects bytes past the end */
    if ((mlib_addr)dp <= (mlib_addr)dst_end) {

        emask = vis_edge8(dp, dst_end);
        d0 = d3;
        d1 = (*src_all++);
        d2 = (*src_all++);
        d3 = (*src_all++);
        sd0 = vis_faligndata(d0, d1);
        sd1 = vis_faligndata(d1, d2);
        sd2 = vis_faligndata(d2, d3);

        CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
        CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);
        vis_pst_8(dd, dp, emask);
    }
}
/*
 * YUV 4:4:4 to interleaved 3-byte-per-pixel conversion for arbitrarily
 * aligned planes.
 *
 * For each of the height rows, the y, u and v planes are read eight
 * samples at a time (one 8-byte word from each plane per step) through
 * realigning loads, and 24 output bytes per step are written into a
 * scratch row buffer.  The 0..7 pixels left over at the end of a row are
 * written one at a time by the STORE_PIXEL macro (defined elsewhere; it
 * presumably uses dp, red, green and blue by name - verify).  The finished
 * row is then copied to rgb with __mlib_VectorCopy_U8.
 *
 *   rgb        - destination, width * 3 bytes are written per row
 *   y, u, v    - source planes, one sample per pixel each
 *   width      - pixels per row
 *   height     - number of rows
 *   rgb_stride - byte distance between destination rows
 *   yuv_stride - byte distance between rows, applied to y, u and v alike
 *
 * Returns MLIB_FAILURE if the scratch row cannot be allocated, otherwise
 * MLIB_SUCCESS.
 *
 * The code is software-pipelined: the colour values for the first group
 * of eight pixels are computed before the main loop, and every loop
 * iteration stores the group computed previously while computing the
 * next one.  The order of the vis_alignaddr / vis_write_bmask calls
 * relative to the vis_faligndata / vis_bshuffle calls matters.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB444_nonalign(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 rgb_stride,
	mlib_s32 yuv_stride)
{
/* aligned read pointers into y, u, v */
	mlib_d64 *spy, *dfu, *dfv;

/* y data */
	mlib_d64 dy0, dy1, dy3;
/* u and v data: realigned word, previous and next aligned words */
	mlib_d64 du, dv, du0, du1, dv0, dv1;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
/* constant offsets added to or subtracted from the channel sums below */
	mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2;

/* loop variable */
	mlib_s32 i, j;
/* scratch row: on-stack BUFF, or heap memory for wide rows */
	mlib_d64 *buf, BUFF[16 * 1024];
	mlib_u8 *tmp, *dp;

/* rows wider than 16 * 1024 bytes use a heap buffer, aligned up to 8 */
	if (width * 3 > 16 * 1024) {
		tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

		if (tmp == NULL)
			return (MLIB_FAILURE);
		buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
	} else {
		buf = (mlib_d64 *)BUFF;
	}

	dp = (mlib_u8 *)buf;
	ddp = (mlib_d64 *)dp;

	for (j = 0; j < height; j++) {

/* prologue: load and realign the first eight u samples */
		dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0);
		du0 = (*dfu++);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* ... and the first eight v samples */
		dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0);
		dv0 = (*dfv++);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

/* ... and the first eight y samples */
		spy = (mlib_d64 *)vis_alignaddr((void *)y, 0);
		dy0 = (*spy++);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);

/* restore the u alignment offset and fetch the second u group */
		vis_alignaddr((void *)u, 0);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* Y*1.1644 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* restore the v alignment offset and fetch the second v group */
		vis_alignaddr((void *)v, 0);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* Y*1.1644 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		g_hi = vis_fpadd16(g_hi, y_11644_hi);

		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
		g_lo = vis_fpadd16(g_lo, y_11644_lo);

/* pack the first group of eight pixels into byte planes */
		green = vis_fpack16_pair(g_hi, g_lo);
		b_hi = vis_fpadd16(b_hi, y_11644_hi);
		b_lo = vis_fpadd16(b_lo, y_11644_lo);

		blue = vis_fpack16_pair(b_hi, b_lo);
		r_hi = vis_fpadd16(r_hi, y_11644_hi);
		r_lo = vis_fpadd16(r_lo, y_11644_lo);

		red = vis_fpack16_pair(r_hi, r_lo);

/* restore the y alignment offset and fetch the second y group */
		vis_alignaddr((void *)y, 0);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/* main loop: store the previous group, compute the next one */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/* interleave red/green, then merge blue in: three 8-byte output words */
			vis_write_bmask(0x0801902A, 0);
			dd0 = vis_bshuffle(red, green);
			vis_write_bmask(0x03B04C05, 0);
			dd1 = vis_bshuffle(red, green);
			vis_write_bmask(0xD06E07F0, 0);
			dd2 = vis_bshuffle(red, green);
			vis_write_bmask(0x01834967, 0);
			ddp[0] = vis_bshuffle(dd0, blue);
			vis_write_bmask(0xA12B45C7, 0);
			ddp[1] = vis_bshuffle(dd1, blue);
			vis_write_bmask(0x0D23E56F, 0);
			ddp[2] = vis_bshuffle(dd2, blue);

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);
			vis_alignaddr((void *)u, 0);
			du1 = vis_ld_d64_nf(dfu); dfu++;
			du = vis_faligndata(du0, du1);
			du0 = du1;

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
			vis_alignaddr((void *)v, 0);
			dv1 = vis_ld_d64_nf(dfv); dfv++;
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			g_hi = vis_fpadd16(g_hi, y_11644_hi);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			g_lo = vis_fpadd16(g_lo, y_11644_lo);

			green = vis_fpack16_pair(g_hi, g_lo);
			b_hi = vis_fpadd16(b_hi, y_11644_hi);
			b_lo = vis_fpadd16(b_lo, y_11644_lo);

			blue = vis_fpack16_pair(b_hi, b_lo);
			r_hi = vis_fpadd16(r_hi, y_11644_hi);
			r_lo = vis_fpadd16(r_lo, y_11644_lo);

			red = vis_fpack16_pair(r_hi, r_lo);

			vis_alignaddr((void *)y, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;

			ddp += 3;
		}

/*
 * Row tail: width - i pixels (fewer than eight) remain in red/green/blue.
 * The planes are rotated by that count and dp is moved to the last
 * remaining pixel; the pixels are then written from last to first.
 */
		dp = (mlib_u8 *)ddp;

		vis_alignaddr((void *)(width - i), 0);
		blue = vis_faligndata(blue, blue);
		green = vis_faligndata(green, green);
		red = vis_faligndata(red, red);
		dp += ((width - i - 1) * 3);

/* alignment offset for STORE_PIXEL; only the low address bits matter */
		vis_alignaddr((void *)spy, 7);
		for (; i < width; i++) {
			STORE_PIXEL(0, 1, 2);
			dp -= 3;
		}

/* copy the finished row from the scratch buffer to the destination */
		__mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3);

/* advance to the next row and rewind the scratch pointers */
		rgb += rgb_stride;
		dp = (mlib_u8 *)buf;
		ddp = (mlib_d64 *)dp;
		y += yuv_stride;
		u += yuv_stride;
		v += yuv_stride;
	}

/* same condition as at allocation time: tmp is only valid for wide rows */
	if (width * 3 > 16 * 1024)
		__mlib_free(tmp);
	return (MLIB_SUCCESS);
}
/*
 * Single-input lookup: U16 source samples -> 2-channel S16 destination,
 * for a destination that starts one 16-bit element before the position
 * used for 8-byte stores.
 *
 * The channel-0 entry of the first sample is written on its own; from
 * there on every 8-byte word holds
 *     table[1][prev], table[0][cur], table[1][cur], table[0][next]
 * so the channel-1 entry of each word's last sample is carried over into
 * the following word.  The channel-1 entry of the very last sample is
 * written on its own at the end.
 *
 *   src   - source samples; xsize of them are read
 *   dst   - destination; dst + 1 is used as the 8-byte store pointer.
 *           NOTE(review): presumably the caller guarantees that address
 *           is 8-byte aligned - confirm
 *   xsize - number of source samples (decremented locally once the first
 *           sample has been consumed)
 *   table - table[0] / table[1] are the per-channel tables
 *
 * Samples are kept as byte offsets (sample << 1) while they are in
 * flight.  Each half of an output word is collected in its own packing
 * register and the halves are joined with vis_bshuffle.  The packing
 * registers are deliberately not cleared: only data shifted in by
 * vis_faligndata() is ever stored.
 */
void
mlib_v_ImageLookUpSI_U16_S16_2_D1(
    const mlib_u16 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 **table)
{
/* per-channel tables */
	const mlib_s16 *lut0 = &table[0][0];
	const mlib_s16 *lut1 = &table[1][0];

/* source read cursor */
	mlib_u16 *in = (void *)src;

/* element-wise and 8-byte destination cursors */
	mlib_s16 *out = dst;
	mlib_d64 *out64;

/* byte offsets: sample carried over, and the current pair */
	mlib_s32 prev, cur, next;

/* looked-up entries */
	mlib_d64 e0, e1, e2, e3;

/* packing registers for the first and second half of a word */
	mlib_d64 head, tail;

	mlib_s32 k;

/* alignment offset 6 for the vis_faligndata() calls below */
	vis_alignaddr((void *)0, 6);

/* first sample: emit its channel-0 entry as a lone element */
	prev = (*in++);
	(*out++) = lut0[prev];
	out64 = (mlib_d64 *)out;
	xsize--;
	prev = prev << 1;

	if (xsize >= 2) {

/* prime the pipeline with the first pair */
		cur = (in[0] << 1);
		next = (in[1] << 1);
		in += 2;

/* join the top four bytes of each packing register */
		vis_write_bmask(0x012389ab, 0);

#pragma pipeloop(0)
		for (k = 0; k <= xsize - 4; k += 2, in += 2) {
			e3 = VIS_LD_U16_I(lut0, next);
			e2 = VIS_LD_U16_I(lut1, cur);
			e1 = VIS_LD_U16_I(lut0, cur);
			e0 = VIS_LD_U16_I(lut1, prev);
			tail = vis_faligndata(e3, tail);
			tail = vis_faligndata(e2, tail);
			head = vis_faligndata(e1, head);
			head = vis_faligndata(e0, head);
/* carry the last sample over and fetch the next pair */
			prev = next;
			cur = (in[0] << 1);
			next = (in[1] << 1);
			(*out64++) = vis_bshuffle(head, tail);
		}

/* drain: the pair that was loaded but not yet emitted */
		e3 = VIS_LD_U16_I(lut0, next);
		e2 = VIS_LD_U16_I(lut1, cur);
		e1 = VIS_LD_U16_I(lut0, cur);
		e0 = VIS_LD_U16_I(lut1, prev);
		tail = vis_faligndata(e3, tail);
		tail = vis_faligndata(e2, tail);
		head = vis_faligndata(e1, head);
		head = vis_faligndata(e0, head);
		prev = next;
		(*out64++) = vis_bshuffle(head, tail);
	}

	out = (mlib_s16 *)out64;

/* one sample left over: emit two entries as a 4-byte store */
	if ((xsize & 1) != 0) {
		cur = (in[0] << 1);
		e1 = VIS_LD_U16_I(lut0, cur);
		e0 = VIS_LD_U16_I(lut1, prev);
		head = vis_faligndata(e1, head);
		head = vis_faligndata(e0, head);
		*(mlib_f32 *)out64 = vis_read_hi(head);
		prev = cur;
		out += 2;
	}

/* back from byte offset to index: channel-1 entry of the final sample */
	*out = lut1[prev >> 1];
}
/*
 * Widen a vector of n unsigned 8-bit values into signed 16-bit values.
 *
 *   z - destination, n mlib_s16 values
 *   x - source, n mlib_u8 values
 *   n - number of elements
 *
 * Returns MLIB_SUCCESS on the paths visible here.  Vectors shorter than
 * 16 elements go through the EXPAND macro, which is defined elsewhere;
 * presumably it converts element by element and returns - verify.  The
 * local names are left as they are because EXPAND may refer to them.
 *
 * Longer vectors: the destination is first brought to an 8-byte boundary
 * one element at a time; then every 8 source bytes are expanded into two
 * 8-byte destination words, two such groups per loop iteration (with one
 * group peeled off when the group count is odd).  Elements beyond the
 * last full group are converted one by one.
 */
mlib_status
__mlib_VectorConvert_S16_U8_Mod(
	mlib_s16 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_u8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	mlib_s32 len_64, even_length, rest_64, length = n;
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 dd1, dd2, dd3, dd4;
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		EXPAND(mlib_u8, mlib_s16);
	}

/* scalar prologue: advance until dst sits on an 8-byte boundary */
	for (; ((mlib_addr)dst & 7) != 0; length--) {
		(*dst++) = (*src++);
	}

	ddst = (mlib_d64 *)dst;
	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);

/* groups of 8 source bytes, and what is left after them */
	len_64 = length >> 3;
	rest_64 = length & 7;
	even_length = len_64 << 3;
	dd2 = ddsrc[0];

	if (((mlib_addr)src & 7) != 0) {

/*
 * Source vector is not 64-bit aligned. Use vis_faligndata.
 * Peeling the 1 iteration. Then loop with step==2.
 * i is the index of the next aligned source word to load.
 */

		i = 1;

		if ((len_64 & 1) != 0) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1);
			i = 2;
			dd3 = vis_faligndata(dd1, dd2);
			(*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd3));
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
		}
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i <= len_64; i += 2) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i);
			dd3 = vis_faligndata(dd1, dd2);
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_faligndata(dd1, dd2);
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd3), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd4), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd4));
		}
	} else {

/*
 * Both vectors are 64-bit aligned. We can process without
 * vis_faligndata
 * Peeling the 1 iteration. Then loop with step==2.
 */

		i = (len_64 & 1);

		if (i != 0) {
			dd1 = (*ddsrc++);
			(*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd1));
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
		}
#pragma pipeloop(1)
#pragma unroll(1)
		for (; i < len_64; i += 2) {
			dd1 = (*ddsrc++);
			dd2 = (*ddsrc++);
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd1), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd2), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd2));
		}
	}

/* scalar epilogue: elements after the last full group of 8 */
	for (i = 0; i < rest_64; i++) {
		dst[even_length + i] = src[even_length + i];
	}

	return (MLIB_SUCCESS);
}
mlib_status
__mlib_VideoInterpX_U8_U8(
	mlib_u8 *curr_block,
	const mlib_u8 *ref_block,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 frame_stride,
	mlib_s32 field_stride)
{
	mlib_d64 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, *sd, *dd;
	mlib_d64 dzero = vis_fzero();
	const mlib_f32 fm1 = vis_to_float(0x100);
	mlib_f32 fzero = vis_read_hi(dzero);
	mlib_d64 rounder = vis_fone();
	mlib_s32 y;

	vis_write_gsr((6 << 3) + ((mlib_u32)ref_block & 7));
	dd = (mlib_d64 *)curr_block;
	sd = (mlib_d64 *)((mlib_addr)ref_block & ~7);

	if (width == 8) {
		y = height >> 2;

		if (((mlib_s32)(ref_block + 1) & 7)) {
			do {
				s0 = sd[0];
				s1 = sd[1];
				sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
				d0 = vis_faligndata(s0, s1);
				s2 = sd[0];
				s3 = sd[1];
				sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
				d1 = vis_faligndata(s2, s3);
				s4 = sd[0];
				s5 = sd[1];
				sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
				d2 = vis_faligndata(s4, s5);
				s6 = sd[0];
				s7 = sd[1];
				sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
				d3 = vis_faligndata(s6, s7);
				vis_alignaddr((void *)(ref_block + 1), 0);
				s0 = vis_faligndata(s0, s1);
				s1 = vis_faligndata(s2, s3);
				s2 = vis_faligndata(s4, s5);
				s3 = vis_faligndata(s6, s7);

				MLIB_V_VIDEOINTERP(d0, d0, s0);
				MLIB_V_VIDEOINTERP(d1, d1, s1);
				MLIB_V_VIDEOINTERP(d2, d2, s2);
				MLIB_V_VIDEOINTERP4(d3, d3, s3);

				*dd = d0;
				dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
				*dd = d1;
				dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
				*dd = d2;
				dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
				*dd = d3;
				dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);

				vis_alignaddr((void *)ref_block, 0);
			} while (--y);
		} else {
			do {
				s0 = sd[0];
				s1 = sd[1];
				sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
				d0 = vis_faligndata(s0, s1);
				s2 = sd[0];
				s3 = sd[1];
				sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
				d1 = vis_faligndata(s2, s3);
				s4 = sd[0];
				s5 = sd[1];
				sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
				d2 = vis_faligndata(s4, s5);
				s6 = sd[0];
				s7 = sd[1];
				sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
				d3 = vis_faligndata(s6, s7);

				MLIB_V_VIDEOINTERP4(d0, d0, s1);
				MLIB_V_VIDEOINTERP4(d1, d1, s3);
				MLIB_V_VIDEOINTERP4(d2, d2, s5);
				MLIB_V_VIDEOINTERP4(d3, d3, s7);

				*dd = d0;
				dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
				*dd = d1;
				dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
				*dd = d2;
				dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
				*dd = d3;
				dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);

			} while (--y);
		}
	} else {
/*
 * Single-input lookup: U16 source samples -> 4-channel S16 destination,
 * with the output stream starting at channel 3.
 *
 * Every 8-byte word written through the mlib_d64 pointer holds
 *     table[3][prev], table[0][cur], table[1][cur], table[2][cur]
 * i.e. the channel-3 entry of each sample is carried over into the
 * following word.  The channel-3 entry of the last sample read is written
 * as a single trailing element.
 *
 *   src   - source samples; when xsize >= 1, xsize + 1 of them are read
 *   dst   - destination, stored to directly through an mlib_d64 pointer.
 *           NOTE(review): presumably the caller guarantees 8-byte
 *           alignment - confirm
 *   xsize - number of samples handled after the first (carried) one
 *   table - table[0..3] are the per-channel tables
 *
 * Samples are kept as byte offsets (sample << 1) while they are in
 * flight.  Each half of an output word is collected in its own packing
 * register and the halves are joined with vis_bshuffle.  The packing
 * registers are deliberately not cleared: only data shifted in by
 * vis_faligndata() is ever stored.
 */
void
mlib_v_ImageLookUpSI_U16_S16_4_DstOff3_D1(
    const mlib_u16 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 **table)
{
/* per-channel tables */
	const mlib_s16 *lut0 = &table[0][0];
	const mlib_s16 *lut1 = &table[1][0];
	const mlib_s16 *lut2 = &table[2][0];
	const mlib_s16 *lut3 = &table[3][0];

/* source read cursor */
	mlib_u16 *in = (void *)src;

/* element-wise and 8-byte destination cursors */
	mlib_s16 *out = dst;
	mlib_d64 *out64 = (mlib_d64 *)out;

/* byte offsets of the carried sample and of the current sample */
	mlib_s32 prev, cur;

/* looked-up entries */
	mlib_d64 e0, e1, e2, e3;

/* packing registers for the first and second half of a word */
	mlib_d64 head, tail;

	mlib_s32 k;

/* alignment offset 6 for the vis_faligndata() calls below */
	vis_alignaddr((void *)0, 6);

/* the sample whose channel-3 entry opens the stream */
	prev = ((*in++)) << 1;

	if (xsize >= 1) {

/* prime the pipeline with the first full sample */
		cur = ((*in++)) << 1;

/* join the top four bytes of each packing register */
		vis_write_bmask(0x012389ab, 0);

#pragma pipeloop(0)
		for (k = 0; k <= xsize - 2; k++) {
			e3 = VIS_LD_U16_I(lut2, cur);
			e2 = VIS_LD_U16_I(lut1, cur);
			e1 = VIS_LD_U16_I(lut0, cur);
			e0 = VIS_LD_U16_I(lut3, prev);
			tail = vis_faligndata(e3, tail);
			tail = vis_faligndata(e2, tail);
			head = vis_faligndata(e1, head);
			head = vis_faligndata(e0, head);
/* carry the current sample over and fetch the next one */
			prev = cur;
			cur = ((*in++)) << 1;
			(*out64++) = vis_bshuffle(head, tail);
		}

/* drain: the sample that was loaded but not yet emitted */
		e3 = VIS_LD_U16_I(lut2, cur);
		e2 = VIS_LD_U16_I(lut1, cur);
		e1 = VIS_LD_U16_I(lut0, cur);
		e0 = VIS_LD_U16_I(lut3, prev);
		tail = vis_faligndata(e3, tail);
		tail = vis_faligndata(e2, tail);
		head = vis_faligndata(e1, head);
		head = vis_faligndata(e0, head);
		prev = cur;
		(*out64++) = vis_bshuffle(head, tail);
	}

	out = (mlib_s16 *)out64;

/* back from byte offset to index: channel-3 entry of the last sample */
	out[0] = lut3[prev >> 1];
}