/*
 * Looks up one row of U8 samples in four U8 tables.  Within every 32-bit
 * source word the most significant byte goes through table0, the next
 * through table1, then table2, and the least significant byte through
 * table3; the eight results of two source words form one 64-bit store.
 *
 * src is read 32 bits at a time and dst is written 64 bits at a time.
 * NOTE(review): presumably the caller guarantees a 4-byte aligned src and
 * an 8-byte aligned dst - confirm against the call sites.
 * NOTE(review): in this view the body stops after the 8-samples-per-step
 * path (no handling of the remaining samples, no closing brace) - confirm
 * against the complete file.
 */
void
mlib_v_ImageLookUp_U8_U8_124_SrcOff0_D1(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 *table0,
    const mlib_u8 *table1,
    const mlib_u8 *table2,
    const mlib_u8 *table3)
{
/* aligned pointer to source data */
	mlib_u32 *sa;

/* pointer to source data */
	mlib_u8 *sp;

/* source data: the two 32-bit words (eight samples) being translated */
	mlib_u32 s0, s1;

/* pointer to start of destination */
	mlib_u8 *dl;

/* pointer to end of destination (last byte of the row) */
	mlib_u8 *dend;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data: bytes fetched from the tables */
	mlib_d64 t0, t1, t2;

/* destination data: bytes fetched from the tables */
	mlib_d64 t3, t4, t5;

/*
 * destination data: acc0 collects t0..t3 and acc1 collects t4..t7; both
 * start out uninitialised and are filled by the vis_faligndata() calls
 */
	mlib_d64 t6, t7, acc0, acc1;

/* edge mask */
	mlib_s32 emask;

/* loop variable */
	mlib_s32 i, num;

	sa = (mlib_u32 *)src;
	dl = dst;
	dp = (mlib_d64 *)dl;
	dend = dl + xsize - 1;

/*
 * Align offset 7: each vis_faligndata(t, acc) below yields the last byte
 * of t followed by the first seven bytes of acc, i.e. it pushes one
 * looked-up byte onto the front of the accumulator.
 */
	vis_alignaddr((void *)0, 7);

	if (xsize >= 8) {

/* fetch the first two words ahead of the loop */
		s0 = sa[0];
		s1 = sa[1];
		sa += 2;

/*
 * vis_bshuffle() selector: bytes 0-3 of its first operand followed by
 * bytes 0-3 of its second operand (indices 8-b).
 */
		vis_write_bmask(0x012389ab, 0);

/*
 * Each pass translates the two words fetched by the previous pass and
 * fetches the next two; the bound xsize - 16 keeps that fetch inside the
 * row.
 */
#pragma pipeloop(0)
		for (i = 0; i <= xsize - 16; i += 8, sa += 2) {
			t7 = VIS_LD_U8_I(table3, s1 & 0xFF);
			t6 = VIS_LD_U8_I(table2, (s1 >> 8) & 0xFF);
			t5 = VIS_LD_U8_I(table1, (s1 >> 16) & 0xFF);
			t4 = VIS_LD_U8_I(table0, s1 >> 24);
			t3 = VIS_LD_U8_I(table3, s0 & 0xFF);
			t2 = VIS_LD_U8_I(table2, (s0 >> 8) & 0xFF);
			t1 = VIS_LD_U8_I(table1, (s0 >> 16) & 0xFF);
			t0 = VIS_LD_U8_I(table0, s0 >> 24);
			acc1 = vis_faligndata(t7, acc1);
			acc1 = vis_faligndata(t6, acc1);
			acc1 = vis_faligndata(t5, acc1);
			acc1 = vis_faligndata(t4, acc1);
			acc0 = vis_faligndata(t3, acc0);
			acc0 = vis_faligndata(t2, acc0);
			acc0 = vis_faligndata(t1, acc0);
			acc0 = vis_faligndata(t0, acc0);
			s0 = sa[0];
			s1 = sa[1];
			(*dp++) = vis_bshuffle(acc0, acc1);
		}

/* translate the last pair of words fetched; nothing further is read */
		t7 = VIS_LD_U8_I(table3, s1 & 0xFF);
		t6 = VIS_LD_U8_I(table2, (s1 >> 8) & 0xFF);
		t5 = VIS_LD_U8_I(table1, (s1 >> 16) & 0xFF);
		t4 = VIS_LD_U8_I(table0, s1 >> 24);
		t3 = VIS_LD_U8_I(table3, s0 & 0xFF);
		t2 = VIS_LD_U8_I(table2, (s0 >> 8) & 0xFF);
		t1 = VIS_LD_U8_I(table1, (s0 >> 16) & 0xFF);
		t0 = VIS_LD_U8_I(table0, s0 >> 24);
		acc1 = vis_faligndata(t7, acc1);
		acc1 = vis_faligndata(t6, acc1);
		acc1 = vis_faligndata(t5, acc1);
		acc1 = vis_faligndata(t4, acc1);
		acc0 = vis_faligndata(t3, acc0);
		acc0 = vis_faligndata(t2, acc0);
		acc0 = vis_faligndata(t1, acc0);
		acc0 = vis_faligndata(t0, acc0);
		(*dp++) = vis_bshuffle(acc0, acc1);
	}
/*
 * Looks up the eight output bytes of four source pixels and pushes them
 * onto pack, leaving lut0[P0], lut1[P0], lut0[P1], ..., lut1[P3] in output
 * order.  The table loads are issued first and the bytes are then pushed
 * last-to-first: with the align offset of 7 programmed by vis_alignaddr(),
 * every vis_faligndata(b, pack) puts the byte loaded into b in front of the
 * leading seven bytes of pack.
 */
#define MLIB_SI_U16_U8_2_A8_GATHER(P0, P1, P2, P3) \
  do { \
    b7 = VIS_LD_U8_I(lut1, (P3)); \
    b6 = VIS_LD_U8_I(lut0, (P3)); \
    b5 = VIS_LD_U8_I(lut1, (P2)); \
    b4 = VIS_LD_U8_I(lut0, (P2)); \
    b3 = VIS_LD_U8_I(lut1, (P1)); \
    b2 = VIS_LD_U8_I(lut0, (P1)); \
    b1 = VIS_LD_U8_I(lut1, (P0)); \
    b0 = VIS_LD_U8_I(lut0, (P0)); \
    pack = vis_faligndata(b7, pack); \
    pack = vis_faligndata(b6, pack); \
    pack = vis_faligndata(b5, pack); \
    pack = vis_faligndata(b4, pack); \
    pack = vis_faligndata(b3, pack); \
    pack = vis_faligndata(b2, pack); \
    pack = vis_faligndata(b1, pack); \
    pack = vis_faligndata(b0, pack); \
  } while (0)

/*
 * Single-input look-up, U16 source, two U8 output channels: every source
 * pixel p produces the byte pair table[0][p], table[1][p].
 *
 *   src    row of xsize U16 pixels
 *   dst    receives 2 * xsize bytes, written with 64-bit stores
 *   xsize  number of source pixels
 *   table  table[0] and table[1] are indexed directly by the pixel value
 *
 * NOTE(review): dst is presumably 8-byte aligned ("DstA8") - confirm
 * against the caller.
 */
void mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1(const mlib_u16 *src,
                                           mlib_u8        *dst,
                                           mlib_s32       xsize,
                                           const mlib_u8  **table)
{
  const mlib_u8 *lut0 = table[0];      /* table of the first output byte */
  const mlib_u8 *lut1 = table[1];      /* table of the second output byte */
  const mlib_u16 *in = src;            /* next source pixel to fetch */
  mlib_u16 *last = (mlib_u16 *) dst + xsize - 1; /* last 2-byte output pixel */
  mlib_d64 *out = (mlib_d64 *) dst;    /* next 8-byte output word */
  mlib_s32 px0, px1, px2, px3;         /* source pixels being translated */
  mlib_d64 b0, b1, b2, b3;             /* bytes fetched from the tables */
  mlib_d64 b4, b5, b6, b7;
  mlib_d64 pack;                       /* output word being assembled; left
                                          uninitialised on purpose, only bytes
                                          pushed into it are meant to be stored */
  mlib_s32 mask;                       /* partial-store mask of the tail */
  mlib_s32 n, left;                    /* loop counter, pixels left over */

  vis_alignaddr((void *)0, 7);

  if (xsize >= 4) {

    /* fetch the first group ahead of the loop */
    px0 = in[0];
    px1 = in[1];
    px2 = in[2];
    px3 = in[3];
    in += 4;

    /* translate the group fetched last time round while fetching the next */
#pragma pipeloop(0)
    for (n = 0; n <= xsize - 8; n += 4, in += 4) {
      MLIB_SI_U16_U8_2_A8_GATHER(px0, px1, px2, px3);
      px0 = in[0];
      px1 = in[1];
      px2 = in[2];
      px3 = in[3];
      *out++ = pack;
    }

    /* the final group fetched still has to be translated and stored */
    MLIB_SI_U16_U8_2_A8_GATHER(px0, px1, px2, px3);
    *out++ = pack;
  }

  if ((mlib_addr) out <= (mlib_addr) last) {

    /* up to three pixels remain: walk them backwards so the earliest pixel
       ends up at the front of pack, then store only the covered part */
    left = last - (mlib_u16 *) out;
    in += left;
    left++;
#pragma pipeloop(0)
    for (n = 0; n < left; n++) {
      px0 = (mlib_s32) *in;
      in--;

      b0 = VIS_LD_U8_I(lut1, px0);
      pack = vis_faligndata(b0, pack);

      b0 = VIS_LD_U8_I(lut0, px0);
      pack = vis_faligndata(b0, pack);
    }

    mask = vis_edge16(out, last);
    vis_pst_16(pack, out, mask);
  }
}
#undef MLIB_SI_U16_U8_2_A8_GATHER
/*
 * Assembles one 8-byte output word in pack.  In output order it holds
 * lut3[PREV], lut0..lut3 of P1 and lut0..lut2 of P2.  Loads come first,
 * then the bytes are pushed last-to-first: with the align offset of 7 set
 * by vis_alignaddr(), vis_faligndata(b, pack) puts the byte loaded into b
 * in front of the leading seven bytes of pack.
 */
#define MLIB_SI_U16_U8_4_OFF3_GATHER(PREV, P1, P2) \
  do { \
    b7 = VIS_LD_U8_I(lut2, (P2)); \
    b6 = VIS_LD_U8_I(lut1, (P2)); \
    b5 = VIS_LD_U8_I(lut0, (P2)); \
    b4 = VIS_LD_U8_I(lut3, (P1)); \
    b3 = VIS_LD_U8_I(lut2, (P1)); \
    b2 = VIS_LD_U8_I(lut1, (P1)); \
    b1 = VIS_LD_U8_I(lut0, (P1)); \
    b0 = VIS_LD_U8_I(lut3, (PREV)); \
    pack = vis_faligndata(b7, pack); \
    pack = vis_faligndata(b6, pack); \
    pack = vis_faligndata(b5, pack); \
    pack = vis_faligndata(b4, pack); \
    pack = vis_faligndata(b3, pack); \
    pack = vis_faligndata(b2, pack); \
    pack = vis_faligndata(b1, pack); \
    pack = vis_faligndata(b0, pack); \
  } while (0)

/*
 * Single-input look-up, U16 source, four U8 output channels, for an output
 * position that starts on the fourth channel of a pixel: the first byte
 * written is table[3][src[0]], followed by the four table bytes of each
 * subsequent pixel, and the row is closed with the table[3] byte of the
 * last pixel read.
 *
 *   src    source pixels; src[0] contributes only its table[3] byte
 *   dst    written with 64-bit stores, then at most one 32-bit store and
 *          one final byte
 *   xsize  controls how many pixels after src[0] are consumed (pairs in
 *          the main path, one more when xsize is odd)
 *   table  table[0..3] are indexed directly by the pixel value
 *
 * NOTE(review): presumably the caller has already stored the first three
 * bytes of src[0] and dst is 8-byte aligned here - confirm against the
 * caller.
 */
void mlib_v_ImageLookUpSI_U16_U8_4_DstOff3_D1(const mlib_u16 *src,
                                              mlib_u8        *dst,
                                              mlib_s32       xsize,
                                              const mlib_u8  **table)
{
  const mlib_u8 *lut0 = table[0];
  const mlib_u8 *lut1 = table[1];
  const mlib_u8 *lut2 = table[2];
  const mlib_u8 *lut3 = table[3];
  const mlib_u16 *in = src;            /* next source pixel to fetch */
  mlib_d64 *out = (mlib_d64 *) dst;    /* next 8-byte output word */
  mlib_u8 *tail;                       /* byte cursor used after the main path */
  mlib_s32 prev, px1, px2;             /* pixel carried over, two new pixels */
  mlib_d64 b0, b1, b2, b3;             /* bytes fetched from the tables */
  mlib_d64 b4, b5, b6, b7;
  mlib_d64 pack;                       /* output word being assembled; left
                                          uninitialised, every stored byte is
                                          pushed into it first */
  mlib_s32 n;                          /* loop counter */

  vis_alignaddr((void *)0, 7);

  prev = *in++;

  if (xsize >= 2) {

    /* fetch the first pair ahead of the loop */
    px1 = in[0];
    px2 = in[1];
    in += 2;

    /* translate the pair fetched last time round while fetching the next */
#pragma pipeloop(0)
    for (n = 0; n <= xsize - 4; n += 2, in += 2) {
      MLIB_SI_U16_U8_4_OFF3_GATHER(prev, px1, px2);
      prev = px2;
      px1 = in[0];
      px2 = in[1];
      *out++ = pack;
    }

    /* the final pair fetched still has to be translated and stored */
    MLIB_SI_U16_U8_4_OFF3_GATHER(prev, px1, px2);
    prev = px2;
    *out++ = pack;
  }

  tail = (mlib_u8 *) out;

  if ((xsize & 1) != 0) {
    /* odd count: one more pixel yields four bytes, stored as a 32-bit half */
    px1 = in[0];
    b7 = VIS_LD_U8_I(lut2, px1);
    b6 = VIS_LD_U8_I(lut1, px1);
    b5 = VIS_LD_U8_I(lut0, px1);
    b4 = VIS_LD_U8_I(lut3, prev);
    pack = vis_faligndata(b7, pack);
    pack = vis_faligndata(b6, pack);
    pack = vis_faligndata(b5, pack);
    pack = vis_faligndata(b4, pack);
    *(mlib_f32 *) tail = vis_read_hi(pack);
    tail += 4;
    prev = px1;
  }

  /* fourth channel of the last pixel read */
  tail[0] = lut3[prev];
}
#undef MLIB_SI_U16_U8_4_OFF3_GATHER
/*
 * Pushes the eight bytes held in b7..b0 onto word W, b7 first, so that b0
 * ends up as the leading byte (align offset 7, see vis_alignaddr() below).
 */
#define MLIB_SI_U16_U8_3_PUSH8(W) \
  do { \
    W = vis_faligndata(b7, W); \
    W = vis_faligndata(b6, W); \
    W = vis_faligndata(b5, W); \
    W = vis_faligndata(b4, W); \
    W = vis_faligndata(b3, W); \
    W = vis_faligndata(b2, W); \
    W = vis_faligndata(b1, W); \
    W = vis_faligndata(b0, W); \
  } while (0)

/*
 * Translates eight source pixels into 24 output bytes spread over the
 * words w0, w1, w2.  In output order:
 *   w0 = P0:{0,1,2} P1:{0,1,2} P2:{0,1}
 *   w1 = P2:{2} P3:{0,1,2} P4:{0,1,2} P5:{0}
 *   w2 = P5:{1,2} P6:{0,1,2} P7:{0,1,2}
 * where Pk:{c} stands for lutc[Pk].
 */
#define MLIB_SI_U16_U8_3_GATHER(P0, P1, P2, P3, P4, P5, P6, P7) \
  do { \
    b7 = VIS_LD_U8_I(lut1, (P2)); \
    b6 = VIS_LD_U8_I(lut0, (P2)); \
    b5 = VIS_LD_U8_I(lut2, (P1)); \
    b4 = VIS_LD_U8_I(lut1, (P1)); \
    b3 = VIS_LD_U8_I(lut0, (P1)); \
    b2 = VIS_LD_U8_I(lut2, (P0)); \
    b1 = VIS_LD_U8_I(lut1, (P0)); \
    b0 = VIS_LD_U8_I(lut0, (P0)); \
    MLIB_SI_U16_U8_3_PUSH8(w0); \
    b7 = VIS_LD_U8_I(lut0, (P5)); \
    b6 = VIS_LD_U8_I(lut2, (P4)); \
    b5 = VIS_LD_U8_I(lut1, (P4)); \
    b4 = VIS_LD_U8_I(lut0, (P4)); \
    b3 = VIS_LD_U8_I(lut2, (P3)); \
    b2 = VIS_LD_U8_I(lut1, (P3)); \
    b1 = VIS_LD_U8_I(lut0, (P3)); \
    b0 = VIS_LD_U8_I(lut2, (P2)); \
    MLIB_SI_U16_U8_3_PUSH8(w1); \
    b7 = VIS_LD_U8_I(lut2, (P7)); \
    b6 = VIS_LD_U8_I(lut1, (P7)); \
    b5 = VIS_LD_U8_I(lut0, (P7)); \
    b4 = VIS_LD_U8_I(lut2, (P6)); \
    b3 = VIS_LD_U8_I(lut1, (P6)); \
    b2 = VIS_LD_U8_I(lut0, (P6)); \
    b1 = VIS_LD_U8_I(lut2, (P5)); \
    b0 = VIS_LD_U8_I(lut1, (P5)); \
    MLIB_SI_U16_U8_3_PUSH8(w2); \
  } while (0)

/*
 * Single-input look-up, U16 source, three U8 output channels: every source
 * pixel p produces table[0][p], table[1][p], table[2][p].
 *
 *   src    row of xsize U16 pixels
 *   dst    receives 3 * xsize bytes; groups of eight pixels are written as
 *          three 64-bit stores, the remaining pixels byte by byte
 *   xsize  number of source pixels
 *   table  table[0..2] are indexed directly by the pixel value
 *
 * NOTE(review): the 64-bit stores presumably require an 8-byte aligned
 * dst - confirm against the caller.
 */
void mlib_v_ImageLookUpSI_U16_U8_3_D1(const mlib_u16 *src,
                                      mlib_u8        *dst,
                                      mlib_s32       xsize,
                                      const mlib_u8  **table)
{
  const mlib_u8 *lut0 = table[0];
  const mlib_u8 *lut1 = table[1];
  const mlib_u8 *lut2 = table[2];
  const mlib_u16 *in = src;            /* next source pixel to fetch */
  mlib_d64 *out = (mlib_d64 *) dst;    /* next 8-byte output word */
  mlib_u8 *tail;                       /* byte cursor for the leftover pixels */
  mlib_s32 p0, p1, p2, p3;             /* the eight pixels being translated */
  mlib_s32 p4, p5, p6, p7;
  mlib_d64 b0, b1, b2, b3;             /* bytes fetched from the tables */
  mlib_d64 b4, b5, b6, b7;
  mlib_d64 w0, w1, w2;                 /* output words being assembled; left
                                          uninitialised, all eight bytes of
                                          each are pushed before it is stored */
  mlib_s32 n = 0;                      /* pixels handled by the vector path */

  vis_alignaddr((void *)0, 7);

  if (xsize >= 8) {

    /* fetch the first group ahead of the loop */
    p0 = in[0];
    p1 = in[1];
    p2 = in[2];
    p3 = in[3];
    p4 = in[4];
    p5 = in[5];
    p6 = in[6];
    p7 = in[7];
    in += 8;

    /* translate the group fetched last time round while fetching the next */
#pragma pipeloop(0)
    for (n = 0; n <= xsize - 16; n += 8, in += 8) {
      MLIB_SI_U16_U8_3_GATHER(p0, p1, p2, p3, p4, p5, p6, p7);
      p0 = in[0];
      p1 = in[1];
      p2 = in[2];
      p3 = in[3];
      p4 = in[4];
      p5 = in[5];
      p6 = in[6];
      p7 = in[7];
      *out++ = w0;
      *out++ = w1;
      *out++ = w2;
    }

    /* the final group fetched still has to be translated and stored */
    MLIB_SI_U16_U8_3_GATHER(p0, p1, p2, p3, p4, p5, p6, p7);
    *out++ = w0;
    *out++ = w1;
    *out++ = w2;
    n += 8;
  }

  tail = (mlib_u8 *) out;

  /* fewer than eight pixels remain: plain scalar look-ups */
#pragma pipeloop(0)
  for (; n < xsize; n++) {
    p0 = in[0];
    tail[0] = lut0[p0];
    tail[1] = lut1[p0];
    tail[2] = lut2[p0];
    tail += 3;
    in++;
  }
}
#undef MLIB_SI_U16_U8_3_GATHER
#undef MLIB_SI_U16_U8_3_PUSH8
/*
 * Translates eight source samples: P0..P3 go through table0..table3 into
 * the leading four bytes of front, P4..P7 go through table0..table3 into
 * the leading four bytes of back.  Loads come first, then the bytes are
 * pushed last-to-first: with the align offset of 7 set by vis_alignaddr(),
 * vis_faligndata(b, acc) puts the byte loaded into b in front of the
 * leading seven bytes of acc.
 */
#define	MLIB_S16_U8_124_GATHER(P0, P1, P2, P3, P4, P5, P6, P7) \
	do { \
		b7 = VIS_LD_U8_I(table3, (P7)); \
		b6 = VIS_LD_U8_I(table2, (P6)); \
		b5 = VIS_LD_U8_I(table1, (P5)); \
		b4 = VIS_LD_U8_I(table0, (P4)); \
		b3 = VIS_LD_U8_I(table3, (P3)); \
		b2 = VIS_LD_U8_I(table2, (P2)); \
		b1 = VIS_LD_U8_I(table1, (P1)); \
		b0 = VIS_LD_U8_I(table0, (P0)); \
		back = vis_faligndata(b7, back); \
		back = vis_faligndata(b6, back); \
		back = vis_faligndata(b5, back); \
		back = vis_faligndata(b4, back); \
		front = vis_faligndata(b3, front); \
		front = vis_faligndata(b2, front); \
		front = vis_faligndata(b1, front); \
		front = vis_faligndata(b0, front); \
	} while (0)

/*
 * Tail helper: reads the sample under the cursor, steps the cursor back by
 * one and pushes the sample's TBL entry onto front.
 */
#define	MLIB_S16_U8_124_PUSH_PREV(TBL) \
	do { \
		px0 = (mlib_s32)*in; \
		in--; \
		b0 = VIS_LD_U8_I((TBL), px0); \
		front = vis_faligndata(b0, front); \
	} while (0)

/*
 * Looks up one row of S16 samples in four U8 tables; consecutive samples
 * are translated through table0, table1, table2, table3 in turn.
 *
 *   src            row of xsize S16 samples
 *   dst            receives xsize bytes: 64-bit stores for full groups of
 *                  eight, one partial store for what is left
 *   xsize          number of samples
 *   table0..table3 indexed with the signed sample value
 *
 * NOTE(review): the tables are indexed with possibly negative values, so
 * the pointers presumably already point at the entry for sample value 0;
 * dst is presumably 8-byte aligned - confirm both against the caller.
 */
void
mlib_v_ImageLookUp_S16_U8_124_D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 *table0,
    const mlib_u8 *table1,
    const mlib_u8 *table2,
    const mlib_u8 *table3)
{
/* next source sample to fetch */
	const mlib_s16 *in = src;

/* last byte of the destination row */
	mlib_u8 *last = dst + xsize - 1;

/* next 8-byte output word */
	mlib_d64 *out = (mlib_d64 *)dst;

/* the eight samples being translated */
	mlib_s32 px0, px1, px2, px3;
	mlib_s32 px4, px5, px6, px7;

/* bytes fetched from the tables */
	mlib_d64 b0, b1, b2, b3;
	mlib_d64 b4, b5, b6, b7;

/*
 * halves of the output word; left uninitialised on purpose, only bytes
 * pushed into them are selected by vis_bshuffle() or the partial store
 */
	mlib_d64 front, back;

/* partial-store mask, loop counter, tail length and its remainder */
	mlib_s32 mask, n, left, odd;

	vis_alignaddr((void *)0, 7);

	if (xsize >= 8) {

		/* fetch the first group ahead of the loop */
		px0 = in[0];
		px1 = in[1];
		px2 = in[2];
		px3 = in[3];
		px4 = in[4];
		px5 = in[5];
		px6 = in[6];
		px7 = in[7];
		in += 8;

		/* select bytes 0-3 of front followed by bytes 0-3 of back */
		vis_write_bmask(0x012389ab, 0);

		/* translate the previous group while fetching the next */
#pragma pipeloop(0)
		for (n = 0; n <= xsize - 16; n += 8, in += 8) {
			MLIB_S16_U8_124_GATHER(px0, px1, px2, px3,
			    px4, px5, px6, px7);
			px0 = in[0];
			px1 = in[1];
			px2 = in[2];
			px3 = in[3];
			px4 = in[4];
			px5 = in[5];
			px6 = in[6];
			px7 = in[7];
			(*out++) = vis_bshuffle(front, back);
		}

		/* the final group fetched still has to be stored */
		MLIB_S16_U8_124_GATHER(px0, px1, px2, px3,
		    px4, px5, px6, px7);
		(*out++) = vis_bshuffle(front, back);
	}

	if ((mlib_addr)out <= (mlib_addr)last) {

		/*
		 * Leftover samples are walked backwards from the last one so
		 * that the earliest ends up at the front of the word.
		 */
		left = (mlib_addr)last - (mlib_addr)out;
		in += left;
		left++;

		/* first the incomplete group of 1..3 samples at the very end */
		odd = left & 3;
		switch (odd) {
		case 3:
			MLIB_S16_U8_124_PUSH_PREV(table2);
			/* FALLTHROUGH */
		case 2:
			MLIB_S16_U8_124_PUSH_PREV(table1);
			/* FALLTHROUGH */
		case 1:
			MLIB_S16_U8_124_PUSH_PREV(table0);
			break;
		default:
			break;
		}
		left -= odd;

		/* then one complete group of four, if any samples are left */
		if (left != 0) {
			MLIB_S16_U8_124_PUSH_PREV(table3);
			MLIB_S16_U8_124_PUSH_PREV(table2);
			MLIB_S16_U8_124_PUSH_PREV(table1);
			MLIB_S16_U8_124_PUSH_PREV(table0);
		}

		mask = vis_edge8(out, last);
		vis_pst_8(front, out, mask);
	}
}
#undef MLIB_S16_U8_124_PUSH_PREV
#undef MLIB_S16_U8_124_GATHER
/*
 * Translates eight consecutive samples into pack using the table cycle
 * table0, table1, table2, table0, ... and then rotates the three table
 * pointers so that table0 is again the table of the next sample (eight
 * samples advance the three-table cycle by two positions).  Loads come
 * first, then the bytes are pushed last-to-first: with the align offset of
 * 7 set by vis_alignaddr(), vis_faligndata(b, pack) puts the byte loaded
 * into b in front of the leading seven bytes of pack.
 */
#define MLIB_S16_U8_3_STEP(P0, P1, P2, P3, P4, P5, P6, P7) \
  do { \
    b7 = VIS_LD_U8_I(table1, (P7)); \
    b6 = VIS_LD_U8_I(table0, (P6)); \
    b5 = VIS_LD_U8_I(table2, (P5)); \
    b4 = VIS_LD_U8_I(table1, (P4)); \
    b3 = VIS_LD_U8_I(table0, (P3)); \
    b2 = VIS_LD_U8_I(table2, (P2)); \
    b1 = VIS_LD_U8_I(table1, (P1)); \
    b0 = VIS_LD_U8_I(table0, (P0)); \
    pack = vis_faligndata(b7, pack); \
    pack = vis_faligndata(b6, pack); \
    pack = vis_faligndata(b5, pack); \
    pack = vis_faligndata(b4, pack); \
    pack = vis_faligndata(b3, pack); \
    pack = vis_faligndata(b2, pack); \
    pack = vis_faligndata(b1, pack); \
    pack = vis_faligndata(b0, pack); \
    swap = table0; \
    table0 = table2; \
    table2 = table1; \
    table1 = swap; \
  } while (0)

/*
 * Tail helper: reads the sample under the cursor, steps the cursor back by
 * one and pushes the sample's TBL entry onto pack.
 */
#define MLIB_S16_U8_3_PUSH_PREV(TBL) \
  do { \
    px0 = (mlib_s32) *in; \
    in--; \
    b0 = VIS_LD_U8_I((TBL), px0); \
    pack = vis_faligndata(b0, pack); \
  } while (0)

/*
 * Looks up one row of S16 samples in three U8 tables; consecutive samples
 * are translated through table0, table1, table2 in turn.
 *
 *   src            row of xsize S16 samples
 *   dst            receives xsize bytes: 64-bit stores for full groups of
 *                  eight, one partial store for what is left
 *   xsize          number of samples
 *   table0..table2 indexed with the signed sample value
 *
 * NOTE(review): the tables are indexed with possibly negative values, so
 * the pointers presumably already point at the entry for sample value 0;
 * dst is presumably 8-byte aligned - confirm both against the caller.
 */
void mlib_v_ImageLookUp_S16_U8_3_D1(const mlib_s16 *src,
                                    mlib_u8        *dst,
                                    mlib_s32       xsize,
                                    const mlib_u8  *table0,
                                    const mlib_u8  *table1,
                                    const mlib_u8  *table2)
{
  const mlib_s16 *in = src;            /* next source sample to fetch */
  mlib_u8 *last = dst + xsize - 1;     /* last byte of the destination row */
  mlib_d64 *out = (mlib_d64 *) dst;    /* next 8-byte output word */
  const mlib_u8 *swap;                 /* scratch for rotating the tables */
  mlib_s32 px0, px1, px2, px3;         /* the eight samples being translated */
  mlib_s32 px4, px5, px6, px7;
  mlib_d64 b0, b1, b2, b3;             /* bytes fetched from the tables */
  mlib_d64 b4, b5, b6, b7;
  mlib_d64 pack;                       /* output word being assembled; left
                                          uninitialised on purpose, only bytes
                                          pushed into it are meant to be stored */
  mlib_s32 mask;                       /* partial-store mask of the tail */
  mlib_s32 n, left, odd;               /* counter, tail length, tail % 3 */

  vis_alignaddr((void *)0, 7);

  if (xsize >= 8) {

    /* fetch the first group ahead of the loop */
    px0 = in[0];
    px1 = in[1];
    px2 = in[2];
    px3 = in[3];
    px4 = in[4];
    px5 = in[5];
    px6 = in[6];
    px7 = in[7];
    in += 8;

    /* translate the group fetched last time round while fetching the next */
#pragma pipeloop(0)
    for (n = 0; n <= xsize - 16; n += 8, in += 8) {
      MLIB_S16_U8_3_STEP(px0, px1, px2, px3, px4, px5, px6, px7);
      px0 = in[0];
      px1 = in[1];
      px2 = in[2];
      px3 = in[3];
      px4 = in[4];
      px5 = in[5];
      px6 = in[6];
      px7 = in[7];
      *out++ = pack;
    }

    /* the final group fetched still has to be translated and stored */
    MLIB_S16_U8_3_STEP(px0, px1, px2, px3, px4, px5, px6, px7);
    *out++ = pack;
  }

  if ((mlib_addr) out <= (mlib_addr) last) {

    /* leftover samples are walked backwards from the last one so that the
       earliest ends up at the front of the word */
    left = (mlib_addr) last - (mlib_addr) out;
    in += left;
    left++;

    /* first the incomplete triple (one or two samples) at the very end */
    odd = left % 3;
    switch (odd) {
    case 2:
      MLIB_S16_U8_3_PUSH_PREV(table1);
      /* fall through */
    case 1:
      MLIB_S16_U8_3_PUSH_PREV(table0);
      break;
    default:
      break;
    }
    left -= odd;

    /* then the complete triples */
#pragma pipeloop(0)
    for (n = 0; n < left; n += 3) {
      MLIB_S16_U8_3_PUSH_PREV(table2);
      MLIB_S16_U8_3_PUSH_PREV(table1);
      MLIB_S16_U8_3_PUSH_PREV(table0);
    }

    mask = vis_edge8(out, last);
    vis_pst_8(pack, out, mask);
  }
}
#undef MLIB_S16_U8_3_PUSH_PREV
#undef MLIB_S16_U8_3_STEP
/*
 * Assembles one 8-byte output word in pack.  In output order it holds
 * lut1[PREV], then lut0/lut1 of P1, P2 and P3, and finally lut0[P4].  Loads
 * come first, then the bytes are pushed last-to-first: with the align
 * offset of 7 set by vis_alignaddr(), vis_faligndata(b, pack) puts the byte
 * loaded into b in front of the leading seven bytes of pack.
 */
#define	MLIB_SI_S16_U8_2_GATHER(PREV, P1, P2, P3, P4) \
	do { \
		b7 = VIS_LD_U8_I(lut0, (P4)); \
		b6 = VIS_LD_U8_I(lut1, (P3)); \
		b5 = VIS_LD_U8_I(lut0, (P3)); \
		b4 = VIS_LD_U8_I(lut1, (P2)); \
		b3 = VIS_LD_U8_I(lut0, (P2)); \
		b2 = VIS_LD_U8_I(lut1, (P1)); \
		b1 = VIS_LD_U8_I(lut0, (P1)); \
		b0 = VIS_LD_U8_I(lut1, (PREV)); \
		pack = vis_faligndata(b7, pack); \
		pack = vis_faligndata(b6, pack); \
		pack = vis_faligndata(b5, pack); \
		pack = vis_faligndata(b4, pack); \
		pack = vis_faligndata(b3, pack); \
		pack = vis_faligndata(b2, pack); \
		pack = vis_faligndata(b1, pack); \
		pack = vis_faligndata(b0, pack); \
	} while (0)

/*
 * Single-input look-up, S16 source, two U8 output channels: every source
 * pixel p produces the byte pair table[0][32768 + p], table[1][32768 + p].
 *
 * The first output byte is stored on its own; the remaining 2 * xsize - 1
 * bytes are written from dst + 1 onwards with 64-bit stores and one final
 * partial store.
 *
 *   src    row of xsize S16 pixels
 *   dst    receives 2 * xsize bytes
 *   xsize  number of source pixels
 *   table  table[0] and table[1]; entry 32768 corresponds to pixel value 0
 *
 * NOTE(review): dst + 1 is presumably 8-byte aligned - confirm against the
 * caller.
 */
void
mlib_v_ImageLookUpSI_S16_U8_2_D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{
/* tables, biased so that they can be indexed with the signed pixel value */
	const mlib_u8 *lut0 = &table[0][32768];
	const mlib_u8 *lut1 = &table[1][32768];

/* next source pixel to fetch */
	const mlib_s16 *in = src;

/* last byte of the destination row */
	mlib_u8 *last = dst + 2 * xsize - 1;

/* next 8-byte output word */
	mlib_d64 *out;

/* pixel whose second byte is still pending, and four new pixels */
	mlib_s32 prev, px1, px2, px3, px4;

/* bytes fetched from the tables */
	mlib_d64 b0, b1, b2, b3;
	mlib_d64 b4, b5, b6, b7;

/*
 * output word being assembled; left uninitialised on purpose, only bytes
 * pushed into it are meant to be stored
 */
	mlib_d64 pack;

/* partial-store mask, loop counter, whole pixels left for the tail */
	mlib_s32 mask, n, left;

	vis_alignaddr((void *)0, 7);

	/* first byte of the first pixel goes out on its own */
	prev = (*in++);
	dst[0] = lut0[prev];
	out = (mlib_d64 *)(dst + 1);
	xsize--;

	if (xsize >= 4) {

		/* fetch the first group ahead of the loop */
		px1 = in[0];
		px2 = in[1];
		px3 = in[2];
		px4 = in[3];
		in += 4;

		/* translate the previous group while fetching the next */
#pragma pipeloop(0)
		for (n = 0; n <= xsize - 8; n += 4, in += 4) {
			MLIB_SI_S16_U8_2_GATHER(prev, px1, px2, px3, px4);
			prev = px4;
			px1 = in[0];
			px2 = in[1];
			px3 = in[2];
			px4 = in[3];
			(*out++) = pack;
		}

		/* the final group fetched still has to be stored */
		MLIB_SI_S16_U8_2_GATHER(prev, px1, px2, px3, px4);
		prev = px4;
		(*out++) = pack;
	}

	/*
	 * Tail: the pending second byte of prev plus `left` whole pixels.
	 * The pixels are walked backwards, then the pending byte is pushed
	 * last so that it lands at the front of the word.
	 */
	left = ((mlib_u8 *)last - (mlib_u8 *)out) >> 1;
	in += (left - 1);

	for (n = 0; n < left; n++) {
		px1 = (mlib_s32)*in;
		in--;

		b0 = VIS_LD_U8_I(lut1, px1);
		pack = vis_faligndata(b0, pack);

		b0 = VIS_LD_U8_I(lut0, px1);
		pack = vis_faligndata(b0, pack);
	}

	b0 = VIS_LD_U8_I(lut1, prev);
	pack = vis_faligndata(b0, pack);
	mask = vis_edge8(out, last);
	vis_pst_8(pack, out, mask);
}
#undef MLIB_SI_S16_U8_2_GATHER
/*
 * Translates the bytes of one 8-byte output word.  front receives, in
 * output order, lut2[PREV], lut3[PREV], lut0[P1], lut1[P1]; back receives
 * lut2[P1], lut3[P1], lut0[P2], lut1[P2].  Loads come first, then the bytes
 * are pushed last-to-first: with the align offset of 7 set by
 * vis_alignaddr(), vis_faligndata(b, acc) puts the byte loaded into b in
 * front of the leading seven bytes of acc.
 */
#define	MLIB_SI_S16_U8_4_OFF2_GATHER(PREV, P1, P2) \
	do { \
		b7 = VIS_LD_U8_I(lut1, (P2)); \
		b6 = VIS_LD_U8_I(lut0, (P2)); \
		b5 = VIS_LD_U8_I(lut3, (P1)); \
		b4 = VIS_LD_U8_I(lut2, (P1)); \
		b3 = VIS_LD_U8_I(lut1, (P1)); \
		b2 = VIS_LD_U8_I(lut0, (P1)); \
		b1 = VIS_LD_U8_I(lut3, (PREV)); \
		b0 = VIS_LD_U8_I(lut2, (PREV)); \
		back = vis_faligndata(b7, back); \
		back = vis_faligndata(b6, back); \
		back = vis_faligndata(b5, back); \
		back = vis_faligndata(b4, back); \
		front = vis_faligndata(b3, front); \
		front = vis_faligndata(b2, front); \
		front = vis_faligndata(b1, front); \
		front = vis_faligndata(b0, front); \
	} while (0)

/*
 * Single-input look-up, S16 source, four U8 output channels, for an output
 * position that starts on the third channel of a pixel: the first bytes
 * written are the table[2] and table[3] entries of src[0], followed by the
 * four table bytes of each subsequent pixel, and the row is closed with the
 * table[2] and table[3] bytes of the last pixel read.
 *
 *   src    source pixels; src[0] contributes only its last two bytes
 *   dst    written with 64-bit stores, then at most one 32-bit store and
 *          two final bytes
 *   xsize  controls how many pixels after src[0] are consumed (pairs in
 *          the main path, one more when xsize is odd)
 *   table  table[0..3]; entry 32768 corresponds to pixel value 0
 *
 * NOTE(review): presumably the caller has already stored the first two
 * bytes of src[0] and dst is 8-byte aligned here - confirm against the
 * caller.
 */
void
mlib_v_ImageLookUpSI_S16_U8_4_DstOff2_D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{
/* tables, biased so that they can be indexed with the signed pixel value */
	const mlib_u8 *lut0 = &table[0][32768];
	const mlib_u8 *lut1 = &table[1][32768];
	const mlib_u8 *lut2 = &table[2][32768];
	const mlib_u8 *lut3 = &table[3][32768];

/* next source pixel to fetch */
	const mlib_s16 *in = src;

/* next 8-byte output word */
	mlib_d64 *out = (mlib_d64 *)dst;

/* byte cursor used after the main path */
	mlib_u8 *tail;

/* pixel whose last two bytes are still pending, and two new pixels */
	mlib_s32 prev, px1, px2;

/* bytes fetched from the tables */
	mlib_d64 b0, b1, b2, b3;
	mlib_d64 b4, b5, b6, b7;

/*
 * halves of the output word; left uninitialised on purpose, only bytes
 * pushed into them are selected by vis_bshuffle() or vis_read_hi()
 */
	mlib_d64 front, back;

/* loop counter */
	mlib_s32 n;

	vis_alignaddr((void *)0, 7);

	prev = (*in++);

	if (xsize >= 2) {

		/* fetch the first pair ahead of the loop */
		px1 = in[0];
		px2 = in[1];
		in += 2;

		/* select bytes 0-3 of front followed by bytes 0-3 of back */
		vis_write_bmask(0x012389ab, 0);

		/* translate the previous pair while fetching the next */
#pragma pipeloop(0)
		for (n = 0; n <= xsize - 4; n += 2, in += 2) {
			MLIB_SI_S16_U8_4_OFF2_GATHER(prev, px1, px2);
			prev = px2;
			px1 = in[0];
			px2 = in[1];
			(*out++) = vis_bshuffle(front, back);
		}

		/* the final pair fetched still has to be stored */
		MLIB_SI_S16_U8_4_OFF2_GATHER(prev, px1, px2);
		prev = px2;
		(*out++) = vis_bshuffle(front, back);
	}

	tail = (mlib_u8 *)out;

	if ((xsize & 1) != 0) {
		/* odd count: one more pixel yields a 32-bit half word */
		px1 = in[0];
		b7 = VIS_LD_U8_I(lut1, px1);
		b6 = VIS_LD_U8_I(lut0, px1);
		b5 = VIS_LD_U8_I(lut3, prev);
		b4 = VIS_LD_U8_I(lut2, prev);
		front = vis_faligndata(b7, front);
		front = vis_faligndata(b6, front);
		front = vis_faligndata(b5, front);
		front = vis_faligndata(b4, front);
		*(mlib_f32 *)tail = vis_read_hi(front);
		tail += 4;
		prev = px1;
	}

	/* third and fourth channel of the last pixel read */
	tail[0] = lut2[prev];
	tail[1] = lut3[prev];
}
#undef MLIB_SI_S16_U8_4_OFF2_GATHER