示例#1
0
/*
 * Applies MLIB_V_IMAGESQUARE_U16 to a 2-D U16 image, four pixels
 * (one 64-bit word) at a time.
 *
 *   src, dst     first pixel of the source / destination image; both are
 *                accessed through mlib_d64 pointers
 *   slb, dlb     distance in bytes between consecutive rows
 *   xsize, ysize row length in pixels and number of rows; only
 *                xsize / 4 whole groups per row are processed
 */
void
mlib_v_ImageSquare_U16_A8D2X4(
    mlib_u16 *src,
    mlib_s32 slb,
    mlib_u16 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize)
{
/* start of the current source / destination row, as byte addresses */
	mlib_u8 *src_row = (mlib_u8 *)src;
	mlib_u8 *dst_row = (mlib_u8 *)dst;

/* one 64-bit group of source pixels and the matching result */
	mlib_d64 sd, dd;

/* temporaries used in macro */
	mlib_d64 rdh, rdl;

/* not referenced directly below; presumably consumed by the macro */
	mlib_d64 xor_mask = vis_to_double_dup(0x80008000);
	mlib_d64 sat_offset = vis_to_double_dup(0x40004000);

/* number of whole 4-pixel groups in a row */
	mlib_s32 groups = xsize / 4;

/* row and group counters */
	mlib_s32 row, k;

	for (row = 0; row < ysize; row++) {
		mlib_d64 *in = (mlib_d64 *)src_row;
		mlib_d64 *out = (mlib_d64 *)dst_row;

#pragma pipeloop(0)
		for (k = 0; k < groups; k++) {
			sd = in[k];
			MLIB_V_IMAGESQUARE_U16(sd, dd);
			out[k] = dd;
		}

/* step both images to the next row by their byte strides */
		src_row += slb;
		dst_row += dlb;
	}
}
示例#2
0
/*
 * Applies MLIB_V_IMAGESQRSHIFT_U16 to a contiguous run of U16 pixels,
 * four pixels (one 64-bit word) per iteration.
 *
 *   src, dst  source / destination data, accessed through mlib_d64 pointers
 *   dsize     number of pixels; only dsize / 4 whole groups are processed
 *   shift     shift amount; used here only to derive sat_offset
 */
void
mlib_v_ImageSqrShift_U16_A8D1X4(
    mlib_u16 *src,
    mlib_u16 *dst,
    mlib_s32 dsize,
    mlib_s32 shift)
{
/* 64-bit views of the source and destination */
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out = (mlib_d64 *)dst;

/* one 64-bit group of source pixels and the matching result */
	mlib_d64 sd, dd;

/* temporaries used in macro */
	mlib_d64 rdhh, rdhl;
	mlib_d64 rdlh, rdll;
	mlib_d64 rdh, rdl;

/* not referenced directly below; presumably consumed by the macro */
	mlib_d64 mask = vis_to_double_dup(0xfffefffe);
	mlib_f32 fmin = vis_to_float(0x80808080);
	mlib_d64 negate = vis_to_double_dup(0x7FFF7FFF);
	mlib_d64 xor_mask = vis_to_double_dup(0x80008000);
	mlib_d64 sat_offset = vis_to_double_dup(0x20000000 >> (16 - shift));

/* number of whole 4-pixel groups and the group counter */
	mlib_s32 groups = dsize / 4;
	mlib_s32 k;

#pragma pipeloop(0)
	for (k = 0; k < groups; k++) {
		sd = in[k];
		MLIB_V_IMAGESQRSHIFT_U16(sd, dd);
		out[k] = dd;
	}
}
示例#3
0
void ADD_SUFF(IntArgbBmToIntArgbConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dd, dmask, dFF;
    mlib_s32 i, i0, j, x, mask;

    if (dstScan == 4*width && srcScan == 4*width) {
	width *= height;
	height = 1;
    }

    dmask = vis_to_double_dup(0xFFFFFF);
    dFF = vis_to_double_dup(0xFFFFFFFF);

    for (j = 0; j < height; j++) {
	mlib_s32 *src = srcBase;
	mlib_s32 *dst = dstBase;

	i = i0 = 0;

	if ((mlib_s32)dst & 7) {
	    x = src[i];
	    dst[i] = (x << 7) >> 7;
	    i0 = 1;
	}

#pragma pipeloop(0)
	for (i = i0; i <= (mlib_s32)width - 2; i += 2) {
	    mlib_u8 *pp0 = (mlib_u8*)(src + i);
	    mlib_u8 *pp1 = (mlib_u8*)(src + i + 1);
	    dd = vis_freg_pair(*(mlib_f32*)pp0, *(mlib_f32*)pp1);
	    dd = vis_fand(dd, dmask);
#if 1
	    mask = ((*pp0 & 1) << 7) | ((*pp1 & 1) << 3);
	    *(mlib_d64*)(dst + i) = dd;
	    vis_pst_8(dFF, dst + i, mask);
#else
	    mask = ((*pp0 & 1) << 1) | (*pp1 & 1);
	    dd = vis_for(dd, ((mlib_d64*)vis_amask_arr)[mask]);
	    *(mlib_d64*)(dst + i) = dd;
#endif
	}

	if (i < width) {
	    x = src[i];
	    dst[i] = (x << 7) >> 7;
	}
示例#4
0
void mlib_v_ImageClear_BIT_1(mlib_image     *img,
                             const mlib_s32 *color)
{
  mlib_u8 *pimg = (mlib_u8 *) mlib_ImageGetData(img);
  mlib_s32 img_height = mlib_ImageGetHeight(img);
  mlib_s32 img_width = mlib_ImageGetWidth(img);
  mlib_s32 img_stride = mlib_ImageGetStride(img);
  mlib_s32 img_bitoff = mlib_ImageGetBitOffset(img);
  mlib_s32 i, j, b_j, k;
  mlib_u8 bcolor0, bmask, emask, src;
  mlib_d64 dcolor, *dpimg;
  mlib_u32 color0;

  if (img_width == img_stride * 8) {
    img_width *= img_height;
    img_height = 1;
  }

  color0 = ((color[0] & 1) << 31) >> 31;
  bcolor0 = color0 & 0xFF;

  dcolor = vis_to_double_dup(color0);
  for (i = 0, j = 0; i < img_height; i++) {
    mlib_u8 *pimg_row = pimg + i * img_stride, *pimg_row_end;

    if (img_bitoff + img_width <= 8) {
      bmask = (0xFF >> (8 - img_width)) << (8 - img_bitoff - img_width);
      src = pimg_row[0];
      pimg_row[0] = (src & ~bmask) | (color0 & bmask);
      continue;
    }
    else {
/*
 * Copies `size` bytes from src to dst, XOR-ing every 64-bit word with
 * 0x8000 replicated in each 16-bit lane.  The destination is written
 * in 8-byte aligned words; the first and last words are written with
 * partial stores under an edge mask so that bytes outside
 * [dst, dst + size - 1] are not stored.
 */
void
mlib_ImageLineXor8000(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 size)
{
/* 0x8000 in every 16-bit lane */
	mlib_d64 flip = vis_to_double_dup(0x80008000);

/* last destination byte that belongs to the line */
	mlib_u8 *last = (mlib_u8 *)dst + size - 1;

/* destination rounded down to an 8-byte boundary */
	mlib_d64 *out = (mlib_d64 *)((mlib_addr)dst & (~7));

/* byte position of `out` relative to dst (zero or negative) */
	mlib_s32 pos = (mlib_addr)out - (mlib_addr)dst;

/* source word pointer prepared for the same byte offset */
	mlib_d64 *in = (mlib_d64 *)VIS_ALIGNADDR(src, pos);

/* edge mask for the start point */
	mlib_s32 edge = vis_edge8(dst, last);

/* two consecutive source words and the word about to be stored */
	mlib_d64 prev, next, word;

	next = vis_ld_d64_nf(in);

/* leading partial word: store only the bytes selected by the mask */
	if (edge != 0xff) {
		prev = next;
		next = vis_ld_d64_nf(in + 1);
		word = vis_fxor(vis_faligndata(prev, next), flip);
		vis_pst_8(word, out, edge);
		out++;
		in++;
		pos += 8;
	}

/* full words, leaving at least one more full word for the code below */
#pragma pipeloop(0)
	for (; pos <= (size - 16); pos += 8) {
		in++;
		prev = next;
		next = in[0];
		*out = vis_fxor(vis_faligndata(prev, next), flip);
		out++;
	}

/* last full word; the look-ahead word is fetched with a non-faulting load */
	if (pos <= (size - 8)) {
		prev = next;
		next = vis_ld_d64_nf(in + 1);
		*out = vis_fxor(vis_faligndata(prev, next), flip);
		out++;
		in++;
		pos += 8;
	}

/* trailing partial word */
	if (pos < size) {
		word = vis_fxor(vis_faligndata(next, vis_ld_d64_nf(in + 1)),
		    flip);
		edge = vis_edge8(out, last);
		vis_pst_8(word, out, edge);
	}
}
/*
 * Filters one output line into `buff`.
 *
 * The main loop produces two results per iteration through the
 * CALC_2_SRC_PTR / LOAD_3x2 / FILTER_MERGE_4x2 / MAKE_4x2 macros and
 * stores them as one 64-bit word; a scalar loop then handles a
 * remaining single result and stores it as a 16-bit value.
 *
 * NOTE(review): i, size, buff1, res1, dstPixelPtr, sPtr, xFilter,
 * yFilter, fx0, row00, row10, v0..v3, sum and res are not declared in
 * this function; presumably they come from DECLAREVAR / DECLAREVAR2.
 * filterX, filterY, lineAddr and ws are not referenced directly and are
 * presumably read by the macros - confirm against their definitions.
 */
void
mlib_v_ImageAffineTableLine_8nw_3_2_1(
    mlib_d64 *buff,
    const mlib_d64 *filterX,
    const mlib_d64 *filterY,
    const mlib_u8 **lineAddr,
    mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;
	mlib_d64 yFilter2;
	mlib_d64 yFilter3;
	mlib_d64 row20, row30;
	mlib_d64 *dpSrc;
	mlib_d64 data0, data1, zero;

/*
 * Write the whole GSR at once: 0x0145ABEF goes to the upper 32 bits and
 * 4 to the low bits (presumably the bshuffle mask and the align offset -
 * confirm against the GSR layout).
 */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;

	zero = vis_to_double_dup(0);

/* two results per iteration, stored as one 64-bit word through buff1 */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_3x2;
		FILTER_MERGE_4x2;
		MAKE_4x2;
		*buff1 = res1;
		buff1++;
	}

/* continue with 16-bit stores right after the last 64-bit store */
	dstPixelPtr = (mlib_s16 *)buff1;

/* remaining result, if any, computed one at a time */
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, yFilter);
		xFilter = vis_write_hi(xFilter, fx0);
		LOAD_PIXEL_3;

/* weight three rows by successive yFilter coefficients and accumulate */
		v0 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter));
		v1 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter));
		sum = vis_fpadd16(v0, v1);
		v0 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter));
		sum = vis_fpadd16(v0, sum);

/* multiply the row sum by xFilter (sux16 and ulx16 partial products added) */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_hi(v3));
/* add the two 32-bit halves of v2 and keep the result in the low half of res */
		res =
		    vis_write_lo(res, vis_fpadd32s(vis_read_hi(v2),
		    vis_read_lo(v2)));

		vis_st_u16(res, dstPixelPtr++);
	}
}
/*
 * Window/level conversion of S16 data to U8, variant 1.
 *
 * Prepares the multiplier A (an 8-bit coefficient replicated into all
 * four bytes of a 32-bit float register), the GSR scale factor and the
 * dgmin / dwin constants, then runs the PRE_LOOP / MAIN_LOOP / END_LOOP
 * macros with MLIB_CALC1.
 *
 * NOTE(review): a, scale, ia, dgmin and dwin are not declared in this
 * function; presumably they are introduced and initialised by INIT_VARS.
 * dst, src and len are not referenced directly and are presumably used
 * by the loop macros - confirm against the macro definitions.
 */
static void
mlib_VolumeWindowLevel1(
	mlib_u8 *dst,
	const mlib_s16 *src,
	mlib_s32 window,
	mlib_s32 level,
	mlib_s32 gmax,
	mlib_s32 gmin,
	mlib_s32 len)
{
	INIT_VARS;
/* coefficient byte replicated into all four bytes of a 32-bit word */
	mlib_u32 ia_bytes;
	mlib_f32 A;

/* double `a` until it reaches 0.5 or the scale limit of 7 is hit */
	while (2 * a < 1 && scale < 7) {
		a *= 2;
		scale++;
	}

	vis_write_gsr((7 - scale) << 3);

/* coefficient scaled by 256 and rounded, clamped to the U8 range */
	ia = a * 256.0 + 0.5;

	if (ia > MLIB_U8_MAX)
		ia = MLIB_U8_MAX;

/*
 * Build the replicated pattern with unsigned arithmetic (a signed
 * `ia << 24` overflows for ia >= 128) and move the bits into the float
 * register with vis_to_float() instead of reading an mlib_s32 object
 * through an mlib_f32 pointer, which violates the aliasing rules.
 */
	ia_bytes = (mlib_u32)ia;
	ia_bytes = (ia_bytes << 24) | (ia_bytes << 16) | (ia_bytes << 8) |
		ia_bytes;
	A = vis_to_float(ia_bytes);

	dgmin = vis_to_double_dup(((gmin << 16) | gmin) << scale);

/* wide window: drop the window term and fold everything into dgmin */
	if (window >= (1 << 15)) {
		dwin = 0;
		ia = ((gmax + gmin) << scale) * 0.5 - level * a;
		dgmin = vis_to_double_dup((ia << 16) | (ia & 0xFFFF));
	}

	PRE_LOOP(MLIB_CALC1);

#pragma pipeloop(0)
	MAIN_LOOP(MLIB_CALC1);

	END_LOOP(MLIB_CALC1);
}
DEF_FUNC(mlib_ImageBlendColor_U8, mlib_u8,
    mlib_s32)
{
	mlib_f32 fzeros = vis_fzeros();
	mlib_f32 fmax = vis_to_float(0xFFFFFFFF);
	mlib_d64 dmask = vis_to_double_dup(0x00FF00FF);
	mlib_d64 done = vis_to_double_dup(0x01000100);
	mlib_d64 *buffs, *buffd;
	mlib_d64 *sp, *dp;
	mlib_f32 *alp_tbl;
	mlib_d64 ss, s1, rr, tt, d0, d1;
	mlib_d64 cc, c0, c1, c2;
	mlib_d64 amask0, amask1, amask2;
	mlib_s32 ww, dflag, i, j;

	vis_write_gsr(7 << 3);

	width *= channel;
	ww = (width + 7) / 8;

	if (channel == 3) {
		ww = 3 * ((ww + 2) / 3);
	}

	buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww);

	if (buffs == NULL) {
		return (MLIB_FAILURE);
	}

	buffd = buffs + ww;

	if (channel == 4) {
		cc = DOUBLE_4U16(color[0], color[1], color[2], color[3]);
		cc = vis_fand(vis_for(cc,
		    ((mlib_d64 *)mlib_dmask_arr)[8 >> alpha]), dmask);
		alp_tbl = (mlib_f32 *)mlib_alp_tbl + alpha * 256;
	} else if (channel == 3) {
示例#9
0
/*
 * Applies MLIB_V_IMAGESQUARE_U16 to a contiguous run of U16 pixels,
 * four pixels (one 64-bit word) per iteration.
 *
 *   src, dst  source / destination data, accessed through mlib_d64 pointers
 *   dsize     number of pixels; only dsize / 4 whole groups are processed
 */
void
mlib_v_ImageSquare_U16_A8D1X4(
    mlib_u16 *src,
    mlib_u16 *dst,
    mlib_s32 dsize)
{
/* 64-bit views of the source and destination */
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out = (mlib_d64 *)dst;

/* one 64-bit group of source pixels and the matching result */
	mlib_d64 sd, dd;

/* temporaries used in macro */
	mlib_d64 rdh, rdl;

/* not referenced directly below; presumably consumed by the macro */
	mlib_d64 xor_mask = vis_to_double_dup(0x80008000);
	mlib_d64 sat_offset = vis_to_double_dup(0x40004000);

/* number of whole 4-pixel groups and the group counter */
	mlib_s32 groups = dsize / 4;
	mlib_s32 k;

#pragma pipeloop(0)
	for (k = 0; k < groups; k++) {
		sd = in[k];
		MLIB_V_IMAGESQUARE_U16(sd, dd);
		out[k] = dd;
	}
}
示例#10
0
/*
 * Window/level conversion of S16 data to U8, variant 2.
 *
 * Prepares a 16-bit multiplier replicated into every 16-bit lane of A,
 * the GSR scale factor, a byte-shuffle mask and the dgmin constant,
 * then runs the PRE_LOOP / MAIN_LOOP / END_LOOP macros with MLIB_CALC2.
 *
 * NOTE(review): a, scale, ia and dgmin are not declared in this
 * function; presumably they are introduced and initialised by INIT_VARS.
 * dst, src, window, level, gmax and len are not referenced directly and
 * are presumably used by the macros - confirm against their definitions.
 */
static void
mlib_VolumeWindowLevel2(
	mlib_u8 *dst,
	const mlib_s16 *src,
	mlib_s32 window,
	mlib_s32 level,
	mlib_s32 gmax,
	mlib_s32 gmin,
	mlib_s32 len)
{
	INIT_VARS;
	mlib_d64 A;

/* double `a` until it reaches 64 (half of 1 << 7) or scale hits 7 */
	while (2 * a < (1 << 7) && scale < 7) {
		a *= 2;
		scale++;
	}

	vis_write_gsr((7 - scale) << 3);
	WRITE_BMASK(0x13579BDF);

/* coefficient scaled by 256 and rounded, clamped to the S16 maximum */
	ia = a * 256.0 + 0.5;

	if (ia > MLIB_S16_MAX)
		ia = MLIB_S16_MAX;
/* low 16 bits of ia placed in both halves of a word, then duplicated */
	A = vis_to_double_dup((ia << 16) | (ia & 0xFFFF));

	dgmin = vis_to_double_dup(((gmin << 16) | gmin) << scale);

	PRE_LOOP(MLIB_CALC2);

#pragma pipeloop(0)
	MAIN_LOOP(MLIB_CALC2);

	END_LOOP(MLIB_CALC2);
}
示例#11
0
/*
 * Applies MLIB_V_IMAGESQRSMALLSHIFT_U16 to a contiguous run of U16
 * pixels, four pixels (one 64-bit word) per iteration.
 *
 *   src, dst  source / destination data, accessed through mlib_d64 pointers
 *   dsize     number of pixels; only dsize / 4 whole groups are processed
 *   shift     shift amount; used here only to derive dscale and sat_offset
 */
void
mlib_v_ImageSqrSmallShift_U16_A8D1X4(
    mlib_u16 *src,
    mlib_u16 *dst,
    mlib_s32 dsize,
    mlib_s32 shift)
{
/* 64-bit views of the source and destination */
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out = (mlib_d64 *)dst;

/* one 64-bit group of source pixels and the matching result */
	mlib_d64 sd, dd;

/* temporaries used in macro */
	mlib_d64 rdhh, rdhl;
	mlib_d64 rdlh, rdll;
	mlib_d64 rdh, rdl;
	mlib_d64 sdad, rdh_0, rdh_1, rdl_0, rdl_1;

/* not referenced directly below; presumably consumed by the macro */
	mlib_d64 offset = vis_to_double_dup(0x80008000);
	mlib_d64 dscale = (mlib_d64)(0x10000 >> shift);
	mlib_d64 sat_offset =
	    (mlib_d64)(0x40000000) * dscale - ((mlib_d64)0x80000000);

/* number of whole 4-pixel groups and the group counter */
	mlib_s32 groups = dsize / 4;
	mlib_s32 k;

#pragma pipeloop(0)
	for (k = 0; k < groups; k++) {
		sd = in[k];
		MLIB_V_IMAGESQRSMALLSHIFT_U16(sd, dd);
		out[k] = dd;
	}
}
/*
 * Halves a row of U8 samples: each output byte is computed from one
 * pair of neighbouring source bytes.
 *
 *   dst  output samples
 *   src  input samples; read through an mlib_d64 pointer in the main loop
 *   n    number of input samples
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
/* 64-bit reader over the source, 32-bit writer over the destination */
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_f32 *out = (mlib_f32 *)dst;

/* constants for the vector loop */
	mlib_d64 rnd = vis_to_double_dup(0x1);
	mlib_f32 unit = vis_to_float(0x1000000);

/* eight source bytes and their two successive byte merges */
	mlib_d64 block, merged1, merged2;

/* 16-bit expansions of the two halves of merged2 */
	mlib_d64 half_a, half_b;

/* scalar tail: byte writer, source position, alternating bias */
	mlib_u8 *tail;
	mlib_s32 pos, carry;

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(6 << 3);

/* eight source bytes in, four destination bytes out */
#pragma pipeloop(0)
	for (pos = 0; pos <= n - 8; pos += 8) {
		block = in[0];
		in++;

/* two merges regroup the bytes before the two halves are added */
		merged1 = vis_fpmerge(vis_read_hi(block), vis_read_lo(block));
		merged2 =
			vis_fpmerge(vis_read_hi(merged1), vis_read_lo(merged1));

		half_a = vis_fmul8x16au(vis_read_hi(merged2), unit);
		half_b = vis_fmul8x16au(vis_read_lo(merged2), unit);

		out[0] = vis_fpack16(vis_fpadd16(vis_fpadd16(half_a, half_b),
			rnd));
		out++;
	}

/* remaining samples, one pair at a time; the bias toggles 0, 1, 0, ... */
	tail = (mlib_u8 *)out;

	for (carry = 0; pos < n; pos += 2, carry ^= 1) {
		tail[0] = (src[pos] + src[pos + 1] + carry) >> 1;
		tail++;
	}

	return (MLIB_SUCCESS);
}
示例#13
0
void ADD_SUFF(IntRgbxToIntArgbConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dd, mask;
    mlib_s32 i, i0, j;

    if (dstScan == 4*width && srcScan == 4*width) {
	width *= height;
	height = 1;
    }

    mask = vis_to_double_dup(0xFF000000);
    vis_alignaddr(NULL, 7);

    for (j = 0; j < height; j++) {
	mlib_u32 *src = srcBase;
	mlib_u32 *dst = dstBase;

	i = i0 = 0;

	if ((mlib_s32)dst & 7) {
	    dst[i] = 0xff000000 | (src[i] >> 8);
	    i0 = 1;
	}

#pragma pipeloop(0)
	for (i = i0; i <= (mlib_s32)width - 2; i += 2) {
	    dd = vis_freg_pair(((mlib_f32*)src)[i], ((mlib_f32*)src)[i + 1]);
	    dd = vis_faligndata(dd, dd);
	    *(mlib_d64*)(dst + i) = vis_for(dd, mask);
	}

	if (i < width) {
	    dst[i] = 0xff000000 | (src[i] >> 8);
	}
示例#14
0
/*
 * Converts a vector of n S8 elements (x) into S16 elements (z).
 *
 * Vectors shorter than 16 elements are handed to the EXPAND macro
 * (defined elsewhere; presumably it converts and returns - confirm).
 * Otherwise leading elements are converted one by one until the
 * destination is 8-byte aligned, whole groups of eight elements are
 * converted with VIS operations (two 64-bit stores per group), and the
 * remaining (length & 7) elements are converted one by one.
 *
 * Returns MLIB_SUCCESS at the end of the vector path.
 */
mlib_status
__mlib_VectorConvert_S16_S8_Mod(
	mlib_s16 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_s8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 fzero = vis_fzeros();
	mlib_s32 len_64, even_length, rest_64, length = n, off;
	mlib_d64 dd0, dd1, dd2, dd4, dd5, dd6, dd7;

	if (length < 16) {
		EXPAND(mlib_s8, mlib_s16);
	}

/* scalar head: advance until the destination is 8-byte aligned */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

/*
 * From here on src/dst point past the scalar head:
 * len_64 is the number of whole 8-element groups, even_length the
 * number of elements they cover, rest_64 the scalar remainder.
 */
	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	dd2 = ddsrc[0];
	off = (mlib_addr)src & 7;

	if (!off) {

/*
 * Both vectors are 64-bit aligned.
 */

/*
 * Peeling of 1 iteration.
 */

/* intentional assignment: i becomes 1 when a group was peeled, else 0 */
		if (i = (len_64 & 1)) {
			dd1 = (*ddsrc++);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
				fzero), four_16_ones);
		}
/* two 8-element groups (four 64-bit stores) per iteration */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			dd1 = (*ddsrc++);
			dd2 = (*ddsrc++);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd2),
				fzero), four_16_ones);
			(*ddst++) =
				vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd2),
				fzero), four_16_ones);
		}
	} else {

/*
 * Source vector is not 64-bit aligned.
 * Peeling of 1 iteration. Then loop with step==2.
 */

/*
 * The shuffle mask is offset by the source misalignment `off`, so
 * vis_bshuffle picks bytes from two neighbouring aligned words;
 * vis_faligndata with offset 1 then derives the second output word
 * from the same shuffle result.
 */
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1); i++;
			dd4 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
		}
/* i indexes the next aligned source word to load; dd2 carries the previous one */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i <= len_64; i += 2) {
			dd0 = dd2;
			dd1 = vis_ld_d64_nf(ddsrc + i);
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_bshuffle(dd0, dd1);
			dd6 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			dd7 = vis_faligndata(dd6, dd6);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd6, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd7, four_16_ones);
		}
	}

/* scalar tail: elements left after the last whole 8-element group */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = src[even_length + i];

	return (MLIB_SUCCESS);
}
/*
 * YUV 4:4:4 to interleaved RGB conversion for arbitrarily aligned
 * y / u / v / rgb pointers.
 *
 * Each row is converted into an 8-byte aligned temporary buffer and
 * then copied to `rgb` with __mlib_VectorCopy_U8.  The temporary is the
 * on-stack BUFF unless width * 3 exceeds 16 * 1024, in which case it is
 * heap-allocated (and freed before returning).
 *
 *   rgb         destination, width * 3 bytes written per row
 *   y, u, v     source planes, all advanced by yuv_stride per row
 *   width       pixels per row
 *   height      number of rows
 *   rgb_stride  byte distance between destination rows
 *   yuv_stride  byte distance between source rows
 *
 * Returns MLIB_FAILURE if the temporary buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 *
 * The code is software pipelined: the first eight pixels of a row are
 * computed before the column loop, and every loop iteration stores the
 * previously computed red/green/blue and computes the next eight.
 *
 * NOTE(review): BUFF holds 16 * 1024 mlib_d64 elements (128 KB of
 * stack) while the threshold above is compared against 16 * 1024 bytes.
 * NOTE(review): the GSR scale used by vis_fpack16_pair and the variables
 * needed by STORE_PIXEL are not set up in this function; presumably the
 * caller / macro definition takes care of that - confirm.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB444_nonalign(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 rgb_stride,
	mlib_s32 yuv_stride)
{
/* all. pointer to y, u, v */
	mlib_d64 *spy, *dfu, *dfv;

/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv, du0, du1, dv0, dv1;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
	mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2;

/* loop variable */
	mlib_s32 i, j;
	mlib_d64 *buf, BUFF[16 * 1024];
	mlib_u8 *tmp, *dp;

/* wide rows get a heap buffer, rounded up to an 8-byte boundary */
	if (width * 3 > 16 * 1024) {
		tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

		if (tmp == NULL)
			return (MLIB_FAILURE);
		buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
	} else {
		buf = (mlib_d64 *)BUFF;
	}

	dp = (mlib_u8 *)buf;
	ddp = (mlib_d64 *)dp;

	for (j = 0; j < height; j++) {

/* prologue: load and align the first eight U, V and Y samples of the row */
		dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0);
		du0 = (*dfu++);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

		dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0);
		dv0 = (*dfv++);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		spy = (mlib_d64 *)vis_alignaddr((void *)y, 0);
		dy0 = (*spy++);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);

/* vis_alignaddr is re-issued before each faligndata because u, v and y may have different alignments */
		vis_alignaddr((void *)u, 0);
		du1 = vis_ld_d64_nf(dfu); dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

/* Y*1.1644 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

		vis_alignaddr((void *)v, 0);
		dv1 = vis_ld_d64_nf(dfv); dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* Y*1.1644 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		g_hi = vis_fpadd16(g_hi, y_11644_hi);

		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
		g_lo = vis_fpadd16(g_lo, y_11644_lo);

		green = vis_fpack16_pair(g_hi, g_lo);
		b_hi = vis_fpadd16(b_hi, y_11644_hi);
		b_lo = vis_fpadd16(b_lo, y_11644_lo);

		blue = vis_fpack16_pair(b_hi, b_lo);
		r_hi = vis_fpadd16(r_hi, y_11644_hi);
		r_lo = vis_fpadd16(r_lo, y_11644_lo);

		red = vis_fpack16_pair(r_hi, r_lo);

		vis_alignaddr((void *)y, 0);
		dy3 = vis_ld_d64_nf(spy); spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/* steady state: store the eight pixels already in red/green/blue, compute the next eight */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {

/* six byte shuffles combine red, green and blue into three 64-bit words (24 bytes) */
			vis_write_bmask(0x0801902A, 0);
			dd0 = vis_bshuffle(red, green);
			vis_write_bmask(0x03B04C05, 0);
			dd1 = vis_bshuffle(red, green);
			vis_write_bmask(0xD06E07F0, 0);
			dd2 = vis_bshuffle(red, green);
			vis_write_bmask(0x01834967, 0);
			ddp[0] = vis_bshuffle(dd0, blue);
			vis_write_bmask(0xA12B45C7, 0);
			ddp[1] = vis_bshuffle(dd1, blue);
			vis_write_bmask(0x0D23E56F, 0);
			ddp[2] = vis_bshuffle(dd2, blue);

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);
			vis_alignaddr((void *)u, 0);
			du1 = vis_ld_d64_nf(dfu); dfu++;
			du = vis_faligndata(du0, du1);
			du0 = du1;

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
			vis_alignaddr((void *)v, 0);
			dv1 = vis_ld_d64_nf(dfv); dfv++;
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			g_hi = vis_fpadd16(g_hi, y_11644_hi);

			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			g_lo = vis_fpadd16(g_lo, y_11644_lo);

			green = vis_fpack16_pair(g_hi, g_lo);
			b_hi = vis_fpadd16(b_hi, y_11644_hi);
			b_lo = vis_fpadd16(b_lo, y_11644_lo);

			blue = vis_fpack16_pair(b_hi, b_lo);
			r_hi = vis_fpadd16(r_hi, y_11644_hi);
			r_lo = vis_fpadd16(r_lo, y_11644_lo);

			red = vis_fpack16_pair(r_hi, r_lo);

			vis_alignaddr((void *)y, 0);
			dy3 = vis_ld_d64_nf(spy); spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;

			ddp += 3;
		}

		dp = (mlib_u8 *)ddp;

/*
 * Tail (fewer than eight pixels left): rotate the colour words by the
 * number of remaining pixels, then emit pixels from the last one
 * backwards; dp starts at the last pixel and steps down by 3 bytes.
 */
		vis_alignaddr((void *)(width - i), 0);
		blue = vis_faligndata(blue, blue);
		green = vis_faligndata(green, green);
		red = vis_faligndata(red, red);
		dp += ((width - i - 1) * 3);

		vis_alignaddr((void *)spy, 7);
		for (; i < width; i++) {
			STORE_PIXEL(0, 1, 2);
			dp -= 3;
		}

/* copy the finished row out of the aligned temporary buffer */
		__mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3);

/* advance to the next row and rewind the temporary buffer pointers */
		rgb += rgb_stride;
		dp = (mlib_u8 *)buf;
		ddp = (mlib_d64 *)dp;
		y += yuv_stride;
		u += yuv_stride;
		v += yuv_stride;
	}

/* same condition as at allocation time, so tmp is only freed when it was set */
	if (width * 3 > 16 * 1024)
		__mlib_free(tmp);
	return (MLIB_SUCCESS);
}
/*
 * YUV 4:4:4 to interleaved RGB conversion of `size` pixels, reading
 * y / u / v four bytes at a time and writing rgb twelve bytes at a time
 * through mlib_f32 pointers.
 *
 * The input is processed in chunks of at most 2 * BUFF_SIZE pixels.
 * Within a chunk, whole groups of four pixels are stored directly; if
 * destination bytes remain afterwards, one more group is computed into
 * a small temporary and written with partial stores limited by an edge
 * mask, so nothing is stored past rgb + 3 * n - 1.
 *
 * NOTE(review): neither the GSR (scale / align) nor the bshuffle mask
 * is set in this function although vis_fpack16, vis_faligndata and
 * vis_bshuffle are used; presumably the caller sets them - confirm.
 * NOTE(review): buff2 / pbuff_arr2 are advanced but never dereferenced
 * in the visible code.
 * NOTE(review): the loop is do { ... } while (size), so the body runs
 * once even when size is 0 on entry.
 */
void
mlib_v_VideoColorYUV2RGB444_all_align(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;
	mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4];
	mlib_d64 tmp_arr64[2];
/* per-lane 16-bit coefficients (kXY), offsets (c_X) and the Y multiplier (k_0) */
	mlib_d64 k01 = vis_to_double_dup(0x0000f375);
	mlib_d64 k02 = vis_to_double_dup(0x3317e5fa);
	mlib_d64 k11 = vis_to_double_dup(0xf3754097);
	mlib_d64 k12 = vis_to_double_dup(0xe5fa0000);
	mlib_d64 k21 = vis_to_double_dup(0x40970000);
	mlib_d64 k22 = vis_to_double_dup(0x00003317);
	mlib_d64 c_0 = vis_to_double_dup(0xe42010f4);
	mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60);
	mlib_d64 c_2 = vis_to_double_dup(0xdd60e420);
	mlib_d64 k_0 = vis_to_double_dup(0x25432543);

	do {
/* loop on buffer size */

		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

/* m whole groups of four pixels; dend is the last destination byte of this chunk */
		m = n >> 2;
		buff2 = pbuff_arr2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)u;
		sf2 = (mlib_f32 *)v;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;

/* four Y, four U and four V samples */
			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

/* products of each plane with its three coefficient vectors */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

/* accumulate Y term + U term + (V term + constant offset) */
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

/*
 * merge buff values to 3-channel array
 */

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

/* twelve output bytes = three 32-bit stores */
			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_23xx);
			pfd[2] = vis_read_lo(d_0145);

			buff2 += 2;
			pfd += 3;
		}

/* leftover pixels (n not a multiple of 4): compute one more group, store only up to dend */
		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 d_0235, d_xx14, d_23xx, d_0145;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
				s_0;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);

			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);

			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);

			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			d_0235 = vis_fpmerge(vis_fpack16(s00),
				vis_fpack16(s10));
			d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20));

			d_23xx = vis_faligndata(d_0235, d_0235);
			d_0145 = vis_bshuffle(d_0235, d_xx14);

			emask = vis_edge8(pfd, dend);

/*
 * Partial stores work on 8-byte words: when pfd is only 4-byte aligned,
 * step it back one float and shift the staging position by one float so
 * the data lines up with the aligned word.
 */
			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_23xx);
			tmp_arr32[2] = vis_read_lo(d_0145);

			vis_pst_8(tmp_arr64[0], pfd, emask);

/* second 8-byte word, only if it still starts inside the chunk */
			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

/* advance all pointers past the chunk just processed */
		y += n;
		u += n;
		v += n;
		rgb += 3 * n;
		size -= n;
	} while (size);
}
/*
 * Halves a row of U8 samples: each output byte is computed from one
 * pair of neighbouring source bytes.  Sixteen source bytes are turned
 * into eight destination bytes per step.
 *
 *   dst  output samples, written through an mlib_d64 pointer
 *   src  input samples, read through an mlib_d64 pointer
 *   n    number of input samples
 *
 * A final incomplete step is written with a partial store whose edge
 * mask ends at dst + n / 2 - 1.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
/* 64-bit reader / writer over the source and destination */
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out = (mlib_d64 *)dst;

/* constants for the pair sums */
	mlib_d64 rnd = vis_to_double_dup(0x1);
	mlib_f32 unit = vis_to_float(0x1000000);

/* eight source bytes, their shuffled form and its two 16-bit expansions */
	mlib_d64 block, perm, half_a, half_b;

/* pair sums for the first and second source word of a step */
	mlib_d64 first, second;

	mlib_s32 pos, emask;

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(6 << 3);
	vis_write_bmask(0x02461357, 0);

#pragma pipeloop(0)
	for (pos = 0; pos <= n - 16; pos += 16) {
/* first source word of the step */
		block = in[0];
		perm = vis_bshuffle(block, block);
		half_a = vis_fmul8x16au(vis_read_hi(perm), unit);
		half_b = vis_fmul8x16au(vis_read_lo(perm), unit);
		first = vis_fpadd16(vis_fpadd16(half_a, half_b), rnd);

/* second source word of the step */
		block = in[1];
		perm = vis_bshuffle(block, block);
		half_a = vis_fmul8x16au(vis_read_hi(perm), unit);
		half_b = vis_fmul8x16au(vis_read_lo(perm), unit);
		second = vis_fpadd16(vis_fpadd16(half_a, half_b), rnd);

		in += 2;
		out[0] = vis_fpack16_pair(first, second);
		out++;
	}

/* incomplete last step: same arithmetic, masked store */
	if (pos < n) {
		block = in[0];
		perm = vis_bshuffle(block, block);
		half_a = vis_fmul8x16au(vis_read_hi(perm), unit);
		half_b = vis_fmul8x16au(vis_read_lo(perm), unit);
		first = vis_fpadd16(vis_fpadd16(half_a, half_b), rnd);

/* the second word may lie past the input, hence the non-faulting load */
		block = vis_ld_d64_nf(in + 1);
		perm = vis_bshuffle(block, block);
		half_a = vis_fmul8x16au(vis_read_hi(perm), unit);
		half_b = vis_fmul8x16au(vis_read_lo(perm), unit);
		second = vis_fpadd16(vis_fpadd16(half_a, half_b), rnd);

		emask = vis_edge8(out, (dst + (n / 2) - 1));
		vis_pst_8(vis_fpack16_pair(first, second), out, emask);
	}

	return (MLIB_SUCCESS);
}
/*
 * JFIF YCbCr 4:4:4 to interleaved RGB conversion of `size` pixels,
 * reading y / cb / cr four bytes at a time and writing rgb twelve bytes
 * at a time through mlib_f32 pointers.
 *
 * The input is processed in chunks of at most 2 * BUFF_SIZE pixels.
 * Because m = (n - 1) >> 2, the direct-store loop always leaves between
 * one and four pixels for the tail, which is computed into a small
 * temporary and written with partial stores limited by an edge mask so
 * nothing is stored past rgb + 3 * n - 1.
 *
 * Returns MLIB_FAILURE when size <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): the tail loads a full 32-bit word from each plane even
 * when fewer than four pixels remain - confirm the inputs are padded or
 * that such over-reads are acceptable.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
    mlib_u8 *dend;
    mlib_f32 *sf0, *sf1, *sf2, *pfd;
    mlib_f32 fzero = vis_fzeros();
    mlib_s32 i, n, m, emask;
    mlib_d64 tmp_arr64[2];
    /* per-lane 16-bit coefficients (kXY), offsets (c_X) and the Y multiplier (k_0) */
    mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
    mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
    mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
    mlib_d64 k12 = vis_to_double_dup(0xe9260000);
    mlib_d64 k21 = vis_to_double_dup(0x38b40000);
    mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
    mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
    mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
    mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
    mlib_d64 k_0 = vis_to_double_dup(0x20002000);

    if (size <= 0)
        return (MLIB_FAILURE);

    /* GSR value and byte-shuffle mask used by the pack / shuffle operations below */
    vis_write_gsr((2 << 3) + 2);
    vis_write_bmask(0x0489AB37, 0);

    do {
        /* loop on buffer size */

        if (size > 2 * BUFF_SIZE) {
            n = 2 * BUFF_SIZE;
        } else {
            n = size;
        }

        /* groups stored directly; dend is the last destination byte of this chunk */
        m = (n - 1) >> 2;
        sf0 = (mlib_f32 *)y;
        sf1 = (mlib_f32 *)cb;
        sf2 = (mlib_f32 *)cr;
        dend = rgb + 3 * n - 1;
        pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < m; i++) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_0145;
            mlib_f32 x0, x1, x2;

            /* four Y, four Cb and four Cr samples */
            x0 = (*sf0++);
            x1 = (*sf1++);
            x2 = (*sf2++);

            /* products of each plane with its three coefficient vectors */
            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            /* accumulate Y term + Cb term + (Cr term + constant offset) */
            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            d_0235 = vis_fpack16_pair(s00, s10);
            s20 = vis_freg_pair(vis_fpack16(s20), fzero);

            /* rearrange the packed bytes into the three 32-bit output words */
            d_0145 = vis_bshuffle(d_0235, s20);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            /* twelve output bytes = three 32-bit stores */
            pfd[0] = vis_read_hi(d_0145);
            pfd[1] = vis_read_hi(d_0235);
            pfd[2] = vis_read_lo(d_0145);

            pfd += 3;
        }

        /*
         * last pixels
         */

        if ((mlib_u8 *)pfd <= dend) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
                     s_0;
            mlib_d64 d_0235, d_xx14, d_0145;
            mlib_f32 x0, x1, x2;
            mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

            x0 = *sf0;
            x1 = *sf1;
            x2 = *sf2;

            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);

            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);

            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);

            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);

            d_0235 = vis_fpack16_pair(s00, s10);
            d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero);

            d_0145 = vis_bshuffle(d_0235, d_xx14);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235),
                                 vis_read_lo(d_0235));

            emask = vis_edge8(pfd, dend);

            /*
             * Partial stores work on 8-byte words: when pfd is only
             * 4-byte aligned, step it back one float and shift the
             * staging position by one float so the data lines up with
             * the aligned word.
             */
            if ((mlib_addr)pfd & 7) {
                pfd--;
                tmp_arr32++;
            }

            tmp_arr32[0] = vis_read_hi(d_0145);
            tmp_arr32[1] = vis_read_hi(d_0235);
            tmp_arr32[2] = vis_read_lo(d_0145);

            vis_pst_8(tmp_arr64[0], pfd, emask);

            /* second 8-byte word, only if it still starts inside the chunk */
            pfd += 2;
            emask = vis_edge8(pfd, dend);

            if ((mlib_u8 *)pfd <= dend)
                vis_pst_8(tmp_arr64[1], pfd, emask);
        }

        /* advance all pointers past the chunk just processed */
        y += n;
        cb += n;
        cr += n;
        rgb += 3 * n;
        size -= n;

    } while (size);

    return (MLIB_SUCCESS);
}
/*
 * Combines two 16-bit images element by element into dst using the
 * MLIB_V_IMAGEMULSHIFTONE_U16 macro (defined outside this block;
 * presumably a U16 multiply followed by a right shift -- confirm against
 * the macro definition).
 *
 *   sp1, sp2           source image data
 *   stride1, stride2   source line strides, in 16-bit elements
 *   dp                 destination image data
 *   strided            destination line stride, in 16-bit elements
 *   width              number of 16-bit elements per line
 *   height             number of lines
 *   shift              shift amount; feeds const_offset and the GSR scale
 *
 * One of five code paths is chosen from the relative 8-byte alignment of
 * the three buffers, so that as few vis_faligndata realignments as
 * possible are needed per 4-element word.  Every path writes the leading
 * and trailing partial words with edge-masked partial stores.
 */
void
mlib_v_ImageMulSmallShift_U16(
    mlib_s16 *sp1,
    mlib_s32 stride1,
    mlib_s16 *sp2,
    mlib_s32 stride2,
    mlib_s16 *dp,
    mlib_s32 strided,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 shift)
{
/* pointer to the current line of source1 */
	mlib_s16 *sl1;

/* pointer to the current line of source2 */
	mlib_s16 *sl2;

/* pointer to the current line of dst */
	mlib_s16 *dl;
/* byte offsets of the buffers inside an 8-byte word, and the edge mask */
	mlib_s32 offdst, offsrc1, offsrc2, emask;
/* 8-byte aligned pointers used for the vectorized accesses */
	mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
	mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
/* address of the last element of the current dst line */
	mlib_s16 *dend;
/*
 * The locals below are not referenced directly in this function; they are
 * presumably temporaries used by name inside MLIB_V_IMAGEMULSHIFTONE_U16
 * -- confirm before removing any of them.
 */
	mlib_d64 rdhh, rdhl;
	mlib_d64 rdlh, rdll;
	mlib_d64 rdh, rdl;
	mlib_s32 i, j, k;

	mlib_d64 sd1, sd2, sd1ad, sd2ad, rdh_0, rdh_1, rdl_0, rdl_1;
	mlib_d64 offset = vis_to_double_dup(0x80008000);
	mlib_d64 half_offset = vis_to_double(0x40004000, 0x80008000);
	mlib_d64 const_offset = vis_to_double_dup(0x20000000);

/* const_offset = 0x20000000 - (0x40000000 >> (16 - shift)), per 32-bit lane */
	const_offset =
	    vis_fpsub32(const_offset,
	    vis_to_double_dup(0x40000000 >> (16 - shift)));

/* lines are contiguous in all three images: process as one long line */
	if (width == stride1 && width == stride2 && width == strided) {
		width *= height;
		height = 1;
	}

/* initialize GSR scale factor */
	vis_write_gsr(((16 - (shift - 1)) & 0x1f) << 3);

	sl1 = sp1;
	sl2 = sp2;
	dl = dp;

	offdst = ((mlib_addr)dp) & 7;
	offsrc1 = ((mlib_addr)sp1) & 7;
	offsrc2 = ((mlib_addr)sp2) & 7;

/*
 * Case 1: dst, src1 and src2 share the same offset inside an 8-byte word
 * and their strides agree modulo 4 elements, so this holds on every line.
 * No data realignment is needed.
 */
	if ((offdst == offsrc1) && (offdst == offsrc2) &&
	    (((strided ^ stride1) & 3) == 0) &&
	    (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/* i <= 0: element offset of the aligned dst word relative to dp */
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* leading partial word */
			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}
/* full 4-element words */
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				(*dpp++) = dd;
			}

/* trailing partial word */
			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * Case 2: dst and src1 are co-aligned; only src2 is realigned with
 * vis_faligndata.
 */
	} else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses; src2 is addressed 2 * i bytes from sp2 */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* sd20 always holds the previously loaded aligned word of src2 */
			sd20 = spp2[0];

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd21 = spp2[1];
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd20 = sd21;
				spp2++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*spp1++);
				sd21 = spp2[1];
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = vis_faligndata(sd20, spp2[1]);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * Case 3: dst and src2 are co-aligned; only src1 is realigned with
 * vis_faligndata (mirror image of case 2).
 */
	} else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses; src1 is addressed 2 * i bytes from sp1 */
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* sd10 always holds the previously loaded aligned word of src1 */
			sd10 = spp1[0];

			if (emask != 0xf) {
				sd20 = (*spp2++);
				sd11 = spp1[1];
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				sd10 = sd11;
				spp1++;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd20 = (*spp2++);
				sd11 = spp1[1];
				sd10 = vis_faligndata(sd10, sd11);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd10 = sd11;
				spp1++;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd20 = (*spp2++);
				sd10 = vis_faligndata(sd10, spp1[1]);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * Case 4: the two sources are co-aligned with each other but not with dst.
 * The macro is applied to aligned source words and the results (dd0, dd1)
 * are realigned to the destination with vis_faligndata.
 */
	} else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {
		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* dd0 holds the result for the previous aligned source word */
			sd10 = (*spp1++);
			sd20 = (*spp2++);
			MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd0);

			if (emask != 0xf) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp++, emask);
				dd0 = dd1;
				i += 4;
			}
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd1);
				(*dpp++) = vis_faligndata(dd0, dd1);
				dd0 = dd1;
			}

			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd10 = (*spp1++);
				sd20 = (*spp2++);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd1);
				dd = vis_faligndata(dd0, dd1);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
/*
 * Case 5: general case, no two buffers are known to be co-aligned.
 * Only one alignment offset is active at a time, so src1 is first
 * realigned into the destination line (used as scratch), then read back
 * and combined with the realigned src2.
 */
	} else {

		for (j = 0; j < height; j++) {

/* prepare the destination addresses */
			dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
			i = (mlib_s16 *)dpp - dp;

			dend = dp + width - 1;
/* generate edge mask for the start point */
			emask = vis_edge16(dp, dend);

/* leading partial word: realign both sources individually */
			if (emask != 0xf) {
				spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
				sd10 = vis_faligndata(spp1[0], spp1[1]);
				spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
				sd20 = vis_faligndata(spp2[0], spp2[1]);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				vis_pst_16(dd, dpp++, emask);
				i += 4;
			}

/* copy realigned src1 into dst (full words only) */
			spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
			sd11 = spp1[0];
			tmp_ptr = dpp;

#pragma pipeloop(0)
			for (k = i; k <= (width - 4); k += 4) {
				sd10 = sd11;
				sd11 = spp1[1];
				(*tmp_ptr++) = vis_faligndata(sd10, sd11);
				spp1++;
			}

/*
 * Keep the realigned src1 word for the trailing partial store; it is
 * computed here, before the next vis_alignaddr call is issued for src2.
 */
			sd11 = vis_faligndata(sd11, spp1[1]);

			spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
			sd20 = spp2[0];
			tmp_ptr = dpp;

/* second pass: read src1 back from dst and combine with realigned src2 */
#pragma pipeloop(0)
			for (; i <= width - 4; i += 4) {
				sd10 = (*tmp_ptr++);
				sd21 = spp2[1];
				sd20 = vis_faligndata(sd20, sd21);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
				(*dpp++) = dd;
				sd20 = sd21;
				spp2++;
			}

/* trailing partial word uses the src1 word saved in sd11 */
			if (i < width) {
				emask = vis_edge16(dpp, dend);
				sd20 = vis_faligndata(sd20, spp2[1]);
				MLIB_V_IMAGEMULSHIFTONE_U16(sd11, sd20, dd);
				vis_pst_16(dd, dpp, emask);
			}

			sp1 = sl1 += stride1;
			sp2 = sl2 += stride2;
			dp = dl += strided;
		}
	}
}
示例#20
0
/*
 * Writes to zz the n complex elements of xx in reverse element order,
 * with each imaginary part negated.  An imaginary part equal to
 * MLIB_S8_MIN is stored as MLIB_S8_MAX.
 *
 *   zz   destination, 2 * n signed bytes (re, im pairs)
 *   xx   source, 2 * n signed bytes (re, im pairs)
 *   n    number of complex elements
 *
 * Returns MLIB_SUCCESS from the paths visible here.  CHECK and CONJREVC
 * are macros defined outside this block; presumably CHECK validates the
 * arguments and CONJREVC is a complete scalar implementation that returns
 * -- confirm against their definitions.
 */
mlib_status
__mlib_VectorConjRev_S8C_S8C_Sat(
	mlib_s8 *zz,
	const mlib_s8 *xx,
	mlib_s32 n)
{
	const mlib_s8 *x = xx;
	mlib_s8 *z = zz;
/* src walks forward from the start of x, dst walks backward from the end of z */
	mlib_s8 *src = (mlib_s8 *)x, *dst = z + 2 * (n);
	mlib_d64 *dsrc, *ddst;
/*
 * d3 is the input and d4 the output of CONJ8 as used below; dl, dh and the
 * constants that follow are not referenced directly here and are presumably
 * used by name inside the macro -- confirm before removing.
 */
	mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
	mlib_d64 dcntr0 = vis_to_double_dup(0x00800080);
	mlib_d64 dxor0 = vis_to_double_dup(0x007f007f);
	mlib_d64 done = vis_to_double_dup(1);
	mlib_s8 c;
/* length counts the scalar values (2 per complex element) still to process */
	mlib_s32 i, rest_64, len_64, even_length, odd = 0, length =
		(mlib_s32)n * 2;
	mlib_s32 re_part;
	mlib_f32 f_null = vis_to_float(0);

	CHECK(x, z);

/* short vectors: scalar path */
	if (n < 8) {
		CONJREVC(mlib_s8,
			MLIB_S8_MAX,
			MLIB_S8_MIN);
	}

/*
 * Scalar prologue: emit elements one value at a time until dst is 8-byte
 * aligned.  If alignment is reached right after an imaginary part, the
 * matching real part is kept in re_part and odd is set.
 */
	while (((mlib_addr)dst) & 7) {

		if ((c = src[1]) == MLIB_S8_MIN)
			*--dst = MLIB_S8_MAX;
		else
			*--dst = -c;
		length -= 2;
		src += 2;

		if (((mlib_addr)dst) & 7) {
			*--dst = src[-2];
		} else {
			re_part = src[-2];
			odd = 1;
			break;
		}
	}

	vis_write_gsr(7 << 3);
	ddst = (mlib_d64 *)dst;
/* number of values left over after the 8-byte loop / number of 8-byte words */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;

	if (!odd) {

/*
 * Aligning loop finished with imaginary part. The following processing
 * starts with real part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

			dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ8;
				*--ddst = d4;
			}
		} else {

/*
 * Src address is not 8-byte aligned: realign each word with
 * vis_faligndata before processing.
 */
			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ8;
				*--ddst = d4;
			}
		}
	} else {

/*
 * Aligning loop finished with real part. The following processing
 * starts with imaginary part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.  The alignment offset is set to one
 * byte so that vis_faligndata shifts every result word by one byte and
 * chains in data from the previously produced word; the chain starts
 * with the pending real part.
 */

			dsrc = (mlib_d64 *)vis_alignaddr(src, 1);
			d_rest = vis_to_double((re_part << 24), 0);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ8;
				*--ddst = vis_faligndata(d4, d_rest);
				d_rest = d4;
			}

/* one byte is still pending in d_rest: store it with a one-byte mask */
			ddst--;
			d_rest = vis_faligndata(d_rest, d_rest);
			vis_pst_8(d_rest, ddst, 0x1);
		} else {

/*
 * Src address is not 8-byte aligned.  First produce the words as in the
 * even case, then shift the written destination region by one byte in a
 * second pass and finally store the pending real part.
 */
			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ8;
				*--ddst = d4;
			}

/* GSR is rewritten with the value 1 for the shifting pass */
			vis_write_gsr(1);
			d2 = *ddst;
			d3 = vis_faligndata(d1, d2);
			vis_pst_8(d3, (ddst - 1), 0x1);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = *(ddst + 1);
				(*ddst++) = vis_faligndata(d1, d2);
			}

			dst[-1] = re_part;
		}

/* account for the one-byte shift so the tail indexing below lines up */
		dst--;
	}

	if (!rest_64)
		return (MLIB_SUCCESS);

/* scalar epilogue for the values that did not fill an 8-byte word */
	for (i = 0; i < rest_64; i += 2) {
		dst[-even_length - 2 - i] = src[even_length + i];

		if ((c = src[even_length + i + 1]) == MLIB_S8_MIN)
			dst[-even_length - 2 - i + 1] = MLIB_S8_MAX;
		else
			dst[-even_length - 2 - i + 1] = -c;
	}

	return (MLIB_SUCCESS);
}
示例#21
0
/*
 * Writes to zz the n complex elements of xx in reverse element order,
 * with each imaginary part negated.  An imaginary part equal to
 * MLIB_S16_MIN is stored as MLIB_S16_MAX.
 *
 *   zz   destination, 2 * n signed 16-bit values (re, im pairs)
 *   xx   source, 2 * n signed 16-bit values (re, im pairs)
 *   n    number of complex elements
 *
 * Returns MLIB_SUCCESS from the paths visible here.  CHECK and CONJREVC
 * are macros defined outside this block; presumably CHECK validates the
 * arguments and CONJREVC is a complete scalar implementation that returns
 * -- confirm against their definitions.
 */
mlib_status
__mlib_VectorConjRev_S16C_S16C_Sat(
	mlib_s16 *zz,
	const mlib_s16 *xx,
	mlib_s32 n)
{
	mlib_s16 *x = (mlib_s16 *)xx, *z = (mlib_s16 *)zz;
/* src walks forward from the start of x, dst walks backward from the end of z */
	mlib_s16 *src = (mlib_s16 *)x, *dst = (mlib_s16 *)&z[2 * n];
	mlib_d64 *dsrc, *ddst;
/*
 * d3 is the input and d4 the output of CONJ16 as used below; dl, dh and the
 * constants that follow are not referenced directly here and are presumably
 * used by name inside the macro -- confirm before removing.
 */
	mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
	mlib_d64 dlog0 = vis_to_double_dup(0x0000ffff), dtwo =
		vis_to_double(0, 2);
	mlib_f32 f_two = vis_to_float(0x20002);
	mlib_s16 c;
/* length counts the scalar values (2 per complex element) still to process */
	mlib_s32 i, rest_64, len_64, even_length, odd = 0, length =
		(mlib_s32)n * 2;
	mlib_s32 re_part;

	CHECK(x, z);

/* short vectors: scalar path */
	if ((n < 16)) {
		CONJREVC(mlib_s16,
			MLIB_S16_MAX,
			MLIB_S16_MIN);
	}

/*
 * Scalar prologue: emit elements one value at a time until dst is 8-byte
 * aligned.  If alignment is reached right after an imaginary part, the
 * matching real part is kept in re_part and odd is set.
 */
	while (((mlib_addr)dst) & 7) {

		if ((c = src[1]) == MLIB_S16_MIN)
			*--dst = MLIB_S16_MAX;
		else
			*--dst = -c;
		length -= 2;
		src += 2;

		if (((mlib_addr)dst) & 7) {
			*--dst = src[-2];
		} else {
			re_part = src[-2];
			odd = 1;
			break;
		}
	}

	vis_write_gsr(15 << 3);
	ddst = (mlib_d64 *)dst;
/* number of values left over after the 8-byte loop / number of 8-byte words */
	rest_64 = length & 3;
	len_64 = length >> 2;
	even_length = len_64 << 2;

	if (!odd) {

/*
 * Aligning loop finished with imaginary part. The following processing
 * starts with real part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

			dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ16;
				*--ddst = d4;
			}
		} else {

/*
 * Src address is not 8-byte aligned: realign each word with
 * vis_faligndata before processing.
 */
			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ16;
				*--ddst = d4;
			}
		}
	} else {

/*
 * Aligning loop finished with real part. The following processing
 * starts with imaginary part.
 */

		if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.  The alignment offset is set to two
 * bytes so that vis_faligndata shifts every result word by one 16-bit
 * value and chains in data from the previously produced word; the chain
 * starts with the pending real part.
 */

			dsrc = (mlib_d64 *)vis_alignaddr(src, 2);
			d_rest = vis_to_double((re_part << 16), 0);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d3 = (*dsrc++);
				CONJ16;
				*--ddst = vis_faligndata(d4, d_rest);
				d_rest = d4;
			}

/* one value is still pending in d_rest: store it with a one-element mask */
			ddst--;
			d_rest = vis_faligndata(d_rest, d_rest);
			vis_pst_16(d_rest, ddst, 0x1);
		} else {

/*
 * Src address is not 8-byte aligned.  First produce the words as in the
 * even case, then shift the written destination region by one 16-bit
 * value in a second pass and finally store the pending real part.
 */
			dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
			d2 = (*dsrc++);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = (*dsrc++);
				d3 = vis_faligndata(d1, d2);
				CONJ16;
				*--ddst = d4;
			}

/* GSR is rewritten with the value 2 for the shifting pass */
			vis_write_gsr(2);
			d2 = *ddst;
			d3 = vis_faligndata(d1, d2);
			vis_pst_16(d3, (ddst - 1), 0x1);

#pragma pipeloop(0)
			for (i = 0; i < len_64; i++) {
				d1 = d2;
				d2 = *(ddst + 1);
				(*ddst++) = vis_faligndata(d1, d2);
			}

			dst[-1] = re_part;
		}

/* account for the one-element shift so the tail indexing below lines up */
		dst--;
	}

	if (!rest_64)
		return (MLIB_SUCCESS);

/* scalar epilogue for the values that did not fill an 8-byte word */
	for (i = 0; i < rest_64; i += 2) {
		dst[-even_length - 2 - i] = src[even_length + i];

		if ((c = src[even_length + i + 1]) == MLIB_S16_MIN)
			dst[-even_length - 2 - i + 1] = MLIB_S16_MAX;
		else
			dst[-even_length - 2 - i + 1] = -c;
	}

	return (MLIB_SUCCESS);
}
/*
 * M x N kernel pass over a single-channel MLIB_BYTE source image, with the
 * per-line output produced by a colormap-aware routine taken from
 * func_dm_arr (presumably an error-diffusion / dithering step -- confirm
 * against the func_dm_arr and func_m_arr definitions, which are outside
 * this block).
 *
 *   dst       destination image
 *   src       source image; must be MLIB_BYTE with one channel
 *   kernel    m * n integer kernel coefficients
 *   m, n      kernel width and height
 *   dm, dn    kernel key position
 *   scale     the coefficients are multiplied by 2 ** (-scale)
 *   colormap  colormap passed through to the per-line routine
 *
 * Returns MLIB_FAILURE for an unsupported source or on allocation
 * failure, MLIB_OUTOFRANGE when the scaled kernel does not fit the 16-bit
 * fixed-point representation, and MLIB_SUCCESS otherwise.
 *
 * All exits taken after the kernel buffer has been allocated release it;
 * the MLIB_OUTOFRANGE exit previously leaked it for kernels larger than
 * MAX_N x MAX_M.
 */
mlib_status FUNC(
    MxN) (
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 m,
    mlib_s32 n,
    mlib_s32 dm,
    mlib_s32 dn,
    mlib_s32 scale,
    const void *colormap)
{
	mlib_type stype, dtype;
	mlib_u8 *sl, *dl;
	mlib_u8 *lut_table;
	mlib_s32 offset, off, kw, dn1;
	mlib_s32 schan, dchan, sll, dll, sw, sh, dw, dh;
	mlib_s32 row, i, j, bsize, buff_ind = 0, func_ind, method;
	mlib_u16 *pbuff, *buff_lcl[2 * MAX_N], **buff_arr = buff_lcl, **buff;
	mlib_d64 *buffd;
	mlib_d64 kern_lcl[MAX_N * MAX_M], *kern = kern_lcl, *pkern;
	mlib_d64 dscale;
	func_dm_type func_dm;

	mlib_s32 vis_scale, kern_i;
	mlib_s32 kern_size, isum;
	mlib_d64 sum, norm;
	mlib_f32 fscale;
	mlib_s32 bit_offset;
	mlib_u8 *buff_dst;

	MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl);
	MLIB_IMAGE_GET_ALL_PARAMS(src, stype, schan, sw, sh, sll, sl);
	bit_offset = mlib_ImageGetBitOffset(dst);

/* only single-channel byte sources are supported */
	if (!(stype == MLIB_BYTE && schan == 1)) {
		return (MLIB_FAILURE);
	}
#if 0
	for (i = 0; i <= m * dn + dm; i++) {
		if (kernel[i])
			return (MLIB_FAILURE);
	}

#endif /* 0 */

/* the kernel is used reversed, so the key position is mirrored as well */
	dn = n - 1 - dn;
	dm = m - 1 - dm;
/* number of coefficients that are loaded and converted below */
	kern_size = m * dn + dm;

/*
 * Kernels larger than the local arrays get one heap block that holds both
 * the coefficients and the line-buffer pointer table.
 */
	if (n > MAX_N || m > MAX_M) {
		kern =
		    __mlib_malloc(n * m * sizeof (mlib_d64) +
		    2 * n * sizeof (mlib_u16 *));

		if (kern == NULL)
			return (MLIB_FAILURE);
		buff_arr = (mlib_u16 **)(kern + n * m);
	}

/* dscale = 2 ** (-scale), built in steps of at most 30 bits */
	dscale = 1.0;
	while (scale > 30) {
		dscale *= 1.0 / (1 << 30);
		scale -= 30;
	}

	dscale /= (1 << scale);

/* load kernel (reversed: kernel[-i] counts back from the last element) */
	kernel += m * n - 1;
	sum = 0;
	for (i = 0; i < kern_size; i++) {
		kern[i] = dscale * kernel[-i];
		sum += mlib_fabs(kern[i]);
	}

	vis_scale = mlib_ilogb(sum);

/* release the heap kernel before bailing out */
	if (vis_scale > 13) {
		if (kern != kern_lcl)
			__mlib_free(kern);
		return (MLIB_OUTOFRANGE);
	}
	vis_scale = 14 - vis_scale;

	if (vis_scale > 15)
		vis_scale = 15;
	norm = 32768 >> (15 - vis_scale);
/* convert to rounded 16-bit fixed point, duplicated into every 16-bit lane */
	isum = 0;
	for (i = 0; i < kern_size; i++) {
		if (kern[i] > 0.0) {
			kern_i = (mlib_s32)(kern[i] * norm + 0.5);
		} else {
			kern_i = (mlib_s32)(kern[i] * norm - 0.5);
		}

		isum += abs(kern_i);
		kern[i] = vis_to_double_dup((kern_i << 16) | (kern_i & 0xffff));
	}

/* recalc without rounding */
	if (isum > 32767) {
		dscale *= norm;
		for (i = 0; i < kern_size; i++) {
			kern_i = (mlib_s32)(dscale * kernel[-i]);
			kern[i] =
			    vis_to_double_dup((kern_i << 16) | (kern_i &
			    0xffff));
		}
	}

	fscale = vis_to_float(1 << (vis_scale - 1));
	vis_write_gsr(((16 - vis_scale) << 3) + 2);

	offset = mlib_ImageGetLutOffset(colormap);
	lut_table = (mlib_u8 *)mlib_ImageGetLutInversTable(colormap);

/* line buffer size in 16-bit elements, rounded up to a multiple of 8 */
	bsize = (sw + m) * NCHAN;
	bsize = (bsize + 7) & ~7;
	dn1 = (dn) ? dn : 1;
/* dn1 accumulation lines plus one converted source line, plus extra space */
	pbuff =
	    __mlib_malloc((dn1 + 1) * bsize * sizeof (mlib_u16) + EXTRA_BUFF);

	if (pbuff == NULL) {
		if (kern != kern_lcl)
			__mlib_free(kern);
		return (MLIB_FAILURE);
	}

/*
 * The pointer table holds the dn1 line buffers twice in a row, so that
 * buff_arr + buff_ind always exposes dn1 consecutive entries.
 */
	for (j = 0; j < dn1; j++) {
		buff_arr[dn1 + j] = buff_arr[j] = pbuff + j * bsize;
	}

	buff_ind = 0;
	buffd = (mlib_d64 *)(pbuff + dn1 * bsize);
	buff_dst = (mlib_u8 *)((mlib_u16 *)buffd + bsize);

/* clear buffer */
	for (i = 0; i < dn * (bsize / 4); i++) {
		((mlib_d64 *)pbuff)[i] = 0;
	}

/* select the per-line routine from dm (clamped) and the colormap method */
	func_ind = dm;

	if (func_ind > KH_MAX)
		func_ind = KH_MAX;
	method = mlib_ImageGetMethod(colormap);

	if (method == LUT_COLOR_CUBE_SEARCH)
		func_ind += KH_MAX + 1;
	else if (method == LUT_COLOR_DIMENSIONS)
		func_ind += 2 * (KH_MAX + 1);
	func_dm = func_dm_arr[func_ind];

	for (row = 0; row < sh; row++) {
		mlib_u8 *sp = sl;

		buff = buff_arr + buff_ind;

/* convert source line */
		for (i = 0; i < sw; i++) {
			mlib_d64 ss;

			ss = LD_U8(sp, i);
			ss = vis_fmul8x16al(vis_read_lo(ss), fscale);
			ST_U16(buffd, i, ss);
		}

/* full kernel rows, each handled in column groups of at most KW_MAX */
		pkern = kern;
		for (j = 0; j < dn; j++) {
			for (off = 0; off < m; off += kw) {
				kw = m - off;

				if (kw > KW_MAX) {
					if (kw > 2 * KW_MAX)
						kw = KW_MAX;
					else
						kw = kw / 2;
				}

				func_m_arr[kw] (buffd, buff[j] + off * NCHAN,
				    pkern + off, sw);
			}

			pkern += m;
		}

/* last (partial) kernel row and output of the current line */
#ifdef USE_COLOR2INDEXLINE
		func_dm(buff_dst, (void *)buffd, buff[dn] + dm * NCHAN, pkern,
		    colormap, lut_table, sw, dm, 0);
/*
 * mlib_ImageColorTrue2IndexLine_U8_BIT_1
 * (buff_dst, dl, bit_offset, sw, colormap);
 */
#else /* USE_COLOR2INDEXLINE */
		func_dm(dl, (void *)buffd, buff[dn] + dm * NCHAN, pkern,
		    colormap, lut_table, sw, dm, bit_offset);
#endif /* USE_COLOR2INDEXLINE */

/* rotate the ring of line buffers */
		buff_ind++;

		if (buff_ind >= dn1)
			buff_ind -= dn1;

		sl += sll;
		dl += dll;
	}

	__mlib_free(pbuff);

	if (kern != kern_lcl)
		__mlib_free(kern);

	return (MLIB_SUCCESS);
}
/*
 * Converts n ARGB pixels into separate Y, Cb and Cr byte arrays with the
 * chroma arrays holding n / 2 values each (4:2:2).
 *
 *   y         luma output, n bytes
 *   cb, cr    chroma outputs, n >> 1 bytes each
 *   argb      source pixels, 4 bytes per pixel
 *   n         number of pixels
 *
 * Returns MLIB_FAILURE when n <= 0 and MLIB_SUCCESS otherwise.
 *
 * The pointers are accessed as 8-byte (argb, y) and 4-byte (cb, cr)
 * words, so they are presumably required to be suitably aligned --
 * confirm against the caller contract.  CHANNELSEPARATE_U8_422 and
 * CHANNELWEIGHT_U8_2p are macros defined outside this block.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
/* one-past-the-end addresses, used to build the edge masks of the tail */
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;

/*
 * Fixed-point conversion coefficients: the luma row is scaled by 8192,
 * the two chroma rows by 4096 (the chroma results of two pixels are
 * added together below).
 */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(2 << 3);

/* from here on n counts groups of 8 pixels */
	n = n >> 3;

/*
 * Main loop: each iteration reads 32 source bytes (8 pixels) and writes
 * 8 Y bytes, 4 Cb bytes and 4 Cr bytes.
 */
#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
/* luma: two partial results are packed and interleaved with vis_fpmerge */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

/* Cb: the two partial results are summed before packing */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

/* Cr: same scheme as Cb with the third coefficient row */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

		py++;
		pcb++;
		pcr++;
	}

/*
 * Tail: fewer than 8 pixels remain.  A full group is computed and only
 * the valid bytes are written with edge-masked partial stores.
 */
	if ((mlib_u8 *)pcb < cbend) {
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

/*
 * Only the first word is read with a plain load; the rest go through
 * vis_ld_d64_nf (presumably a non-faulting load, since they may lie past
 * the end of the source -- confirm).
 */
		sd01 = (*sp++);
		sd23 = vis_ld_d64_nf(sp); sp++;
		sd45 = vis_ld_d64_nf(sp); sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0,
			dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
			vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
			vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);
		cmask = vis_edge8(pcb, cbend - 1);

/*
 * The 4 chroma bytes are placed in the half of the 8-byte store word
 * selected by the edge mask: upper half stored at pcb/pcr, otherwise
 * lower half stored relative to pcb - 1 / pcr - 1.
 */
		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1,
				cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1,
				cmask);
		}
	}
	return (MLIB_SUCCESS);
}
/*
 * Converts n interleaved CMYK pixels into four separate byte planes
 * Y, Cb, Cr and K.
 *
 *   y, cb, cr, k   output planes, n bytes each
 *   cmyk           source pixels, 4 bytes per pixel
 *   n              number of pixels
 *
 * Returns MLIB_SUCCESS.
 *
 * The input is processed in chunks of at most SIZE pixels: each chunk is
 * split into channels by mlib_channel_separate into a local buffer and
 * then weighted with CHANNELWEIGHT_U8 (both defined outside this block).
 * The last n & 7 pixels are written with a byte-masked partial store.
 * The pointers are accessed as 4- and 8-byte words, so they are
 * presumably required to be suitably aligned -- confirm against the
 * caller contract.
 */
mlib_status
__mlib_VideoColorCMYK2JFIFYCCK444(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	mlib_u8 *k,
	const mlib_u8 *cmyk,
	mlib_s32 n)
{
/* channel-separated data for one chunk */
	mlib_d64 buff_arr[(SIZE / 2) + 2];
	mlib_f32 *py, *pcb, *pcr, *pk;
	mlib_d64 *buff;
	mlib_d64 sdh, sdl, dr, dg, db, dd;
	mlib_s32 i, m, size, num;

/* fixed-point conversion coefficients, scaled by 8192 */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 8192));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 8192));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 8192));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 8192));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 8192));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 8192));
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off255 = vis_to_double_dup(0x1ff01ff0);

	vis_write_gsr(2 << 3);

/*
 * 4-pixel loop
 */
	for (size = 0; size < n; size += num) {

		num = n - size;

		if (num > SIZE)
			num = SIZE;

/* separate every 4-pixel group of the chunk, including a partial one */
		m = (num + 3) / 4;
		mlib_channel_separate((mlib_d64 *)cmyk + size / 2, buff_arr, m);

/* only an even number of complete 4-pixel groups is stored here */
		m = (num / 4) & ~1;
		py = (mlib_f32 *)y + size / 4;
		pcb = (mlib_f32 *)cb + size / 4;
		pcr = (mlib_f32 *)cr + size / 4;
		pk = (mlib_f32 *)k + size / 4;
		buff = buff_arr;
#pragma pipeloop(0)
		for (i = 0; i < m; i++) {
			sdh = buff[0];
			sdl = buff[1];
			CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
				vis_read_hi(sdl), k11, k12, k13, off255, py[0]);
			CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
				vis_read_hi(sdl), k21, k22, k23, off128,
				pcb[0]);
			CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
				vis_read_hi(sdl), k31, k32, k33, off128,
				pcr[0]);
			py++;
			pcb++;
			pcr++;
/* the fourth separated channel is copied through unchanged */
			(*pk++) = vis_read_lo(sdl);
			buff += 2;
		}
	}

/*
 * Tail: the remaining n & 7 pixels.  Two 4-pixel groups are computed into
 * rbuff (one 8-byte word per plane) and stored with a mask that selects
 * the first n & 7 bytes.  buff, py, pcb, pcr and pk still point just past
 * the data handled by the last chunk.
 */
	if (n & 7) {
		mlib_s32 emask = (0xFF00 >> (n & 7)) & 0xFF;
		mlib_d64 rbuff[4];
		mlib_f32 *prbuff = (mlib_f32 *)rbuff;

		sdh = (*buff++);
		sdl = (*buff++);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k11, k12, k13, off255, prbuff[0]);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k21, k22, k23, off128, prbuff[2]);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k31, k32, k33, off128, prbuff[4]);
		prbuff[6] = vis_read_lo(sdl);
		sdh = (*buff++);
		sdl = (*buff++);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k11, k12, k13, off255, prbuff[1]);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k21, k22, k23, off128, prbuff[3]);
		CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
			vis_read_hi(sdl), k31, k32, k33, off128, prbuff[5]);
		prbuff[7] = vis_read_lo(sdl);

		vis_pst_8(rbuff[0], py, emask);
		vis_pst_8(rbuff[1], pcb, emask);
		vis_pst_8(rbuff[2], pcr, emask);
		vis_pst_8(rbuff[3], pk, emask);
	}

	return (MLIB_SUCCESS);
}
/*
 * 2x2 convolution of a 16-bit unsigned image, applied only to the
 * channels selected by cmask.  The last column and last row of the image
 * are not written (dw and dh are reduced by one before processing).
 *
 *   dst            destination image
 *   src            source image
 *   kernel         kernel coefficients (read by LOAD_KERNEL_INTO_FLOAT)
 *   scalef_expon   scaling exponent; the GSR scale is 32 - scalef_expon
 *   cmask          channel mask; bit 0 corresponds to the last channel
 *
 * Returns MLIB_FAILURE when the scratch buffer cannot be allocated and
 * MLIB_SUCCESS otherwise.
 *
 * For each selected channel two source rows are kept de-interleaved in a
 * scratch buffer.  While one output row is being computed, the source row
 * needed for the next output row is loaded into the buffer that is about
 * to be recycled.  GET_SRC_DST_PARAMETERS, LOAD_KERNEL_INTO_FLOAT,
 * CONV_16_BEGIN and CONV_16 are macros defined outside this block;
 * presumably they fill adr_src/adr_dst/slb/dlb/dw/dh and k1..k4 and
 * accumulate into out0/out1 -- confirm against their definitions.
 */
mlib_status
mlib_v_conv2x2_u16nw_mask(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u16 *da, *d_a;

/* pointers to src, dst data */
	mlib_u16 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u16 *sa, *sa1, *sa2, *sa_2;

/* pointers to rows in interm. src buf */
	mlib_u16 *buff_src, *sbuf1, *sbuf2, *prow;
	mlib_u16 *s_buf1;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1, tmp0, tmp1, tmp2, tmp3;

/* data */
	mlib_d64 d1, d2, d_1, d_2;

/* shifted data */
	mlib_d64 d21, d22;

/* coefficients */
	mlib_f32 k1, k2, k3, k4;
	int gsr_scale, i, j, nchannel, chan, testchan;
/* t1..t4: source values for the next row; t5..t8: results of the previous step */
	mlib_u16 t1, t2, t3, t4, t5, t6, t7, t8;
/* gives 16-bit access to the four lanes of a packed result */
	type_mlib_d64 str;
/* mask8000 toggles the top bit of every 16-bit lane */
	mlib_d64 ker_off, mask8000 = vis_to_double_dup(0x80008000);

	nchannel = mlib_ImageGetChannels(src);
	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

/* GSR: scale factor in bits 3 and up, alignment offset of 2 bytes */
	gsr_scale = 32 - scalef_expon;
	vis_write_gsr((gsr_scale << 3) + 2);

/* buf_slb - 8-byte aligned */
	buf_slb = (2 * dw + 26) & (~7);
/* alloc. interm. src buffer */
	buff_src =
	    (mlib_u16 *)__mlib_malloc(2 * buf_slb * sizeof (mlib_u8) + 8);

	if (buff_src == NULL)
		return (MLIB_FAILURE);

/* from here on buf_slb is counted in 16-bit elements */
	buf_slb >>= 1;

/*
 * NOTE(review): buff_src + 8 advances by 8 mlib_u16 elements (16 bytes)
 * before rounding down to an 8-byte boundary, while only 8 spare bytes
 * are allocated above -- confirm that the per-row padding in buf_slb
 * covers the difference.
 */
	sbuf1 = (mlib_u16 *)((mlib_addr)(buff_src + 8) & (~7));
	sbuf2 = sbuf1 + buf_slb;

	dw -= 1;
/* edge - no write */
	dh -= 1;

	testchan = 1;

/* channels are visited last to first while the mask bit moves upward */
	for (chan = nchannel - 1; chan >= 0; chan--) {
		if ((cmask & testchan) == 0) {
			testchan <<= 1;
			continue;
		}

		testchan <<= 1;
		sa = adr_src + chan;
		sa1 = sa + slb;
		sa_2 = sa2 = sa1 + slb;
		d_a = adr_dst + chan;

/* load interm. src buff */
		for (i = 0, j = 0; j < (dw + 1); i += nchannel, j++) {
			sbuf1[j] = sa1[i];
			sbuf2[j] = sa[i];
		}

/* all output rows except the last one */
		for (j = 0; j < dh - 1; j++) {
			da = d_a;
/* swap the two row buffers */
			prow = sbuf1;
			sbuf1 = sbuf2;
			sbuf2 = prow;
			s1 = (mlib_d64 *)sbuf1;
			s2 = (mlib_d64 *)sbuf2;
/* address of the last destination element written in this row */
			dend = da + (dw - 1) * nchannel;
			s_buf1 = sbuf1;
			d1 = *s1;
			d2 = *s2;
			d1 = vis_fxor(d1, mask8000);
			d2 = vis_fxor(d2, mask8000);

			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_1 = vis_fxor(d_1, mask8000);
			d_2 = vis_fxor(d_2, mask8000);
/* first 4 results of the row: prime the software pipeline */
			CONV_16_BEGIN(d1, k1);
			CONV_16(d2, k3);
/* d21/d22: the same rows shifted by the GSR alignment offset */
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			CONV_16(d21, k2);
			CONV_16(d22, k4);
			str.value =
			    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
			d1 = d_1;
			d2 = d_2;
			s1++;
			s2++;
/*
 * in each iteration store result from prev. iterat.
 * and load data for processing next row
 */
#pragma pipeloop(0)
			for (i = 0; i < dw - 4; i += 4) {
				t1 = *sa_2;
				sa_2 += nchannel;
				t2 = *sa_2;
				sa_2 += nchannel;
				d_1 = *(s1 + 1);
				d_2 = *(s2 + 1);
				d_1 = vis_fxor(d_1, mask8000);
				d_2 = vis_fxor(d_2, mask8000);
				CONV_16_BEGIN(d1, k1);
				t3 = *sa_2;
				sa_2 += nchannel;
				t4 = *sa_2;
				sa_2 += nchannel;
				CONV_16(d2, k3);
				t5 = str.forshort.ushort0;
				t6 = str.forshort.ushort1;
				d21 = vis_faligndata(d1, d_1);
				t7 = str.forshort.ushort2;
				d22 = vis_faligndata(d2, d_2);
				t8 = str.forshort.ushort3;
				CONV_16(d21, k2);
				(*s_buf1++) = t1;
				(*s_buf1++) = t2;
				CONV_16(d22, k4);
				(*s_buf1++) = t3;
				(*s_buf1++) = t4;
				*da = t5;
				da += nchannel;
				str.value =
				    vis_fxor(vis_fpackfix_pair(out0, out1),
				    mask8000);
				*da = t6;
				da += nchannel;
				d1 = d_1;
				d2 = d_2;
				*da = t7;
				da += nchannel;
				s1++;
				s2++;
				*da = t8;
				da += nchannel;
			}

/* finish loading the next source row into the recycled buffer */
			for (; i < dw + 1; i++) {
				(*s_buf1++) = *sa_2;
				sa_2 += nchannel;
			}

/* store the last (up to 4) results of the row, bounded by dend */
			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort0;
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort1;
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort2;
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort3;
			}

			sa_2 = sa2 = sa2 + slb;
			d_a += dlb;
		}

/* process last row - no need to load data */
		da = d_a;
		prow = sbuf1;
		sbuf1 = sbuf2;
		sbuf2 = prow;
		s1 = (mlib_d64 *)sbuf1;
		s2 = (mlib_d64 *)sbuf2;
		dend = da + (dw - 1) * nchannel;
		d1 = *s1;
		d2 = *s2;
		d1 = vis_fxor(d1, mask8000);
		d2 = vis_fxor(d2, mask8000);

		d_1 = *(s1 + 1);
		d_2 = *(s2 + 1);
		d_1 = vis_fxor(d_1, mask8000);
		d_2 = vis_fxor(d_2, mask8000);
		CONV_16_BEGIN(d1, k1);
		CONV_16(d2, k3);
		d21 = vis_faligndata(d1, d_1);
		d22 = vis_faligndata(d2, d_2);
		CONV_16(d21, k2);
		CONV_16(d22, k4);
		d1 = d_1;
		d2 = d_2;
		s1++;
		s2++;

/* each iteration packs and stores the previous results, then computes the next 4 */
#pragma pipeloop(0)
		for (i = 4; i < dw; i += 4) {
			str.value =
			    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_1 = vis_fxor(d_1, mask8000);
			d_2 = vis_fxor(d_2, mask8000);
			CONV_16_BEGIN(d1, k1);
			t5 = str.forshort.ushort0;
			CONV_16(d2, k3);
			d21 = vis_faligndata(d1, d_1);
			t6 = str.forshort.ushort1;
			d22 = vis_faligndata(d2, d_2);
			CONV_16(d21, k2);
			t7 = str.forshort.ushort2;
			CONV_16(d22, k4);
			t8 = str.forshort.ushort3;
			*da = t5;
			da += nchannel;
			*da = t6;
			da += nchannel;
			*da = t7;
			da += nchannel;
			d1 = d_1;
			d2 = d_2;
			*da = t8;
			da += nchannel;
			s1++;
			s2++;
		}

/* pack and store the final (up to 4) results, bounded by dend */
		str.value = vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort0;
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort1;
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort2;
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort3;
		}
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
static mlib_status
mlib_v_VideoColorYUV2RGB411_nonalign(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 rgb_stride,
    mlib_s32 y_stride,
    mlib_s32 uv_stride)
{
    /* pointers to src address */
    mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;

    /* pointers to dst address */
    mlib_u8 *dp, *dl;

    /* all. pointer to y */
    mlib_d64 *spy;

    /* all. pointers to u, v */
    mlib_d64 *dfu, *dfv;

    /* u, v data */
    mlib_f32 fu, fv;

    /* y data */
    mlib_d64 dy0, dy1, dy2, dy3;
    mlib_d64 ddy1, ddy2, ddy3, ddy4;
    mlib_d64 du0, du1, fu0, fu1;
    mlib_d64 dv1, dv2, fv0, fv1;
    mlib_d64 dr, dr1, dr2, dr3, dr4;
    mlib_d64 dg, dg1, dg2, dg3, dg4;
    mlib_d64 db, db1, db2, db3, db4;
    mlib_d64 dtmp;

    /* 1.1644  * 4096 */
    mlib_f32 f0 = vis_to_float(0x12a1);

    /* 2.0184  * 8192 */
    mlib_f32 f1 = vis_to_float(0x4097);

    /* -0.3920 * 8192 */
    mlib_f32 f4 = vis_to_float(0xf375);

    /* -0.8132 * 8192 */
    mlib_f32 f5 = vis_to_float(0xe5fa);

    /* 1.5966  * 8192 */
    mlib_f32 f8 = vis_to_float(0x3317);

    /* -276.9856 * 32 */
    mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

    /* 135.6352  * 32 */
    mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

    /* -222.9952 * 32 */
    mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
    mlib_f32 fscale = vis_to_float(0x80808080);

    /* loop variable */
    mlib_s32 i, j;
    mlib_d64 *buf, BUFF[16 * 1024];
    mlib_d64 *ddp, dd01, dd11, dd21, dd02, dd12, dd22;
    mlib_u8 *tmp;

    if (width * 3 > 16 * 1024) {
        tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);
        buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
    } else {
        buf = (mlib_d64 *)BUFF;
    }

    /*
     * initialize GSR scale factor
     */
    vis_write_gsr(3 << 3);

    sp1 = sl1 = (mlib_u8 *)y;
    sp2 = sl2 = (mlib_u8 *)u;
    sp3 = sl3 = (mlib_u8 *)v;

    dp = (mlib_u8 *)buf;
    dl = rgb;
    ddp = (mlib_d64 *)dp;

    /*
     * row loop
     */
    for (j = 0; j < height; j++) {
        spy = (mlib_d64 *)vis_alignaddr(sp1, 0);

        dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
        fu0 = (*dfu++);
        fu1 = vis_ld_d64_nf(dfu);
        dfu++;
        fu = vis_read_hi(vis_faligndata(fu0, fu1));
        sp2 += 4;

        dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
        fv0 = (*dfv++);
        fv1 = vis_ld_d64_nf(dfv);
        dfv++;
        fv = vis_read_hi(vis_faligndata(fv0, fv1));
        sp3 += 4;

        dy0 = (*spy++);
        dy3 = vis_ld_d64_nf(spy);
        spy++;
        vis_alignaddr(sp1, 0);
        dy1 = vis_faligndata(dy0, dy3);
        dy0 = vis_ld_d64_nf(spy);
        spy++;
        dy2 = vis_faligndata(dy3, dy0);

        du0 = vis_fmul8x16al(fu, f1);
        db = vis_fpadd16(du0, doff0);

        du1 = vis_fmul8x16al(fu, f4);
        dv1 = vis_fmul8x16al(fv, f5);
        dtmp = vis_fpadd16(du1, dv1);
        dg = vis_fpadd16(dtmp, doff1);

        dv2 = vis_fmul8x16al(fv, f8);
        dr = vis_fpadd16(dv2, doff2);

        ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
        ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

        ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
        ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

        db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
        db1 = vis_fpadd16(ddy1, db1);

        db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
        db2 = vis_fpadd16(ddy2, db2);

        db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
        db3 = vis_fpadd16(ddy3, db3);

        db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
        db4 = vis_fpadd16(ddy4, db4);

        dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
        dg1 = vis_fpadd16(ddy1, dg1);

        dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
        dg2 = vis_fpadd16(ddy2, dg2);

        dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
        dg3 = vis_fpadd16(ddy3, dg3);

        dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
        dg4 = vis_fpadd16(ddy4, dg4);

        dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
        dr1 = vis_fpadd16(ddy1, dr1);

        dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
        dr2 = vis_fpadd16(ddy2, dr2);

        dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
        dr3 = vis_fpadd16(ddy3, dr3);

        dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
        dr4 = vis_fpadd16(ddy4, dr4);

        db = vis_fpack16_pair(db1, db2);
        db1 = vis_fpack16_pair(db3, db4);

        dr = vis_fpack16_pair(dr1, dr2);
        dr1 = vis_fpack16_pair(dr3, dr4);

        dg = vis_fpack16_pair(dg1, dg2);
        dg1 = vis_fpack16_pair(dg3, dg4);

        dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
        fu0 = vis_ld_d64_nf(dfu);
        dfu++;
        fu1 = vis_ld_d64_nf(dfu);
        dfu++;
        fu = vis_read_hi(vis_faligndata(fu0, fu1));
        sp2 += 4;

        dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
        fv0 = vis_ld_d64_nf(dfv);
        dfv++;
        fv1 = vis_ld_d64_nf(dfv);
        dfv++;
        fv = vis_read_hi(vis_faligndata(fv0, fv1));
        sp3 += 4;

        /*
         * 16-pixel column loop
         */
#pragma pipeloop(0)
        for (i = 0; i <= width - 16; i += 16) {

            vis_write_bmask(0x0801902A, 0);
            dd01 = vis_bshuffle(dr, dg);
            dd02 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0x03B04C05, 0);
            dd11 = vis_bshuffle(dr, dg);
            dd12 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0xD06E07F0, 0);
            dd21 = vis_bshuffle(dr, dg);
            dd22 = vis_bshuffle(dr1, dg1);
            vis_write_bmask(0x01834967, 0);
            ddp[0] = vis_bshuffle(dd01, db);
            ddp[3] = vis_bshuffle(dd02, db1);
            vis_write_bmask(0xA12B45C7, 0);
            ddp[1] = vis_bshuffle(dd11, db);
            ddp[4] = vis_bshuffle(dd12, db1);
            vis_write_bmask(0x0D23E56F, 0);
            ddp[2] = vis_bshuffle(dd21, db);
            ddp[5] = vis_bshuffle(dd22, db1);

            dy3 = vis_ld_d64_nf(spy);
            spy++;
            vis_alignaddr(sp1, 0);
            dy1 = vis_faligndata(dy0, dy3);
            dy0 = vis_ld_d64_nf(spy);
            spy++;
            dy2 = vis_faligndata(dy3, dy0);

            du0 = vis_fmul8x16al(fu, f1);
            db = vis_fpadd16(du0, doff0);

            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);

            dv2 = vis_fmul8x16al(fv, f8);
            dr = vis_fpadd16(dv2, doff2);

            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);

            ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
            ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);

            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);

            db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
            db3 = vis_fpadd16(ddy3, db3);

            db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
            db4 = vis_fpadd16(ddy4, db4);

            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);

            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);

            dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
            dg3 = vis_fpadd16(ddy3, dg3);

            dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
            dg4 = vis_fpadd16(ddy4, dg4);

            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);

            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);

            dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
            dr3 = vis_fpadd16(ddy3, dr3);

            dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
            dr4 = vis_fpadd16(ddy4, dr4);

            db = vis_fpack16_pair(db1, db2);
            db1 = vis_fpack16_pair(db3, db4);

            dr = vis_fpack16_pair(dr1, dr2);
            dr1 = vis_fpack16_pair(dr3, dr4);

            dg = vis_fpack16_pair(dg1, dg2);
            dg1 = vis_fpack16_pair(dg3, dg4);

            dfu = (mlib_d64 *)vis_alignaddr(sp2, 0);
            fu0 = vis_ld_d64_nf(dfu);
            dfu++;
            fu1 = vis_ld_d64_nf(dfu);
            dfu++;
            fu = vis_read_hi(vis_faligndata(fu0, fu1));
            sp2 += 4;

            dfv = (mlib_d64 *)vis_alignaddr(sp3, 0);
            fv0 = vis_ld_d64_nf(dfv);
            dfv++;
            fv1 = vis_ld_d64_nf(dfv);
            dfv++;
            fv = vis_read_hi(vis_faligndata(fv0, fv1));
            sp3 += 4;

            ddp += 6;
        }

        if (i <= width - 8) {
            vis_write_bmask(0x0801902A, 0);
            dd01 = vis_bshuffle(dr, dg);
            vis_write_bmask(0x03B04C05, 0);
            dd11 = vis_bshuffle(dr, dg);
            vis_write_bmask(0xD06E07F0, 0);
            dd21 = vis_bshuffle(dr, dg);
            vis_write_bmask(0x01834967, 0);
            ddp[0] = vis_bshuffle(dd01, db);
            vis_write_bmask(0xA12B45C7, 0);
            ddp[1] = vis_bshuffle(dd11, db);
            vis_write_bmask(0x0D23E56F, 0);
            ddp[2] = vis_bshuffle(dd21, db);
            db = db1;
            dr = dr1;
            dg = dg1;
            ddp += 3;
            i += 8;
        }

        dp = (mlib_u8 *)ddp;

        vis_alignaddr((void *)(width - i), 0);
        db = vis_faligndata(db, db);
        dg = vis_faligndata(dg, dg);
        dr = vis_faligndata(dr, dr);
        dp += ((width - i - 1) * 3);

        vis_alignaddr((void *)7, 0);
        for (; i < width; i++) {
            STORE_PIXEL(0, 1, 2);
            dp -= 3;
        }

        sp1 = sl1 = sl1 + y_stride;
        sp2 = sl2 = sl2 + uv_stride;
        sp3 = sl3 = sl3 + uv_stride;
        __mlib_VectorCopy_U8(dl, (mlib_u8 *)buf, width * 3);

        dl = dp = dl + rgb_stride;
        dp = (mlib_u8 *)buf;
        ddp = (mlib_d64 *)dp;
    }

    if (width * 3 > 16 * 1024)
        __mlib_free(tmp);

    return (MLIB_SUCCESS);
}
示例#27
0
/*
 * Widen a vector of signed 8-bit elements into signed 32-bit elements.
 *
 *	z	destination vector, receives n mlib_s32 values
 *	x	source vector of n mlib_s8 values (only read)
 *	n	number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, otherwise MLIB_SUCCESS.
 *
 * Layout of the work: an optional single leading element (so that the
 * 64-bit stores land on an 8-byte boundary), then groups of 8 elements
 * handled with VIS operations, then a scalar tail.
 */
mlib_status
__mlib_VectorConvert_S32_S8_Mod(
	mlib_s32 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
/* scalar cursors over source and destination */
	mlib_s8 *in = (mlib_s8 *)x;
	mlib_s32 *out = (mlib_s32 *)z;

/* multiplier constants handed to the VIS multiplies below */
	mlib_f32 one_pair = vis_to_float(0x10001);
	mlib_d64 scale = vis_to_double_dup(0x1000100);

/* 8-byte-granular cursor over the source */
	mlib_d64 *in64;

/* destination viewed as 64-bit words for the vector stores */
	mlib_d64 *wide;

/* raw source words (prev, next) and the current group of 8 bytes */
	mlib_d64 prev, next, cur;

/* four result words, two 32-bit elements in each */
	mlib_d64 r0, r1, r2, r3;

/* count of elements already converted */
	mlib_s32 k = 0;

	if (n <= 0) {
		return (MLIB_FAILURE);
	}

/*
 * Destination not on an 8-byte boundary: emit one element the scalar
 * way (assumes z is at least 4-byte aligned -- TODO confirm).
 */
	if ((mlib_addr)out & 7) {
		*out = *in;
		out++;
		in++;
		k = 1;
	}

/*
 * Order matters here: vis_alignaddr() and vis_write_bmask() set state
 * that the later vis_faligndata() / vis_bshuffle() calls consume.
 */
	in64 = (mlib_d64 *)vis_alignaddr(in, 0);
	cur = vis_ld_d64_nf(in64);
	vis_write_bmask(0x00012223, 0);

	if ((mlib_addr)in & 7) {
/* source is misaligned: every group is assembled from two raw words */
		next = vis_ld_d64_nf(in64 + 1);
		cur = vis_faligndata(cur, next);
#pragma pipeloop(1)
#pragma unroll(1)
		for (; n - k >= 8; k += 8) {
/*
 * Each byte is duplicated into a 16-bit lane and scaled; presumably
 * this sign-extends it to 16 bits (confirm against the VIS manual).
 * The 16-bit lanes are then widened to 32 bits.
 */
			r1 = vis_fmul8sux16(
				vis_fpmerge(vis_read_hi(cur), vis_read_hi(cur)),
				scale);
			r0 = vis_bshuffle(r1, r1);
			r1 = vis_fmuld8ulx16(one_pair, vis_read_lo(r1));

			r3 = vis_fmul8sux16(
				vis_fpmerge(vis_read_lo(cur), vis_read_lo(cur)),
				scale);
			r2 = vis_fmuld8ulx16(one_pair, vis_read_hi(r3));
			r3 = vis_fmuld8ulx16(one_pair, vis_read_lo(r3));

/* fetch and assemble the next group before storing this one */
			prev = next;
			next = vis_ld_d64_nf(in64 + 2);
			cur = vis_faligndata(prev, next);

			wide = (mlib_d64 *)out;
			wide[0] = r0;
			wide[1] = r1;
			wide[2] = r2;
			wide[3] = r3;

			in64++;
			in += 8;
			out += 8;
		}
	} else {
/* source is 8-byte aligned: each raw word is already one group */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; n - k >= 8; k += 8) {
			r1 = vis_fmul8sux16(
				vis_fpmerge(vis_read_hi(cur), vis_read_hi(cur)),
				scale);
			r0 = vis_bshuffle(r1, r1);
			r1 = vis_fmuld8ulx16(one_pair, vis_read_lo(r1));

			r3 = vis_fmul8sux16(
				vis_fpmerge(vis_read_lo(cur), vis_read_lo(cur)),
				scale);
			r2 = vis_bshuffle(r3, r3);
			r3 = vis_fmuld8ulx16(one_pair, vis_read_lo(r3));

/* fetch the next group before storing this one */
			cur = vis_ld_d64_nf(in64 + 1);

			wide = (mlib_d64 *)out;
			wide[0] = r0;
			wide[1] = r1;
			wide[2] = r2;
			wide[3] = r3;

			in64++;
			in += 8;
			out += 8;
		}
	}

/* scalar tail: fewer than 8 elements remain */
	while (k < n) {
		*out++ = *in++;
		k++;
	}

	return (MLIB_SUCCESS);
}
/*
 * 2x2 convolution step over U16 image data, processed row by row through
 * intermediate buffers.
 *
 * NOTE(review): judging by the name this is the 4-channel variant that
 * does not write edge pixels ("nw") -- confirm against the callers.
 *
 *	dst		destination image
 *	src		source image
 *	kernel		convolution kernel coefficients
 *	scalef_expon	scaling exponent; 32 - scalef_expon is written to
 *			the GSR scale field
 *
 * Returns MLIB_SUCCESS at the end of the function. Any early return would
 * have to come from one of the macros, which are defined outside this
 * block.
 *
 * Most of the work is done by macros defined elsewhere
 * (GET_SRC_DST_PARAMETERS, LOAD_KERNEL_INTO_FLOAT, PREPARE_INTERM_BUFFERS,
 * LOOP_INI, LOAD_LINE_INTO_BUFFER, CONV_16_BEGIN, CONV_16,
 * PREPARE_TO_COPY_INTERM_BUF_TO_DST, COPY_INTERM_BUF_TO_DST, COPY_TAIL).
 * Several locals declared below are never named in this body; presumably
 * those macros reference them by name, so do not rename or remove them
 * without checking the macro definitions.
 */
mlib_status
mlib_v_conv2x2_u16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
/* pointers to dst row */
	mlib_u16 *da, *d_a;

/* pointers to src, dst data */
	mlib_u16 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u16 *sa, *sa1;

/* pointers to rows in intermediate src buffer */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointer to row in intermediate dst buffer */
	mlib_d64 *dbuf;

/* mlib_d64 pointers to rows in intermediate src buffer */
	mlib_d64 *s1, *s2;

/* mlib_d64 pointer to row in intermediate dst buffer */
	mlib_d64 *ddst;

/* data: current (d1, d2) and next (d_1, d_2) 64-bit words of the two rows */
	mlib_d64 d1, d2, d_1, d_2;

/* kernel coefficients; presumably filled in by LOAD_KERNEL_INTO_FLOAT() */
	mlib_f32 k1, k2, k3, k4;

/* src, dst and intermediate buffer strides */
	mlib_s32 dlb, slb, buf_slb;

/* image height and width; presumably set by GET_SRC_DST_PARAMETERS() */
	mlib_s32 dh, dw;

/* accumulators (out0, out1) and temporaries; presumably used by CONV_16* */
	mlib_d64 out0, out1, tmp0, tmp1, tmp2, tmp3;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1;
	mlib_s32 emask;
	int gsr_scale, i, j;

/* mask8000 flips the top bit of every 16-bit lane (see the vis_fxor calls) */
	mlib_d64 ker_off, mask8000 = vis_to_double_dup(0x80008000);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

/* program the GSR scale field used by the vis_fpackfix_pair() below */
	gsr_scale = 32 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));

/* intermediate row length in mlib_d64 units: (8 * dw + 16) bytes / 8 */
	buf_slb = (8 * dw + 16) >> 3;
	PREPARE_INTERM_BUFFERS();

/*
 * One fewer output column and row than the input; dw is then scaled by 4
 * so the inner loop can step it 4 at a time (one mlib_d64 = four u16).
 */
	dw -= 1;
	dw *= 4;
	dh -= 1;

	sa = adr_src;
	sa1 = sa + slb;
	d_a = adr_dst;

/* load intermediate src buffer with the first source row */
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(sbuf2, sa, 4);

/* row loop: each iteration combines two adjacent source rows */
#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

/* load the next source row into the intermediate buffer */
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(sbuf2, sa1, 4);

/*
 * Prime the pipeline with the first word of each row, top bit of every
 * 16-bit lane flipped (presumably mapping unsigned to signed range for
 * the multiplies -- confirm against CONV_16).
 */
		d1 = *s1;
		d2 = *s2;
		d1 = vis_fxor(d1, mask8000);
		d2 = vis_fxor(d2, mask8000);

/* column loop: one mlib_d64 of output per iteration */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 4) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_1 = vis_fxor(d_1, mask8000);
			d_2 = vis_fxor(d_2, mask8000);
/* accumulate the four taps: d1*k1, d2*k3, d_1*k2, d_2*k4 */
			CONV_16_BEGIN(d1, k1);
			CONV_16(d2, k3);
			CONV_16(d_1, k2);
			CONV_16(d_2, k4);
/* pack the accumulators and flip the top bits back before storing */
			(*ddst++) =
			    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
/* the "next" words become the "current" words of the next iteration */
			d1 = d_1;
			d2 = d_2;
			s1++;
			s2++;
		}

		PREPARE_TO_COPY_INTERM_BUF_TO_DST();

#pragma pipeloop(0)
		COPY_INTERM_BUF_TO_DST();
		COPY_TAIL();

/* advance to the next source and destination rows */
		sa1 = sa1 + slb;
		d_a += dlb;
	}

/* release the intermediate buffer (presumably allocated by PREPARE_INTERM_BUFFERS) */
	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
示例#29
0
mlib_status
mlib_ImageMulAlpha_U8(
    mlib_u8 *sl,
    mlib_u8 *dl,
    mlib_s32 sstride,
    mlib_s32 dstride,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 channel,
    mlib_s32 alpha)
{
	mlib_f32 fzeros = vis_fzeros();
	mlib_d64 dmask = vis_to_double_dup(0x00FF00FF);
	mlib_d64 done = vis_to_double_dup(0x01000100);
	mlib_d64 *buffs, *buffd;
	mlib_d64 *sp, *dp;
	mlib_d64 ss, s1, rr, d0, d1;
	mlib_d64 amask0, amask1, amask2;
	mlib_s32 ww, dflag, cmask, i, j;

	vis_write_gsr(7 << 3);

	width *= channel;
	ww = (width + 7) / 8;

	if (channel == 3) {
		ww = 3 * ((ww + 2) / 3);
	}

	buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww);

	if (buffs == NULL) {
		return (MLIB_FAILURE);
	}

	buffd = buffs + ww;

	if (channel == 4) {
		cmask = 1 << (3 - alpha);
		cmask |= (cmask << 4);
	} else if (channel == 3) {
		amask0 = ((mlib_d64 *)mlib_amask3_arr)[alpha];
		amask1 = ((mlib_d64 *)mlib_amask3_arr)[alpha + 1];
		amask2 = ((mlib_d64 *)mlib_amask3_arr)[alpha + 2];
	}

	for (j = 0; j < height; j++) {
		if (((int)sl & 7)) {
			MEM_COPY(sl, buffs, width);
			sp = buffs;
		} else {
			sp = (mlib_d64 *)sl;
		}

		dflag = 0;

		if (((int)dl | width) & 7) {
			dp = buffd;
			dflag = 1;
		} else {
			dp = (mlib_d64 *)dl;
		}

		if (channel == 4) {
			mlib_d64 a0, a1;

			if (alpha == 0) {
#pragma pipeloop(0)
				for (i = 0; i < ww; i++) {
					MUL_ALPHA_4CH(hi, au);
				}

			} else if (alpha == 1) {
#pragma pipeloop(0)
				for (i = 0; i < ww; i++) {
					MUL_ALPHA_4CH(hi, al);
				}

			} else if (alpha == 2) {
#pragma pipeloop(0)
				for (i = 0; i < ww; i++) {
					MUL_ALPHA_4CH(lo, au);
				}

			} else {	/* if (alpha == 3) */

#pragma pipeloop(0)
				for (i = 0; i < ww; i++) {
					MUL_ALPHA_4CH(lo, al);
				}
			}

		} else if (channel == 3) {
			mlib_d64 s0, s1, s2;
			mlib_d64 a0, a1, a2;
			mlib_s32 cmask0, cmask1, cmask2;

			cmask0 = 0x492 >> alpha;
			cmask1 = 0x492 >> (alpha + 1);
			cmask2 = 0x492 >> (alpha + 2);

			if (alpha == 0) {
				vis_alignaddr((void *)0, 7);
#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					LOAD_3CH_0();
					MUL_ALPHA_3CH();
				}

				if (i < ww) {
					LOAD_3CH_0_NF();
					MUL_ALPHA_3CH();
				}

			} else if (alpha == 1) {
				mlib_d64 b0, b1, b2;

#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					LOAD_3CH_1();
					MUL_ALPHA_3CH();
				}

				if (i < ww) {
					LOAD_3CH_1_NF();
					MUL_ALPHA_3CH();
				}

			} else {	/* if (alpha == 2) */

				vis_alignaddr((void *)0, 1);
#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					LOAD_3CH_2();
					MUL_ALPHA_3CH();
				}

				if (i < ww) {
					LOAD_3CH_2_NF();
					MUL_ALPHA_3CH();
				}

			}

		} else {	/* if (channel == 2) */

			if (alpha == 0) {
示例#30
0
#endif /* ! defined(__MEDIALIB_OLD_NAMES) */

/* *********************************************************** */

mlib_status
__mlib_VideoP64Decimate_U8_U8(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 dst_stride,
	mlib_s32 src_stride)
{
	mlib_s32 x, y, x4 = width >> 2;
	mlib_d64 *sl1, *sl2, s1hi, s1lo, s2hi, s2lo, s1, s2;
	mlib_d64 done = vis_to_double_dup(0x1000100);
	mlib_d64 dmask;
	mlib_f32 *dp;
	mlib_f32 frnd = vis_to_float(0x40404040);
	mlib_s32 src_stride2 = 2 * src_stride;

	dmask = vis_fpadd16(done, vis_fone());
	vis_write_gsr(7 << 3);
	sl1 = (mlib_d64 *)src;
	sl2 = (mlib_d64 *)(src + src_stride);
	dp = (mlib_f32 *)dst;

	for (y = 0; y < height; y++) {
#pragma pipeloop(0)
		for (x = 0; x < x4; x++) {
			s1 = sl1[x];