/*
 * Square the pixels of a two-dimensional U16 image, one 64-bit word
 * (four pixels) at a time.
 *
 *   src, dst   first pixel of the source / destination image; both are
 *              read and written through mlib_d64 pointers
 *   slb, dlb   distance in bytes between consecutive rows of src / dst
 *   xsize      row width in pixels; xsize / 4 words are processed per row
 *   ysize      number of rows
 *
 * The per-word arithmetic is done by MLIB_V_IMAGESQUARE_U16, which is
 * defined outside this function.
 *
 * NOTE(review): the A8 / X4 suffix suggests callers guarantee 8-byte
 * alignment and xsize % 4 == 0 - confirm against the dispatcher.
 */
void
mlib_v_ImageSquare_U16_A8D2X4(
    mlib_u16 *src,
    mlib_s32 slb,
    mlib_u16 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize)
{
    /* byte addresses of the current source / destination row */
    mlib_u8 *src_row = (mlib_u8 *)src;
    mlib_u8 *dst_row = (mlib_u8 *)dst;
    /* one input word and the matching output word */
    mlib_d64 sd, dd;
    /* temporaries used in macro */
    mlib_d64 rdh, rdl;
    /* constants that are not referenced here directly (presumably by the macro) */
    mlib_d64 xor_mask = vis_to_double_dup(0x80008000);
    mlib_d64 sat_offset = vis_to_double_dup(0x40004000);
    mlib_s32 row, col;

    for (row = 0; row < ysize; row++) {
        mlib_d64 *in = (mlib_d64 *)src_row;
        mlib_d64 *out = (mlib_d64 *)dst_row;

        /* 4-pixel column loop */
#pragma pipeloop(0)
        for (col = 0; col < (xsize / 4); col++) {
            sd = in[col];
            MLIB_V_IMAGESQUARE_U16(sd, dd);
            out[col] = dd;
        }

        /* step both images to the next row */
        src_row += slb;
        dst_row += dlb;
    }
}
/*
 * Square-and-shift a contiguous run of U16 pixels, one 64-bit word
 * (four pixels) at a time.
 *
 *   src, dst   source / destination pixels, accessed as mlib_d64 words
 *   dsize      number of pixels; dsize / 4 words are processed
 *   shift      shift amount; used here only to build sat_offset
 *              (0x20000000 >> (16 - shift))
 *
 * The per-word arithmetic is done by MLIB_V_IMAGESQRSHIFT_U16, which is
 * defined outside this function.
 *
 * NOTE(review): 16 - shift must be a valid shift count (0..31); the range
 * of shift is presumably enforced by the caller - confirm.
 */
void
mlib_v_ImageSqrShift_U16_A8D1X4(
    mlib_u16 *src,
    mlib_u16 *dst,
    mlib_s32 dsize,
    mlib_s32 shift)
{
    mlib_d64 *in = (mlib_d64 *)src;
    mlib_d64 *out = (mlib_d64 *)dst;
    /* one input word and the matching output word */
    mlib_d64 sd, dd;
    /* temporaries used in macro */
    mlib_d64 rdhh, rdhl;
    mlib_d64 rdlh, rdll;
    mlib_d64 rdh, rdl;
    /* constants that are not referenced here directly (presumably by the macro) */
    mlib_d64 mask = vis_to_double_dup(0xfffefffe);
    mlib_f32 fmin = vis_to_float(0x80808080);
    mlib_d64 negate = vis_to_double_dup(0x7FFF7FFF);
    mlib_d64 xor_mask = vis_to_double_dup(0x80008000);
    mlib_d64 sat_offset = vis_to_double_dup(0x20000000 >> (16 - shift));
    mlib_s32 k;

    /* 4-pixel loop */
#pragma pipeloop(0)
    for (k = 0; k < (dsize / 4); k++) {
        sd = in[k];
        MLIB_V_IMAGESQRSHIFT_U16(sd, dd);
        out[k] = dd;
    }
}
/*
 * IntArgbBm -> IntArgb blit.  Only the beginning of the function is
 * visible in this excerpt: the row loop and the function body are not
 * closed here.
 *
 * What the visible code does:
 *  - If both scan strides equal 4 * width, the image is treated as one
 *    long row (width *= height, height = 1).
 *  - A single pixel is converted in scalar code as (x << 7) >> 7: bit 24
 *    is moved to the sign position and shifted back, so with an
 *    arithmetic right shift it fills bits 24..31.
 *  - If the low three bits of dst are non-zero, one leading pixel is
 *    converted with the scalar formula and the paired loop starts at
 *    index 1.
 *  - The paired loop handles two pixels per iteration: both pixels are
 *    ANDed with dmask (0x00FFFFFF per pixel) and stored as one 64-bit
 *    word, then vis_pst_8 stores bytes of dFF (all ones) under a mask
 *    whose bits 7 and 3 come from bit 0 of the first byte of each source
 *    pixel.  The #else branch is compiled out by "#if 1".
 *  - A trailing odd pixel is converted with the scalar formula.
 *
 * NOTE(review): taking the flag from the first byte of each pixel assumes
 * a big-endian pixel layout - confirm.
 * NOTE(review): srcBase / dstBase are not advanced in the visible part;
 * presumably that happens in the truncated remainder of the row loop.
 */
void ADD_SUFF(IntArgbBmToIntArgbConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 dd, dmask, dFF; mlib_s32 i, i0, j, x, mask; if (dstScan == 4*width && srcScan == 4*width) { width *= height; height = 1; } dmask = vis_to_double_dup(0xFFFFFF); dFF = vis_to_double_dup(0xFFFFFFFF); for (j = 0; j < height; j++) { mlib_s32 *src = srcBase; mlib_s32 *dst = dstBase; i = i0 = 0; if ((mlib_s32)dst & 7) { x = src[i]; dst[i] = (x << 7) >> 7; i0 = 1; } #pragma pipeloop(0) for (i = i0; i <= (mlib_s32)width - 2; i += 2) { mlib_u8 *pp0 = (mlib_u8*)(src + i); mlib_u8 *pp1 = (mlib_u8*)(src + i + 1); dd = vis_freg_pair(*(mlib_f32*)pp0, *(mlib_f32*)pp1); dd = vis_fand(dd, dmask); #if 1 mask = ((*pp0 & 1) << 7) | ((*pp1 & 1) << 3); *(mlib_d64*)(dst + i) = dd; vis_pst_8(dFF, dst + i, mask); #else mask = ((*pp0 & 1) << 1) | (*pp1 & 1); dd = vis_for(dd, ((mlib_d64*)vis_amask_arr)[mask]); *(mlib_d64*)(dst + i) = dd; #endif } if (i < width) { x = src[i]; dst[i] = (x << 7) >> 7; }
/*
 * Fill a 1-bit, single-channel image with a constant colour.  Only the
 * beginning of the function is visible in this excerpt; the body is cut
 * off inside the row loop.
 *
 *   img     image to clear; data pointer, height, width, stride and bit
 *           offset are read through the mlib_ImageGet* accessors
 *   color   color[0] & 1 selects the bit value written
 *
 * What the visible code does:
 *  - If width == stride * 8 the rows are contiguous, so the image is
 *    treated as a single row of width * height bits.
 *  - color0 is built as ((color[0] & 1) << 31) >> 31, i.e. the selected
 *    bit is meant to be replicated across the whole 32-bit word;
 *    bcolor0 is its low byte and dcolor the 64-bit duplicate.
 *  - Per row: when bit offset + width fits within the first byte, only
 *    the covered bits of that byte are replaced (read-modify-write with
 *    bmask) and the loop continues with the next row.
 *
 * NOTE(review): the replication relies on the shift being done in a
 * signed type with an arithmetic right shift - confirm for the target
 * compiler.
 */
void mlib_v_ImageClear_BIT_1(mlib_image *img, const mlib_s32 *color) { mlib_u8 *pimg = (mlib_u8 *) mlib_ImageGetData(img); mlib_s32 img_height = mlib_ImageGetHeight(img); mlib_s32 img_width = mlib_ImageGetWidth(img); mlib_s32 img_stride = mlib_ImageGetStride(img); mlib_s32 img_bitoff = mlib_ImageGetBitOffset(img); mlib_s32 i, j, b_j, k; mlib_u8 bcolor0, bmask, emask, src; mlib_d64 dcolor, *dpimg; mlib_u32 color0; if (img_width == img_stride * 8) { img_width *= img_height; img_height = 1; } color0 = ((color[0] & 1) << 31) >> 31; bcolor0 = color0 & 0xFF; dcolor = vis_to_double_dup(color0); for (i = 0, j = 0; i < img_height; i++) { mlib_u8 *pimg_row = pimg + i * img_stride, *pimg_row_end; if (img_bitoff + img_width <= 8) { bmask = (0xFF >> (8 - img_width)) << (8 - img_bitoff - img_width); src = pimg_row[0]; pimg_row[0] = (src & ~bmask) | (color0 & bmask); continue; } else {
/*
 * Copy size bytes from src to dst while XOR-ing the data with the
 * repeating 16-bit pattern 0x8000.
 *
 *   src    source bytes; may have any alignment relative to dst
 *   dst    destination bytes
 *   size   number of bytes to produce
 *
 * The destination is written in whole 8-byte words.  The first and last
 * words are written with vis_pst_8 under an edge mask so that bytes
 * outside [dst, dst + size - 1] are not modified.  Source words are
 * fetched from 8-byte aligned addresses and realigned with
 * vis_faligndata; loads that may fall outside the source use
 * vis_ld_d64_nf.
 */
void
mlib_ImageLineXor8000(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 size)
{
    mlib_d64 mask8000 = vis_to_double_dup(0x80008000);
    /* last destination byte */
    mlib_u8 *last;
    /* current destination word and current source word */
    mlib_d64 *out;
    mlib_d64 *in;
    /* two neighbouring source words and the combined result */
    mlib_d64 cur, nxt;
    /* offset of the current destination word relative to dst */
    mlib_s32 pos;
    mlib_s32 edge;

    /* destination rounded down to an 8-byte boundary; pos is <= 0 here */
    out = (mlib_d64 *)((mlib_addr)dst & (~7));
    pos = (mlib_addr)out - (mlib_addr)dst;
    last = (mlib_u8 *)dst + size - 1;

    /* source address that corresponds to the rounded-down destination */
    in = (mlib_d64 *)VIS_ALIGNADDR(src, pos);

    /* edge mask for the first destination word */
    edge = vis_edge8(dst, last);

    nxt = vis_ld_d64_nf(in);

    if (edge != 0xff) {
        /* first word is only partially covered: masked store */
        cur = nxt;
        nxt = vis_ld_d64_nf(in + 1);
        in++;
        cur = vis_fxor(vis_faligndata(cur, nxt), mask8000);
        vis_pst_8(cur, out, edge);
        out++;
        pos += 8;
    }

    /* full words whose following source word is known to be readable */
#pragma pipeloop(0)
    for (; pos <= (size - 16); pos += 8) {
        cur = nxt;
        in++;
        nxt = in[0];
        *out = vis_fxor(vis_faligndata(cur, nxt), mask8000);
        out++;
    }

    /* last full word: the look-ahead load may run past the source */
    if (pos <= (size - 8)) {
        cur = nxt;
        nxt = vis_ld_d64_nf(in + 1);
        in++;
        *out = vis_fxor(vis_faligndata(cur, nxt), mask8000);
        out++;
        pos += 8;
    }

    /* trailing partial word: masked store */
    if (pos < size) {
        cur = vis_fxor(vis_faligndata(nxt, vis_ld_d64_nf(in + 1)), mask8000);
        edge = vis_edge8(out, last);
        vis_pst_8(cur, out, edge);
    }
}
/*
 * Compute one line of table-driven affine interpolation into buff.
 *
 *   buff      output buffer; results are written as 16-bit values
 *   filterX   horizontal filter table (consumed by the macros)
 *   filterY   vertical filter table (consumed by the macros)
 *   lineAddr  source row address table (consumed by the macros)
 *   ws        affine workspace (consumed by the macros)
 *
 * Most identifiers used below (i, size, buff1, res1, sPtr, fx0, xFilter,
 * yFilter, row00, row10, sum, v0..v3, res, vis_scale, ...) are not
 * declared in the visible text; presumably they come from DECLAREVAR /
 * DECLAREVAR2 and the other macros - confirm in the macro definitions.
 *
 * Visible structure:
 *  - The GSR is written with (0x0145ABEF << 32) + 4.
 *  - Main loop: two outputs per iteration, built entirely by
 *    CALC_2_SRC_PTR, LOAD_3x2, FILTER_MERGE_4x2 and MAKE_4x2; the result
 *    res1 is stored through buff1.
 *  - Tail loop: one output per iteration.  Three source rows (row00,
 *    row10, row20) are weighted with parts of yFilter and summed, the sum
 *    is multiplied by xFilter (fmul8sux16 + fmul8ulx16), scaled by
 *    vis_scale, the two 32-bit halves are added, and the result is stored
 *    with vis_st_u16.
 */
void mlib_v_ImageAffineTableLine_8nw_3_2_1( mlib_d64 *buff, const mlib_d64 *filterX, const mlib_d64 *filterY, const mlib_u8 **lineAddr, mlib_affine_workspace *ws) { DECLAREVAR; DECLAREVAR2; mlib_d64 yFilter2; mlib_d64 yFilter3; mlib_d64 row20, row30; mlib_d64 *dpSrc; mlib_d64 data0, data1, zero; vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4); dstPixelPtr = (mlib_s16 *)buff; zero = vis_to_double_dup(0); #pragma pipeloop(0) for (i = 0; i <= size - 2; i += 2) { CALC_2_SRC_PTR; LOAD_3x2; FILTER_MERGE_4x2; MAKE_4x2; *buff1 = res1; buff1++; } dstPixelPtr = (mlib_s16 *)buff1; for (; i < size; i++) { CALC_SRC_PTR(sPtr); LOAD_FILTERS(fx0, yFilter); xFilter = vis_write_hi(xFilter, fx0); LOAD_PIXEL_3; v0 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); v1 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); sum = vis_fpadd16(v0, v1); v0 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); sum = vis_fpadd16(v0, sum); v0 = vis_fmul8sux16(sum, xFilter); v1 = vis_fmul8ulx16(sum, xFilter); v3 = vis_fpadd16(v1, v0); v2 = vis_fmuld8ulx16(vis_scale, vis_read_hi(v3)); res = vis_write_lo(res, vis_fpadd32s(vis_read_hi(v2), vis_read_lo(v2))); vis_st_u16(res, dstPixelPtr++); } }
/*
 * Window/level conversion of S16 samples to U8, variant 1 (8-bit
 * multiplier).
 *
 *   dst, src, len   not referenced in the visible text; presumably
 *                   consumed by PRE_LOOP / MAIN_LOOP / END_LOOP
 *   window, level   window width and level; window >= 1 << 15 selects a
 *                   special case below
 *   gmax, gmin      output range limits
 *
 * a, scale, ia, dgmin and dwin are not declared here; presumably they are
 * declared and initialised by INIT_VARS - confirm in the macro.
 *
 * Visible steps:
 *  - a is doubled (and scale incremented) while 2 * a < 1 and scale < 7.
 *  - The GSR is written with (7 - scale) << 3.
 *  - ia = a * 256 + 0.5, clamped to MLIB_U8_MAX, is replicated into all
 *    four bytes of a 32-bit word that is then reinterpreted as the
 *    mlib_f32 constant A.
 *  - dgmin holds gmin in both 16-bit halves, shifted left by scale.
 *  - For window >= 1 << 15, dwin is zeroed and dgmin is rebuilt from
 *    ((gmax + gmin) << scale) * 0.5 - level * a.
 *
 * NOTE(review): (ia << 24) shifts into the sign bit when ia >= 128 if ia
 * is a signed 32-bit int - confirm the type of ia.
 */
static void mlib_VolumeWindowLevel1( mlib_u8 *dst, const mlib_s16 *src, mlib_s32 window, mlib_s32 level, mlib_s32 gmax, mlib_s32 gmin, mlib_s32 len) { INIT_VARS; mlib_s32 ia4[1]; mlib_f32 A; while (2 * a < 1 && scale < 7) { a *= 2; scale++; } vis_write_gsr((7 - scale) << 3); ia = a * 256.0 + 0.5; if (ia > MLIB_U8_MAX) ia = MLIB_U8_MAX; ia4[0] = (ia << 24) | (ia << 16) | (ia << 8) | ia; A = *(mlib_f32 *)ia4; dgmin = vis_to_double_dup(((gmin << 16) | gmin) << scale); if (window >= (1 << 15)) { dwin = 0; ia = ((gmax + gmin) << scale) * 0.5 - level * a; dgmin = vis_to_double_dup((ia << 16) | (ia & 0xFFFF)); } PRE_LOOP(MLIB_CALC1); #pragma pipeloop(0) MAIN_LOOP(MLIB_CALC1); END_LOOP(MLIB_CALC1); }
/*
 * U8 colour blend, generated through DEF_FUNC.  Only the beginning of the
 * function is visible in this excerpt; the body is cut off at the start of
 * the 3-channel branch.
 *
 * width, channel, color and alpha are not declared in the visible text;
 * presumably they are parameters supplied by DEF_FUNC - confirm in the
 * macro.
 *
 * What the visible code does:
 *  - Writes the GSR with 7 << 3.
 *  - Converts width to a count of samples (width *= channel) and derives
 *    ww, the number of 8-byte words per line, rounded up to a multiple
 *    of 3 for 3-channel images.
 *  - Allocates 2 * ww mlib_d64 words with __mlib_malloc and returns
 *    MLIB_FAILURE if that fails; buffd is the second half of the block.
 *  - 4 channels: cc is built from color[0..3], OR-ed with
 *    mlib_dmask_arr[8 >> alpha] and masked with 0x00FF per 16-bit lane;
 *    alp_tbl points at entry alpha * 256 of mlib_alp_tbl.
 */
DEF_FUNC(mlib_ImageBlendColor_U8, mlib_u8, mlib_s32) { mlib_f32 fzeros = vis_fzeros(); mlib_f32 fmax = vis_to_float(0xFFFFFFFF); mlib_d64 dmask = vis_to_double_dup(0x00FF00FF); mlib_d64 done = vis_to_double_dup(0x01000100); mlib_d64 *buffs, *buffd; mlib_d64 *sp, *dp; mlib_f32 *alp_tbl; mlib_d64 ss, s1, rr, tt, d0, d1; mlib_d64 cc, c0, c1, c2; mlib_d64 amask0, amask1, amask2; mlib_s32 ww, dflag, i, j; vis_write_gsr(7 << 3); width *= channel; ww = (width + 7) / 8; if (channel == 3) { ww = 3 * ((ww + 2) / 3); } buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww); if (buffs == NULL) { return (MLIB_FAILURE); } buffd = buffs + ww; if (channel == 4) { cc = DOUBLE_4U16(color[0], color[1], color[2], color[3]); cc = vis_fand(vis_for(cc, ((mlib_d64 *)mlib_dmask_arr)[8 >> alpha]), dmask); alp_tbl = (mlib_f32 *)mlib_alp_tbl + alpha * 256; } else if (channel == 3) {
/*
 * Square a contiguous run of U16 pixels, one 64-bit word (four pixels)
 * at a time.
 *
 *   src, dst   source / destination pixels, accessed as mlib_d64 words
 *   dsize      number of pixels; dsize / 4 words are processed
 *
 * The per-word arithmetic is done by MLIB_V_IMAGESQUARE_U16, which is
 * defined outside this function.
 *
 * NOTE(review): the A8 / X4 suffix suggests callers guarantee 8-byte
 * alignment and dsize % 4 == 0 - confirm against the dispatcher.
 */
void
mlib_v_ImageSquare_U16_A8D1X4(
    mlib_u16 *src,
    mlib_u16 *dst,
    mlib_s32 dsize)
{
    mlib_d64 *in = (mlib_d64 *)src;
    mlib_d64 *out = (mlib_d64 *)dst;
    /* one input word and the matching output word */
    mlib_d64 sd, dd;
    /* temporaries used in macro */
    mlib_d64 rdh, rdl;
    /* constants that are not referenced here directly (presumably by the macro) */
    mlib_d64 xor_mask = vis_to_double_dup(0x80008000);
    mlib_d64 sat_offset = vis_to_double_dup(0x40004000);
    mlib_s32 k;

    /* 4-pixel loop */
#pragma pipeloop(0)
    for (k = 0; k < (dsize / 4); k++) {
        sd = in[k];
        MLIB_V_IMAGESQUARE_U16(sd, dd);
        out[k] = dd;
    }
}
/*
 * Window/level conversion of S16 samples to U8, variant 2 (16-bit
 * multiplier).
 *
 *   dst, src, len   not referenced in the visible text; presumably
 *                   consumed by PRE_LOOP / MAIN_LOOP / END_LOOP
 *   window, level   not referenced in the visible text; presumably used
 *                   by INIT_VARS
 *   gmax            not referenced in the visible text
 *   gmin            lower output limit, used to build dgmin
 *
 * a, scale, ia and dgmin are not declared here; presumably they are
 * declared and initialised by INIT_VARS - confirm in the macro.
 *
 * Visible steps:
 *  - a is doubled (and scale incremented) while 2 * a < (1 << 7) and
 *    scale < 7.
 *  - The GSR is written with (7 - scale) << 3 and the byte-shuffle mask
 *    is set with WRITE_BMASK(0x13579BDF).
 *  - ia = a * 256 + 0.5, clamped to MLIB_S16_MAX, is placed in both
 *    16-bit halves of each word of A.
 *  - dgmin holds gmin in both 16-bit halves, shifted left by scale.
 */
static void mlib_VolumeWindowLevel2( mlib_u8 *dst, const mlib_s16 *src, mlib_s32 window, mlib_s32 level, mlib_s32 gmax, mlib_s32 gmin, mlib_s32 len) { INIT_VARS; mlib_d64 A; while (2 * a < (1 << 7) && scale < 7) { a *= 2; scale++; } vis_write_gsr((7 - scale) << 3); WRITE_BMASK(0x13579BDF); ia = a * 256.0 + 0.5; if (ia > MLIB_S16_MAX) ia = MLIB_S16_MAX; A = vis_to_double_dup((ia << 16) | (ia & 0xFFFF)); dgmin = vis_to_double_dup(((gmin << 16) | gmin) << scale); PRE_LOOP(MLIB_CALC2); #pragma pipeloop(0) MAIN_LOOP(MLIB_CALC2); END_LOOP(MLIB_CALC2); }
/*
 * Square-and-shift a contiguous run of U16 pixels for the "small shift"
 * case, one 64-bit word (four pixels) at a time.
 *
 *   src, dst   source / destination pixels, accessed as mlib_d64 words
 *   dsize      number of pixels; dsize / 4 words are processed
 *   shift      shift amount; used here to derive dscale
 *              (0x10000 >> shift) and sat_offset
 *
 * The per-word arithmetic is done by MLIB_V_IMAGESQRSMALLSHIFT_U16, which
 * is defined outside this function.
 */
void
mlib_v_ImageSqrSmallShift_U16_A8D1X4(
    mlib_u16 *src,
    mlib_u16 *dst,
    mlib_s32 dsize,
    mlib_s32 shift)
{
    mlib_d64 *in = (mlib_d64 *)src;
    mlib_d64 *out = (mlib_d64 *)dst;
    /* one input word and the matching output word */
    mlib_d64 sd, dd;
    /* temporaries used in macro */
    mlib_d64 rdhh, rdhl;
    mlib_d64 rdlh, rdll;
    mlib_d64 rdh, rdl;
    mlib_d64 sdad, rdh_0, rdh_1, rdl_0, rdl_1;
    /* constants that are not referenced here directly (presumably by the macro) */
    mlib_d64 offset = vis_to_double_dup(0x80008000);
    mlib_d64 dscale = (mlib_d64)(0x10000 >> shift);
    mlib_d64 sat_offset =
        (mlib_d64)(0x40000000) * dscale - ((mlib_d64)0x80000000);
    mlib_s32 k;

    /* 4-pixel loop */
#pragma pipeloop(0)
    for (k = 0; k < (dsize / 4); k++) {
        sd = in[k];
        MLIB_V_IMAGESQRSMALLSHIFT_U16(sd, dd);
        out[k] = dd;
    }
}
/*
 * 4:2:2 horizontal down-sampling: every pair of neighbouring source
 * bytes is replaced by its average.
 *
 *   dst   destination; receives n / 2 bytes.  Written through an
 *         mlib_f32 pointer in the vector loop.
 *   src   source; read through an mlib_d64 pointer in the vector loop.
 *   n     number of source bytes
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * Rounding alternates between output samples: a bias of 0 is added for
 * even output indices and 1 for odd ones, both in the vector loop (via
 * round) and in the scalar tail (via bias).
 *
 * A trailing unpaired source byte (odd n) is ignored.  The scalar tail
 * used to run while i < n, which for odd n read src[n] past the end of
 * the source and produced one extra output byte.
 *
 * NOTE(review): the casts assume dst is 4-byte and src 8-byte aligned -
 * confirm against the callers.
 */
mlib_status
__mlib_VideoDownSample422(
    mlib_u8 *dst,
    const mlib_u8 *src,
    mlib_s32 n)
{
    mlib_d64 *sp0 = (mlib_d64 *)src;
    mlib_f32 *pd = (mlib_f32 *)dst;
    mlib_d64 d0;
    mlib_d64 tmp0, tmp1, data;
    mlib_d64 acc0_hi, acc0_lo;
    /* 0x00000001 in each 32-bit half: +1 on every second 16-bit lane */
    mlib_d64 round = vis_to_double_dup(0x1);
    mlib_f32 fone = vis_to_float(0x1000000);
    mlib_s32 i, bias = 0;

    if (n <= 0)
        return (MLIB_FAILURE);

    vis_write_gsr(6 << 3);

    /* 8 source bytes -> 4 destination bytes per iteration */
#pragma pipeloop(0)
    for (i = 0; i <= n - 8; i += 8) {
        d0 = (*sp0++);
        /* two merges gather the even bytes in the high half, odd in the low */
        tmp0 = vis_fpmerge(vis_read_hi(d0), vis_read_lo(d0));
        tmp1 = vis_fpmerge(vis_read_hi(tmp0), vis_read_lo(tmp0));
        acc0_hi = vis_fmul8x16au(vis_read_hi(tmp1), fone);
        acc0_lo = vis_fmul8x16au(vis_read_lo(tmp1), fone);
        acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
        data = vis_fpadd16(acc0_hi, round);
        (*pd++) = vis_fpack16(data);
    }

    dst = (mlib_u8 *)pd;

    /* scalar tail: only complete pairs, so src[i + 1] stays in range */
    for (; i + 1 < n; i += 2) {
        (*dst++) = (src[i] + src[i + 1] + bias) >> 1;
        /* 0=>1, 1=>0 */
        bias ^= 1;
    }

    return (MLIB_SUCCESS);
}
/*
 * IntRgbx -> IntArgb blit.  Only the beginning of the function is visible
 * in this excerpt: the row loop and the function body are not closed
 * here.
 *
 * What the visible code does:
 *  - If both scan strides equal 4 * width, the image is treated as one
 *    long row (width *= height, height = 1).
 *  - A single pixel is converted in scalar code as
 *    0xff000000 | (src >> 8): the three colour bytes move down by one
 *    byte and the top byte is forced to 0xff.
 *  - vis_alignaddr(NULL, 7) is issued once before the loops so that
 *    vis_faligndata(dd, dd) in the paired loop moves every byte of the
 *    64-bit pair one position (the byte-wise counterpart of the scalar
 *    ">> 8"); the result is OR-ed with 0xFF000000 per pixel and stored
 *    as one 64-bit word.
 *  - If the low three bits of dst are non-zero, one leading pixel is
 *    converted with the scalar formula and the paired loop starts at
 *    index 1; a trailing odd pixel is also handled in scalar code.
 *
 * NOTE(review): srcBase / dstBase are not advanced in the visible part;
 * presumably that happens in the truncated remainder of the row loop.
 */
void ADD_SUFF(IntRgbxToIntArgbConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 dd, mask; mlib_s32 i, i0, j; if (dstScan == 4*width && srcScan == 4*width) { width *= height; height = 1; } mask = vis_to_double_dup(0xFF000000); vis_alignaddr(NULL, 7); for (j = 0; j < height; j++) { mlib_u32 *src = srcBase; mlib_u32 *dst = dstBase; i = i0 = 0; if ((mlib_s32)dst & 7) { dst[i] = 0xff000000 | (src[i] >> 8); i0 = 1; } #pragma pipeloop(0) for (i = i0; i <= (mlib_s32)width - 2; i += 2) { dd = vis_freg_pair(((mlib_f32*)src)[i], ((mlib_f32*)src)[i + 1]); dd = vis_faligndata(dd, dd); *(mlib_d64*)(dst + i) = vis_for(dd, mask); } if (i < width) { dst[i] = 0xff000000 | (src[i] >> 8); }
/*
 * Convert a vector of signed 8-bit values to signed 16-bit values.
 *
 *   z   destination vector (mlib_s16)
 *   x   source vector (mlib_s8)
 *   n   number of elements
 *
 * Returns MLIB_SUCCESS at the end of the vector path.
 *
 * Structure:
 *  - Vectors shorter than 16 elements are handed to EXPAND(mlib_s8,
 *    mlib_s16); presumably that macro converts element by element and
 *    returns - confirm in the macro definition.
 *  - Leading elements are copied one by one until dst is 8-byte aligned.
 *  - The remaining length is split into len_64 groups of 8 elements and
 *    rest_64 trailing elements.
 *  - Aligned source (off == 0): each 8-byte source word is expanded into
 *    two destination words with vis_fpmerge(..., fzero) followed by
 *    vis_fmul8sux16(..., 0x0100 per lane).  One group is peeled when
 *    len_64 is odd, then the loop handles two groups per iteration.
 *  - Unaligned source: vis_write_bmask(0x11111111 * off, 0x04152637)
 *    makes vis_bshuffle pick the 8 wanted bytes out of two neighbouring
 *    source words in interleaved order; vis_faligndata with an offset of
 *    1 then exposes the other four bytes to vis_fmul8sux16.  Source
 *    words beyond the first are read with vis_ld_d64_nf.
 *  - The rest_64 trailing elements are copied one by one.
 *
 * Note that "if (i = (len_64 & 1))" is an intentional assignment: it both
 * tests for an odd group count and initialises the loop counter.
 */
mlib_status __mlib_VectorConvert_S16_S8_Mod( mlib_s16 *z, const mlib_s8 *x, mlib_s32 n) { mlib_s32 i; const mlib_s8 *src = x; mlib_s16 *dst = z; mlib_d64 *ddsrc, *ddst; mlib_d64 four_16_ones = vis_to_double_dup(0x01000100); mlib_f32 fzero = vis_fzeros(); mlib_s32 len_64, even_length, rest_64, length = n, off; mlib_d64 dd0, dd1, dd2, dd4, dd5, dd6, dd7; if (length < 16) { EXPAND(mlib_s8, mlib_s16); } while ((mlib_addr)dst & 7) { (*dst++) = (*src++); length--; } ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0); ddst = (mlib_d64 *)dst; rest_64 = length & 7; len_64 = length >> 3; even_length = len_64 << 3; dd2 = ddsrc[0]; off = (mlib_addr)src & 7; if (!off) { /* * Both vectors are 64-bit aligned. */ /* * Peeling of 1 iteration. */ if (i = (len_64 & 1)) { dd1 = (*ddsrc++); (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1), fzero), four_16_ones); (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1), fzero), four_16_ones); } #pragma pipeloop(0) #pragma unroll(4) for (; i < len_64; i += 2) { dd1 = (*ddsrc++); dd2 = (*ddsrc++); (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1), fzero), four_16_ones); (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1), fzero), four_16_ones); (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd2), fzero), four_16_ones); (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd2), fzero), four_16_ones); } } else { /* * Source vector is not 64-bit aligned. * Peeling of 1 iteration. Then loop with step==2. 
 */ vis_alignaddr((void *)0, 1); vis_write_bmask(0x11111111 * off, 0x04152637); i = 1; if (len_64 & 1) { dd1 = dd2; dd2 = vis_ld_d64_nf(ddsrc + 1); i++; dd4 = vis_bshuffle(dd1, dd2); dd5 = vis_faligndata(dd4, dd4); (*ddst++) = vis_fmul8sux16(dd4, four_16_ones); (*ddst++) = vis_fmul8sux16(dd5, four_16_ones); } #pragma pipeloop(0) #pragma unroll(4) for (; i <= len_64; i += 2) { dd0 = dd2; dd1 = vis_ld_d64_nf(ddsrc + i); dd2 = vis_ld_d64_nf(ddsrc + i + 1); dd4 = vis_bshuffle(dd0, dd1); dd6 = vis_bshuffle(dd1, dd2); dd5 = vis_faligndata(dd4, dd4); dd7 = vis_faligndata(dd6, dd6); (*ddst++) = vis_fmul8sux16(dd4, four_16_ones); (*ddst++) = vis_fmul8sux16(dd5, four_16_ones); (*ddst++) = vis_fmul8sux16(dd6, four_16_ones); (*ddst++) = vis_fmul8sux16(dd7, four_16_ones); } } for (i = 0; i < rest_64; i++) dst[even_length + i] = src[even_length + i]; return (MLIB_SUCCESS); }
/*
 * YUV 4:4:4 -> interleaved RGB conversion for planes with arbitrary
 * alignment.
 *
 *   rgb          destination, 3 bytes per pixel
 *   y, u, v      source planes, one byte per pixel each
 *   width        pixels per row
 *   height       number of rows
 *   rgb_stride   bytes between destination rows
 *   yuv_stride   bytes between rows of each source plane
 *
 * Returns MLIB_FAILURE if the temporary row buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 *
 * Structure:
 *  - Each row is converted into an 8-byte aligned scratch buffer and then
 *    copied to rgb with __mlib_VectorCopy_U8(rgb, buf, width * 3).  The
 *    on-stack BUFF is used when width * 3 <= 16 * 1024; otherwise
 *    width * 3 + 7 bytes are allocated with __mlib_malloc and freed
 *    before returning.
 *  - The row code is software-pipelined: the prologue loads and converts
 *    the first 8 pixels, and every loop iteration stores the previously
 *    computed red / green / blue words (interleaved into three 64-bit
 *    words with six vis_bshuffle steps) while computing the next 8.
 *  - Source planes are read from aligned addresses and realigned with
 *    vis_alignaddr / vis_faligndata; because the three planes may have
 *    different offsets, vis_alignaddr is re-issued before each plane's
 *    vis_faligndata.  Look-ahead loads use vis_ld_d64_nf.
 *  - The last width % 8 pixels are written one at a time by STORE_PIXEL,
 *    walking dp backwards by 3 bytes per pixel.
 *
 * NOTE(review): BUFF is declared as 16 * 1024 mlib_d64 elements, while
 * the size test compares width * 3 bytes with 16 * 1024; the stack buffer
 * looks larger than needed - confirm before changing.
 * NOTE(review): the statement order around vis_alignaddr and
 * vis_write_bmask matters because they set state that the following
 * vis_faligndata / vis_bshuffle calls read.
 */
static mlib_status mlib_v_VideoColorYUV2RGB444_nonalign( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 rgb_stride, mlib_s32 yuv_stride) { /* all. pointer to y, u, v */ mlib_d64 *spy, *dfu, *dfv; /* y data */ mlib_d64 dy0, dy1, dy3; mlib_d64 du, dv, du0, du1, dv0, dv1; /* (1.1644, 1.5966)*8192 */ mlib_f32 k12 = vis_to_float(0x25433317); /* (-.3920, -.8132)*8192 */ mlib_f32 k34 = vis_to_float(0xf375e5fa); /* 2.0184*8192 */ mlib_f32 k5 = vis_to_float(0x1004097); mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0); mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4); mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0); mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi; mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo; mlib_d64 y_11644_hi, y_11644_lo; mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo; mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2; /* loop variable */ mlib_s32 i, j; mlib_d64 *buf, BUFF[16 * 1024]; mlib_u8 *tmp, *dp; if (width * 3 > 16 * 1024) { tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); if (tmp == NULL) return (MLIB_FAILURE); buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7); } else { buf = (mlib_d64 *)BUFF; } dp = (mlib_u8 *)buf; ddp = (mlib_d64 *)dp; for (j = 0; j < height; j++) { dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0); du0 = (*dfu++); du1 = vis_ld_d64_nf(dfu); dfu++; du = vis_faligndata(du0, du1); du0 = du1; dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0); dv0 = (*dfv++); dv1 = vis_ld_d64_nf(dfv); dfv++; dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); /* U*(-0.3920); */ u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); /* V*(-0.8132); */ v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); spy = (mlib_d64 *)vis_alignaddr((void *)y, 0); dy0 = (*spy++); dy3 = vis_ld_d64_nf(spy); spy++; dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; /* U*2.0184 */ 
u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); vis_alignaddr((void *)u, 0); du1 = vis_ld_d64_nf(dfu); dfu++; du = vis_faligndata(du0, du1); du0 = du1; /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); vis_alignaddr((void *)v, 0); dv1 = vis_ld_d64_nf(dfv); dfv++; dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); g_hi = vis_fpadd16(g_hi, y_11644_hi); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); g_lo = vis_fpadd16(g_lo, y_11644_lo); green = vis_fpack16_pair(g_hi, g_lo); b_hi = vis_fpadd16(b_hi, y_11644_hi); b_lo = vis_fpadd16(b_lo, y_11644_lo); blue = vis_fpack16_pair(b_hi, b_lo); r_hi = vis_fpadd16(r_hi, y_11644_hi); r_lo = vis_fpadd16(r_lo, y_11644_lo); red = vis_fpack16_pair(r_hi, r_lo); vis_alignaddr((void *)y, 0); dy3 = vis_ld_d64_nf(spy); spy++; dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; #pragma pipeloop(0) for (i = 0; i <= width - 8; i += 8) { vis_write_bmask(0x0801902A, 0); dd0 = vis_bshuffle(red, green); vis_write_bmask(0x03B04C05, 0); dd1 = vis_bshuffle(red, green); vis_write_bmask(0xD06E07F0, 0); dd2 = vis_bshuffle(red, green); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd0, blue); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd1, blue); vis_write_bmask(0x0D23E56F, 0); ddp[2] = 
vis_bshuffle(dd2, blue); /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); vis_alignaddr((void *)u, 0); du1 = vis_ld_d64_nf(dfu); dfu++; du = vis_faligndata(du0, du1); du0 = du1; /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); vis_alignaddr((void *)v, 0); dv1 = vis_ld_d64_nf(dfv); dfv++; dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); g_hi = vis_fpadd16(g_hi, y_11644_hi); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); g_lo = vis_fpadd16(g_lo, y_11644_lo); green = vis_fpack16_pair(g_hi, g_lo); b_hi = vis_fpadd16(b_hi, y_11644_hi); b_lo = vis_fpadd16(b_lo, y_11644_lo); blue = vis_fpack16_pair(b_hi, b_lo); r_hi = vis_fpadd16(r_hi, y_11644_hi); r_lo = vis_fpadd16(r_lo, y_11644_lo); red = vis_fpack16_pair(r_hi, r_lo); vis_alignaddr((void *)y, 0); dy3 = vis_ld_d64_nf(spy); spy++; dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; ddp += 3; } dp = (mlib_u8 *)ddp; vis_alignaddr((void *)(width - i), 0); blue = vis_faligndata(blue, blue); green = vis_faligndata(green, green); red = vis_faligndata(red, red); dp += ((width - i - 1) * 3); vis_alignaddr((void *)spy, 7); for (; i < width; i++) { STORE_PIXEL(0, 1, 2); dp -= 3; } __mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3); rgb += rgb_stride; dp = (mlib_u8 
*)buf; ddp = (mlib_d64 *)dp; y += yuv_stride; u += yuv_stride; v += yuv_stride; } if (width * 3 > 16 * 1024) __mlib_free(tmp); return (MLIB_SUCCESS); }
/*
 * YUV 4:4:4 -> interleaved RGB conversion of one run of pixels.
 *
 *   rgb       destination, 3 bytes per pixel
 *   y, u, v   source planes, read four bytes at a time through mlib_f32
 *             pointers
 *   size      number of pixels
 *
 * Structure:
 *  - The run is processed in chunks of at most 2 * BUFF_SIZE pixels.
 *  - Main loop: 4 pixels per iteration.  Each plane contributes one
 *    32-bit word; the products and offsets are summed per output byte
 *    position, packed with vis_fpack16, and rearranged with
 *    vis_fpmerge / vis_faligndata / vis_bshuffle into three 32-bit
 *    stores (12 bytes).
 *  - Tail: if output bytes remain (pfd <= dend), one more group is
 *    computed, staged in tmp_arr64, and written with vis_pst_8 under
 *    vis_edge8 masks so bytes after dend are not modified.  When pfd is
 *    not 8-byte aligned it is stepped back by one word and the staged
 *    data is shifted by one word to match; the edge mask computed from
 *    the original pfd protects the bytes before it.
 *
 * NOTE(review): this function neither writes the GSR nor sets the
 * bshuffle mask, although it uses vis_fpack16, vis_faligndata and
 * vis_bshuffle; presumably the caller sets them up - confirm.
 * NOTE(review): pbuff_arr2 / buff2 are only advanced, never read or
 * written through, in the visible code.
 * NOTE(review): the tail reads a full 32-bit word from each plane even
 * when fewer than 4 pixels remain.
 */
void mlib_v_VideoColorYUV2RGB444_all_align( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4]; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f375); mlib_d64 k02 = vis_to_double_dup(0x3317e5fa); mlib_d64 k11 = vis_to_double_dup(0xf3754097); mlib_d64 k12 = vis_to_double_dup(0xe5fa0000); mlib_d64 k21 = vis_to_double_dup(0x40970000); mlib_d64 k22 = vis_to_double_dup(0x00003317); mlib_d64 c_0 = vis_to_double_dup(0xe42010f4); mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60); mlib_d64 c_2 = vis_to_double_dup(0xdd60e420); mlib_d64 k_0 = vis_to_double_dup(0x25432543); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = n >> 2; buff2 = pbuff_arr2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)u; sf2 = (mlib_f32 *)v; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; mlib_d64 d_0235, d_xx14, d_23xx, d_0145; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); /* * merge buff values to 3-channel array */ d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); pfd[0] = vis_read_hi(d_0145); pfd[1] = vis_read_hi(d_23xx); pfd[2] = vis_read_lo(d_0145); buff2 += 
2; pfd += 3; } if ((mlib_u8 *)pfd <= dend) { mlib_d64 d_0235, d_xx14, d_23xx, d_0145; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_23xx); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; u += n; v += n; rgb += 3 * n; size -= n; } while (size); }
/*
 * 4:2:2 horizontal down-sampling (vis_bshuffle variant): every pair of
 * neighbouring source bytes is replaced by its average.
 *
 *   dst   destination; receives n / 2 bytes.  Written through an
 *         mlib_d64 pointer.
 *   src   source; read through an mlib_d64 pointer.
 *   n     number of source bytes
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The byte-shuffle mask 0x02461357 gathers the even bytes of a source
 * word into its high half and the odd bytes into its low half, so the
 * two halves can be added lane by lane.  round adds 1 to every second
 * 16-bit lane, which gives an alternating 0 / 1 rounding bias.
 *
 * The tail is entered only when at least one complete pair remains
 * (i + 1 < n).  It used to be entered for i < n; with a single leftover
 * byte the last-byte address passed to vis_edge8 lies before pd in a
 * different 8-byte block, so the mask degenerates to the full left-edge
 * mask and vis_pst_8 wrote 8 bytes beyond the n / 2 output bytes.
 *
 * NOTE(review): the casts assume dst and src are 8-byte aligned - confirm
 * against the callers.
 */
mlib_status
__mlib_VideoDownSample422(
    mlib_u8 *dst,
    const mlib_u8 *src,
    mlib_s32 n)
{
    mlib_d64 *sp0 = (mlib_d64 *)src;
    mlib_d64 *pd = (mlib_d64 *)dst;
    mlib_d64 d0;
    mlib_d64 tmp, data0, data1;
    mlib_d64 acc0_hi, acc0_lo;
    /* 0x00000001 in each 32-bit half: +1 on every second 16-bit lane */
    mlib_d64 round = vis_to_double_dup(0x1);
    mlib_f32 fone = vis_to_float(0x1000000);
    mlib_s32 i, edge;

    if (n <= 0)
        return (MLIB_FAILURE);

    vis_write_gsr(6 << 3);
    vis_write_bmask(0x02461357, 0);

    /* 16 source bytes -> 8 destination bytes per iteration */
#pragma pipeloop(0)
    for (i = 0; i <= n - 16; i += 16) {
        d0 = (*sp0++);
        tmp = vis_bshuffle(d0, d0);
        acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
        acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);
        acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
        data0 = vis_fpadd16(acc0_hi, round);

        d0 = (*sp0++);
        tmp = vis_bshuffle(d0, d0);
        acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
        acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);
        acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
        data1 = vis_fpadd16(acc0_hi, round);

        (*pd++) = vis_fpack16_pair(data0, data1);
    }

    /* tail: 1..7 output bytes, written with a masked store */
    if (i + 1 < n) {
        d0 = (*sp0++);
        tmp = vis_bshuffle(d0, d0);
        acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
        acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);
        acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
        data0 = vis_fpadd16(acc0_hi, round);

        /* the second word may lie past the source: non-faulting load */
        d0 = vis_ld_d64_nf(sp0);
        tmp = vis_bshuffle(d0, d0);
        acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
        acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);
        acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
        data1 = vis_fpadd16(acc0_hi, round);

        /* keep only the bytes up to the last output byte dst[n / 2 - 1] */
        edge = vis_edge8(pd, (dst + (n / 2) - 1));
        vis_pst_8(vis_fpack16_pair(data0, data1), pd, edge);
    }

    return (MLIB_SUCCESS);
}
/*
 * JFIF YCbCr 4:4:4 -> interleaved RGB conversion of one run of pixels.
 *
 *   rgb          destination, 3 bytes per pixel
 *   y, cb, cr    source planes, read four bytes at a time through
 *                mlib_f32 pointers
 *   size         number of pixels
 *
 * Returns MLIB_FAILURE when size <= 0, MLIB_SUCCESS otherwise.
 *
 * Structure:
 *  - The GSR is written with (2 << 3) + 2 and the byte-shuffle mask with
 *    0x0489AB37 once, before any data is processed.
 *  - The run is processed in chunks of at most 2 * BUFF_SIZE pixels.
 *  - Main loop: m = (n - 1) >> 2 groups of 4 pixels, so the final 1..4
 *    pixels of every chunk are always left for the tail.  Each group is
 *    written as three 32-bit stores (12 bytes).
 *  - Tail: the last group is computed the same way, staged in tmp_arr64,
 *    and written with vis_pst_8 under vis_edge8 masks so bytes after
 *    dend are not modified.  When pfd is not 8-byte aligned it is
 *    stepped back by one word and the staged data is shifted by one word
 *    to match; the edge mask computed from the original pfd protects the
 *    bytes before it.
 *
 * NOTE(review): the tail reads a full 32-bit word from each plane even
 * when fewer than 4 pixels remain.
 * NOTE(review): the mlib_f32 casts assume 4-byte aligned planes and
 * destination - confirm against the callers.
 */
mlib_status __mlib_VideoColorJFIFYCC2RGB444( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *cb, const mlib_u8 *cr, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd; mlib_f32 fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f4fd); mlib_d64 k02 = vis_to_double_dup(0x2cdde926); mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4); mlib_d64 k12 = vis_to_double_dup(0xe9260000); mlib_d64 k21 = vis_to_double_dup(0x38b40000); mlib_d64 k22 = vis_to_double_dup(0x00002cdd); mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff); mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6); mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1); mlib_d64 k_0 = vis_to_double_dup(0x20002000); if (size <= 0) return (MLIB_FAILURE); vis_write_gsr((2 << 3) + 2); vis_write_bmask(0x0489AB37, 0); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = (n - 1) >> 2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)cb; sf2 = (mlib_f32 *)cr; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) #pragma unroll(4) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_d64 d_0235, d_0145; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpack16_pair(s00, s10); s20 = vis_freg_pair(vis_fpack16(s20), fzero); d_0145 = vis_bshuffle(d_0235, s20); d_0235 = vis_fpack32(d_0235, d_0235); d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235)); pfd[0] = vis_read_hi(d_0145); pfd[1] = 
vis_read_hi(d_0235); pfd[2] = vis_read_lo(d_0145); pfd += 3; } /* * last pixels */ if ((mlib_u8 *)pfd <= dend) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_d64 d_0235, d_xx14, d_0145; mlib_f32 x0, x1, x2; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; x0 = *sf0; x1 = *sf1; x2 = *sf2; s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpack16_pair(s00, s10); d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero); d_0145 = vis_bshuffle(d_0235, d_xx14); d_0235 = vis_fpack32(d_0235, d_0235); d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235)); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_0235); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; cb += n; cr += n; rgb += 3 * n; size -= n; } while (size); return (MLIB_SUCCESS); }
/*
 * Combine two 16-bit images element by element into a destination image,
 * 4 elements (one 8-byte word) at a time.
 *
 * The per-word arithmetic is done by MLIB_V_IMAGEMULSHIFTONE_U16, which is
 * defined outside this block; judging by the names it is presumably an
 * unsigned 16-bit multiply followed by a right shift -- TODO confirm
 * against the macro.
 *
 *   sp1, stride1   first source and its line stride
 *   sp2, stride2   second source and its line stride
 *   dp, strided    destination and its line stride
 *   width, height  image size in elements / lines
 *   shift          shift amount; used to build const_offset and the GSR
 *                  scale factor
 *
 * Strides are added directly to mlib_s16 pointers, so they are counted in
 * 16-bit elements, not bytes.
 *
 * The routine has five code paths chosen by how the three pointers are
 * aligned relative to each other within an 8-byte word.  Every path writes
 * the destination with aligned 8-byte stores and uses edge-masked partial
 * stores for the first and last word of a line.
 */
void
mlib_v_ImageMulSmallShift_U16(
    mlib_s16 *sp1,
    mlib_s32 stride1,
    mlib_s16 *sp2,
    mlib_s32 stride2,
    mlib_s16 *dp,
    mlib_s32 strided,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 shift)
{
/* pointers for line of source1 */
    mlib_s16 *sl1;

/* pointers for line of source2 */
    mlib_s16 *sl2;

/* pointers for line of dst */
    mlib_s16 *dl;
/* byte offsets of dst/src1/src2 inside an 8-byte word, and the edge mask */
    mlib_s32 offdst, offsrc1, offsrc2, emask;
/* 8-byte aligned pointers into dst/src2/src1 plus a scratch dst pointer */
    mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
    mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
/* last element of the current destination line */
    mlib_s16 *dend;
/*
 * NOTE(review): the following locals are not referenced directly in this
 * function; presumably they are temporaries and constants consumed by
 * MLIB_V_IMAGEMULSHIFTONE_U16 -- confirm before removing any of them.
 */
    mlib_d64 rdhh, rdhl;
    mlib_d64 rdlh, rdll;
    mlib_d64 rdh, rdl;
    mlib_s32 i, j, k;
    mlib_d64 sd1, sd2, sd1ad, sd2ad, rdh_0, rdh_1, rdl_0, rdl_1;
    mlib_d64 offset = vis_to_double_dup(0x80008000);
    mlib_d64 half_offset = vis_to_double(0x40004000, 0x80008000);
    mlib_d64 const_offset = vis_to_double_dup(0x20000000);

    const_offset = vis_fpsub32(const_offset,
        vis_to_double_dup(0x40000000 >> (16 - shift)));

/* lines are contiguous in all three images: treat them as one long line */
    if (width == stride1 && width == stride2 && width == strided) {
        width *= height;
        height = 1;
    }

/* initialize GSR scale factor */
    vis_write_gsr(((16 - (shift - 1)) & 0x1f) << 3);

    sl1 = sp1;
    sl2 = sp2;
    dl = dp;

    offdst = ((mlib_addr)dp) & 7;
    offsrc1 = ((mlib_addr)sp1) & 7;
    offsrc2 = ((mlib_addr)sp2) & 7;

/*
 * Case 1: dst, src1 and src2 share the same offset within an 8-byte word
 * and the strides agree modulo 4 elements, so the relation holds on every
 * line and no data realignment is needed.
 */
    if ((offdst == offsrc1) && (offdst == offsrc2) &&
        (((strided ^ stride1) & 3) == 0) &&
        (((strided ^ stride2) & 3) == 0)) {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/* i <= 0: element index of the aligned word start relative to dp */
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

/* first word is only partially inside the line: masked store */
            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

/* full 4-element words */
#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                (*dpp++) = dd;
            }

/* trailing partial word */
            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

/*
 * Case 2: only src1 is aligned like dst; src2 is realigned on the fly
 * with vis_faligndata.
 */
        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
/* the last vis_alignaddr call sets the GSR offset used for src2 */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd20 = spp2[0];

            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd21 = spp2[1];
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd20 = sd21;
                spp2++;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*spp1++);
                sd21 = spp2[1];
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = vis_faligndata(sd20, spp2[1]);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

/*
 * Case 3: mirror image of case 2 -- src2 is aligned like dst and src1 is
 * realigned with vis_faligndata.
 */
        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd10 = spp1[0];

            if (emask != 0xf) {
                sd20 = (*spp2++);
                sd11 = spp1[1];
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd10 = sd11;
                spp1++;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd20 = (*spp2++);
                sd11 = spp1[1];
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd10 = sd11;
                spp1++;
            }

            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd20 = (*spp2++);
                sd10 = vis_faligndata(sd10, spp1[1]);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

/*
 * Case 4: the two sources are aligned alike but differ from dst.  The
 * sources are processed as aligned words and the *results* are realigned
 * to the destination with vis_faligndata.
 */
        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

/* prime the pipeline: dd0 always holds the previous result word */
            sd10 = (*spp1++);
            sd20 = (*spp2++);
            MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd0);

            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp++, emask);
                dd0 = dd1;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd1);
                (*dpp++) = vis_faligndata(dd0, dd1);
                dd0 = dd1;
            }

            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else {

/*
 * Case 5: all three alignments differ.  Only one GSR align offset is
 * available at a time, so src1 is first copied, realigned, into the
 * destination line; a second pass then realigns src2 and combines it with
 * the staged src1 data in place.
 */
        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            if (emask != 0xf) {
                spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
                sd10 = vis_faligndata(spp1[0], spp1[1]);
                spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
                sd20 = vis_faligndata(spp2[0], spp2[1]);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

/* copy src1 to dst */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            sd11 = spp1[0];
            tmp_ptr = dpp;

#pragma pipeloop(0)
            for (k = i; k <= (width - 4); k += 4) {
                sd10 = sd11;
                sd11 = spp1[1];
                (*tmp_ptr++) = vis_faligndata(sd10, sd11);
                spp1++;
            }

/* keep the realigned src1 word for the trailing partial store */
            sd11 = vis_faligndata(sd11, spp1[1]);

            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
            sd20 = spp2[0];
            tmp_ptr = dpp;

/* second pass: read staged src1 back from dst and combine with src2 */
#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*tmp_ptr++);
                sd21 = spp2[1];
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd20 = vis_faligndata(sd20, spp2[1]);
                MLIB_V_IMAGEMULSHIFTONE_U16(sd11, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }
}
/*
 * Write the complex-conjugated input vector to the output in reversed
 * element order, for interleaved (real, imaginary) signed 8-bit data.
 *
 * As the scalar loops below show, each element keeps its real part and
 * gets its imaginary part negated, with MLIB_S8_MIN mapped to MLIB_S8_MAX
 * (saturation).  Input element 0 lands in the last output element.
 *
 *   zz  output vector, 2 * n bytes
 *   xx  input vector, 2 * n bytes
 *   n   number of complex elements
 *
 * CHECK and CONJREVC are macros defined outside this block.  Presumably
 * CHECK validates the arguments and CONJREVC is a complete scalar
 * implementation that returns for short vectors -- TODO confirm.
 * CONJ8 is also external; from its use it consumes d3 and produces d4,
 * and presumably uses dl, dh, dcntr0, dxor0, done and f_null as well.
 */
mlib_status
__mlib_VectorConjRev_S8C_S8C_Sat(
    mlib_s8 *zz,
    const mlib_s8 *xx,
    mlib_s32 n)
{
    const mlib_s8 *x = xx;
    mlib_s8 *z = zz;
/* src walks forward from the start; dst walks backward from the end */
    mlib_s8 *src = (mlib_s8 *)x, *dst = z + 2 * (n);
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
    mlib_d64 dcntr0 = vis_to_double_dup(0x00800080);
    mlib_d64 dxor0 = vis_to_double_dup(0x007f007f);
    mlib_d64 done = vis_to_double_dup(1);
    mlib_s8 c;
/* length counts the bytes still to be produced */
    mlib_s32 i, rest_64, len_64, even_length, odd = 0, length =
        (mlib_s32)n * 2;
/* real part held back when the alignment loop stops mid-element */
    mlib_s32 re_part;
    mlib_f32 f_null = vis_to_float(0);

    CHECK(x, z);

    if (n < 8) {
        CONJREVC(mlib_s8, MLIB_S8_MAX, MLIB_S8_MIN);
    }

/*
 * Scalar prologue: emit bytes backwards until dst is 8-byte aligned.
 * The imaginary byte is written first; if dst becomes aligned right after
 * it, the matching real byte is saved in re_part and odd is set.
 */
    while (((mlib_addr)dst) & 7) {

        if ((c = src[1]) == MLIB_S8_MIN)
            *--dst = MLIB_S8_MAX;
        else
            *--dst = -c;
        length -= 2;
        src += 2;

        if (((mlib_addr)dst) & 7) {
            *--dst = src[-2];
        } else {
            re_part = src[-2];
            odd = 1;
            break;
        }
    }

    vis_write_gsr(7 << 3);
    ddst = (mlib_d64 *)dst;
/* bytes left over after the 8-byte words, number of words, bytes in them */
    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;

    if (!odd) {

/*
 * Aligning loop finished with imaginary part. The following processing
 * starts with real part.
 */

        if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

            dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ8;
                *--ddst = d4;
            }
        } else {
/* unaligned source: rebuild each 8-byte word from two aligned loads */
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ8;
                *--ddst = d4;
            }
        }
    } else {

/*
 * Aligning loop finished with real part. The following processing
 * starts with imaginary part.
 */

        if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

            dsrc = (mlib_d64 *)vis_alignaddr(src, 1);
/* the pending real byte goes into the top byte of the carry word */
            d_rest = vis_to_double((re_part << 24), 0);

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ8;
                *--ddst = vis_faligndata(d4, d_rest);
                d_rest = d4;
            }

/* flush the one byte still held in the carry word */
            ddst--;
            d_rest = vis_faligndata(d_rest, d_rest);
            vis_pst_8(d_rest, ddst, 0x1);
        } else {
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ8;
                *--ddst = d4;
            }

/*
 * Second pass over the words just written, shifting them by one byte
 * (align offset 1), then store the pending real byte at dst[-1].
 */
            vis_write_gsr(1);
            d2 = *ddst;
            d3 = vis_faligndata(d1, d2);
            vis_pst_8(d3, (ddst - 1), 0x1);

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = *(ddst + 1);
                (*ddst++) = vis_faligndata(d1, d2);
            }

            dst[-1] = re_part;
        }

/* account for the real byte that was emitted out of band */
        dst--;
    }

    if (!rest_64)
        return (MLIB_SUCCESS);

/* scalar epilogue for the elements that did not fill an 8-byte word */
    for (i = 0; i < rest_64; i += 2) {
        dst[-even_length - 2 - i] = src[even_length + i];

        if ((c = src[even_length + i + 1]) == MLIB_S8_MIN)
            dst[-even_length - 2 - i + 1] = MLIB_S8_MAX;
        else
            dst[-even_length - 2 - i + 1] = -c;
    }

    return (MLIB_SUCCESS);
}
/*
 * Write the complex-conjugated input vector to the output in reversed
 * element order, for interleaved (real, imaginary) signed 16-bit data.
 *
 * As the scalar loops below show, each element keeps its real part and
 * gets its imaginary part negated, with MLIB_S16_MIN mapped to
 * MLIB_S16_MAX (saturation).  Input element 0 lands in the last output
 * element.
 *
 *   zz  output vector, 2 * n 16-bit values
 *   xx  input vector, 2 * n 16-bit values
 *   n   number of complex elements
 *
 * CHECK and CONJREVC are macros defined outside this block.  Presumably
 * CHECK validates the arguments and CONJREVC is a complete scalar
 * implementation that returns for short vectors -- TODO confirm.
 * CONJ16 is also external; from its use it consumes d3 and produces d4,
 * and presumably uses dl, dh, dlog0, dtwo and f_two as well.
 */
mlib_status
__mlib_VectorConjRev_S16C_S16C_Sat(
    mlib_s16 *zz,
    const mlib_s16 *xx,
    mlib_s32 n)
{
    mlib_s16 *x = (mlib_s16 *)xx, *z = (mlib_s16 *)zz;
/* src walks forward from the start; dst walks backward from the end */
    mlib_s16 *src = (mlib_s16 *)x, *dst = (mlib_s16 *)&z[2 * n];
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
    mlib_d64 dlog0 = vis_to_double_dup(0x0000ffff), dtwo =
        vis_to_double(0, 2);
    mlib_f32 f_two = vis_to_float(0x20002);
    mlib_s16 c;
/* length counts the 16-bit values still to be produced */
    mlib_s32 i, rest_64, len_64, even_length, odd = 0, length =
        (mlib_s32)n * 2;
/* real part held back when the alignment loop stops mid-element */
    mlib_s32 re_part;

    CHECK(x, z);

    if ((n < 16)) {
        CONJREVC(mlib_s16, MLIB_S16_MAX, MLIB_S16_MIN);
    }

/*
 * Scalar prologue: emit values backwards until dst is 8-byte aligned.
 * The imaginary value is written first; if dst becomes aligned right
 * after it, the matching real value is saved in re_part and odd is set.
 */
    while (((mlib_addr)dst) & 7) {

        if ((c = src[1]) == MLIB_S16_MIN)
            *--dst = MLIB_S16_MAX;
        else
            *--dst = -c;
        length -= 2;
        src += 2;

        if (((mlib_addr)dst) & 7) {
            *--dst = src[-2];
        } else {
            re_part = src[-2];
            odd = 1;
            break;
        }
    }

    vis_write_gsr(15 << 3);
    ddst = (mlib_d64 *)dst;
/* values left over after the 8-byte words, number of words, values in them */
    rest_64 = length & 3;
    len_64 = length >> 2;
    even_length = len_64 << 2;

    if (!odd) {

/*
 * Aligning loop finished with imaginary part. The following processing
 * starts with real part.
 */

        if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

            dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ16;
                *--ddst = d4;
            }
        } else {
/* unaligned source: rebuild each 8-byte word from two aligned loads */
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ16;
                *--ddst = d4;
            }
        }
    } else {

/*
 * Aligning loop finished with real part. The following processing
 * starts with imaginary part.
 */

        if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

            dsrc = (mlib_d64 *)vis_alignaddr(src, 2);
/* the pending real value goes into the top 16 bits of the carry word */
            d_rest = vis_to_double((re_part << 16), 0);

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ16;
                *--ddst = vis_faligndata(d4, d_rest);
                d_rest = d4;
            }

/* flush the one value still held in the carry word */
            ddst--;
            d_rest = vis_faligndata(d_rest, d_rest);
            vis_pst_16(d_rest, ddst, 0x1);
        } else {
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ16;
                *--ddst = d4;
            }

/*
 * Second pass over the words just written, shifting them by two bytes
 * (align offset 2), then store the pending real value at dst[-1].
 */
            vis_write_gsr(2);
            d2 = *ddst;
            d3 = vis_faligndata(d1, d2);
            vis_pst_16(d3, (ddst - 1), 0x1);

#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = *(ddst + 1);
                (*ddst++) = vis_faligndata(d1, d2);
            }

            dst[-1] = re_part;
        }

/* account for the real value that was emitted out of band */
        dst--;
    }

    if (!rest_64)
        return (MLIB_SUCCESS);

/* scalar epilogue for the elements that did not fill an 8-byte word */
    for (i = 0; i < rest_64; i += 2) {
        dst[-even_length - 2 - i] = src[even_length + i];

        if ((c = src[even_length + i + 1]) == MLIB_S16_MIN)
            dst[-even_length - 2 - i + 1] = MLIB_S16_MAX;
        else
            dst[-even_length - 2 - i + 1] = -c;
    }

    return (MLIB_SUCCESS);
}
/*
 * Apply an m x n integer kernel to a single-channel MLIB_BYTE source image
 * and hand each processed line to a colormap-driven output routine.
 * Judging by the helper names this is presumably an error-diffusion
 * true-color-to-indexed conversion -- TODO confirm against func_dm_arr.
 *
 *   dst       destination image (its bit offset is passed to the output
 *             routine)
 *   src       source image; must be MLIB_BYTE with one channel
 *   kernel    m * n integer coefficients
 *   m, n      kernel width and height
 *   dm, dn    position of the key element inside the kernel
 *   scale     the coefficients are divided by 2**scale
 *   colormap  opaque colormap queried for LUT offset, inverse table and
 *             search method
 *
 * Returns MLIB_FAILURE for an unsupported source image or when an
 * allocation fails, MLIB_OUTOFRANGE when the scaled kernel is too large
 * for the 16-bit fixed-point representation, MLIB_SUCCESS otherwise.
 *
 * Every exit path taken after the kernel buffer has been allocated
 * releases it, including the MLIB_OUTOFRANGE path.
 */
mlib_status
FUNC(MxN) (
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 m,
    mlib_s32 n,
    mlib_s32 dm,
    mlib_s32 dn,
    mlib_s32 scale,
    const void *colormap)
{
    mlib_type stype, dtype;
    mlib_u8 *sl, *dl;
    mlib_u8 *lut_table;
    mlib_s32 offset, off, kw, dn1;
    mlib_s32 schan, dchan, sll, dll, sw, sh, dw, dh;
    mlib_s32 row, i, j, bsize, buff_ind = 0, func_ind, method;
    /* small kernels use the on-stack tables; larger ones are malloc'ed */
    mlib_u16 *pbuff, *buff_lcl[2 * MAX_N], **buff_arr = buff_lcl, **buff;
    mlib_d64 *buffd;
    mlib_d64 kern_lcl[MAX_N * MAX_M], *kern = kern_lcl, *pkern;
    mlib_d64 dscale;
    func_dm_type func_dm;
    mlib_s32 vis_scale, kern_i;
    mlib_s32 kern_size, isum;
    mlib_d64 sum, norm;
    mlib_f32 fscale;
    mlib_s32 bit_offset;
    mlib_u8 *buff_dst;

    MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl);
    MLIB_IMAGE_GET_ALL_PARAMS(src, stype, schan, sw, sh, sll, sl);
    bit_offset = mlib_ImageGetBitOffset(dst);

    if (!(stype == MLIB_BYTE && schan == 1)) {
        return (MLIB_FAILURE);
    }
#if 0
    for (i = 0; i <= m * dn + dm; i++) {
        if (kernel[i])
            return (MLIB_FAILURE);
    }
#endif /* 0 */

    /* count rows/columns from the far end of the kernel */
    dn = n - 1 - dn;
    dm = m - 1 - dm;
    /* number of coefficients that are actually loaded */
    kern_size = m * dn + dm;

    if (n > MAX_N || m > MAX_M) {
        /* one allocation holds the kernel followed by the row-pointer table */
        kern = __mlib_malloc(n * m * sizeof (mlib_d64) +
            2 * n * sizeof (mlib_u16 *));

        if (kern == NULL)
            return (MLIB_FAILURE);
        buff_arr = (mlib_u16 **)(kern + n * m);
    }

    /* dscale = 2**(-scale), built in steps of at most 30 bits */
    dscale = 1.0;
    while (scale > 30) {
        dscale *= 1.0 / (1 << 30);
        scale -= 30;
    }

    dscale /= (1 << scale);

    /* load kernel */
    /* the kernel is read backwards, starting at its last coefficient */
    kernel += m * n - 1;
    sum = 0;
    for (i = 0; i < kern_size; i++) {
        kern[i] = dscale * kernel[-i];
        sum += mlib_fabs(kern[i]);
    }

    vis_scale = mlib_ilogb(sum);

    if (vis_scale > 13) {
        /* do not leak the heap kernel buffer on this early exit */
        if (kern != kern_lcl)
            __mlib_free(kern);
        return (MLIB_OUTOFRANGE);
    }
    vis_scale = 14 - vis_scale;

    if (vis_scale > 15)
        vis_scale = 15;
    norm = 32768 >> (15 - vis_scale);
    isum = 0;
    /* round each coefficient to 16 bits, duplicated in both halves */
    for (i = 0; i < kern_size; i++) {
        if (kern[i] > 0.0) {
            kern_i = (mlib_s32)(kern[i] * norm + 0.5);
        } else {
            kern_i = (mlib_s32)(kern[i] * norm - 0.5);
        }

        isum += abs(kern_i);
        kern[i] = vis_to_double_dup((kern_i << 16) | (kern_i & 0xffff));
    }

    /* recalc without rounding */
    if (isum > 32767) {
        dscale *= norm;
        for (i = 0; i < kern_size; i++) {
            kern_i = (mlib_s32)(dscale * kernel[-i]);
            kern[i] = vis_to_double_dup((kern_i << 16) | (kern_i & 0xffff));
        }
    }

    fscale = vis_to_float(1 << (vis_scale - 1));
    vis_write_gsr(((16 - vis_scale) << 3) + 2);

    offset = mlib_ImageGetLutOffset(colormap);
    lut_table = (mlib_u8 *)mlib_ImageGetLutInversTable(colormap);

    /* line buffer length in 16-bit elements, rounded up to a multiple of 8 */
    bsize = (sw + m) * NCHAN;
    bsize = (bsize + 7) & ~7;
    dn1 = (dn) ? dn : 1;
    /* dn1 history lines plus one working line */
    pbuff = __mlib_malloc((dn1 + 1) * bsize * sizeof (mlib_u16) + EXTRA_BUFF);

    if (pbuff == NULL) {
        if (kern != kern_lcl)
            __mlib_free(kern);
        return (MLIB_FAILURE);
    }

    /* the table is stored twice so buff_arr + buff_ind never needs to wrap */
    for (j = 0; j < dn1; j++) {
        buff_arr[dn1 + j] = buff_arr[j] = pbuff + j * bsize;
    }

    buff_ind = 0;
    buffd = (mlib_d64 *)(pbuff + dn1 * bsize);
    buff_dst = (mlib_u8 *)((mlib_u16 *)buffd + bsize);

    /* clear buffer */
    for (i = 0; i < dn * (bsize / 4); i++) {
        ((mlib_d64 *)pbuff)[i] = 0;
    }

    /* pick the output routine from kernel column and colormap method */
    func_ind = dm;

    if (func_ind > KH_MAX)
        func_ind = KH_MAX;
    method = mlib_ImageGetMethod(colormap);

    if (method == LUT_COLOR_CUBE_SEARCH)
        func_ind += KH_MAX + 1;
    else if (method == LUT_COLOR_DIMENSIONS)
        func_ind += 2 * (KH_MAX + 1);
    func_dm = func_dm_arr[func_ind];

    for (row = 0; row < sh; row++) {
        mlib_u8 *sp = sl;

        buff = buff_arr + buff_ind;

        /* convert source line */
        for (i = 0; i < sw; i++) {
            mlib_d64 ss;

            ss = LD_U8(sp, i);
            ss = vis_fmul8x16al(vis_read_lo(ss), fscale);
            ST_U16(buffd, i, ss);
        }

        /* apply the full kernel rows, at most KW_MAX taps per call */
        pkern = kern;
        for (j = 0; j < dn; j++) {
            for (off = 0; off < m; off += kw) {
                kw = m - off;

                if (kw > KW_MAX) {
                    if (kw > 2 * KW_MAX)
                        kw = KW_MAX;
                    else
                        kw = kw / 2;
                }

                func_m_arr[kw] (buffd, buff[j] + off * NCHAN,
                    pkern + off, sw);
            }

            pkern += m;
        }

        /* last (partial) kernel row plus output of the line */
#ifdef USE_COLOR2INDEXLINE
        func_dm(buff_dst, (void *)buffd, buff[dn] + dm * NCHAN, pkern,
            colormap, lut_table, sw, dm, 0);
/*
 * mlib_ImageColorTrue2IndexLine_U8_BIT_1
 * (buff_dst, dl, bit_offset, sw, colormap);
 */
#else /* USE_COLOR2INDEXLINE */
        func_dm(dl, (void *)buffd, buff[dn] + dm * NCHAN, pkern,
            colormap, lut_table, sw, dm, bit_offset);
#endif /* USE_COLOR2INDEXLINE */

        /* rotate the ring of history lines */
        buff_ind++;

        if (buff_ind >= dn1)
            buff_ind -= dn1;

        sl += sll;
        dl += dll;
    }

    __mlib_free(pbuff);

    if (kern != kern_lcl)
        __mlib_free(kern);

    return (MLIB_SUCCESS);
}
/*
 * Convert interleaved 4-byte ARGB pixels into planar Y, Cb and Cr with
 * horizontal 4:2:2 chroma subsampling.
 *
 *   y     luma output, n bytes
 *   cb    Cb output, n / 2 bytes
 *   cr    Cr output, n / 2 bytes
 *   argb  input, read as 8-byte words (two pixels per word)
 *   n     number of pixels
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * All pointers are dereferenced through mlib_d64 / mlib_f32 casts.
 * NOTE(review): that presumably requires argb and y to be 8-byte aligned
 * and cb and cr 4-byte aligned -- confirm with callers.
 *
 * CHANNELSEPARATE_U8_422 and CHANNELWEIGHT_U8_2p are macros defined
 * outside this block.  sd04, sd26, sd15 and sd37 are not referenced here
 * directly; presumably they are scratch variables of those macros.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
    mlib_u8 *y,
    mlib_u8 *cb,
    mlib_u8 *cr,
    const mlib_u8 *argb,
    mlib_s32 n)
{
    mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
    mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
/* one-past-the-end markers used to build the tail store masks */
    mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
    mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
    mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
    mlib_s32 i;

/* luma weights are scaled by 8192, chroma weights by 4096 */
    mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
    mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
    mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
    mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
    mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
    mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
    mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
    mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
    mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
    mlib_d64 off128 = vis_to_double_dup(0x10101010);
    mlib_d64 off0 = vis_to_double_dup(0x00100010);

    if (n <= 0)
        return (MLIB_FAILURE);

    vis_write_gsr(2 << 3);

/* from here on n is the number of complete 8-pixel groups */
    n = n >> 3;

/* each iteration: 32 input bytes -> 8 Y, 4 Cb and 4 Cr bytes */
#pragma pipeloop(0)
    for (i = 0; i < n; i++) {
        sd01 = (*sp++);
        sd23 = (*sp++);
        sd45 = (*sp++);
        sd67 = (*sp++);

        CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1);

/* luma: z0 and z1 are packed separately and interleaved */
        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
            vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
            vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
        z1 = vis_fpadd16(z1, off0);
        py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

/* chroma: z0 and z1 are summed before packing (two pixels per sample) */
        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
            vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
            vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
        pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
            vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
            vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
        pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

        py++;
        pcb++;
        pcr++;
    }

/* tail: fewer than 8 pixels remain */
    if ((mlib_u8 *)pcb < cbend) {
        mlib_d64 yd;
        mlib_f32 cbf, crf;
        mlib_s32 ymask, cmask;

/*
 * Only the first word is loaded with a plain access; the remaining three
 * go through vis_ld_d64_nf because they may lie beyond the input.
 */
        sd01 = (*sp++);
        sd23 = vis_ld_d64_nf(sp);
        sp++;
        sd45 = vis_ld_d64_nf(sp);
        sp++;
        sd67 = vis_ld_d64_nf(sp);

        CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1);

        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
            vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
            vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
        z1 = vis_fpadd16(z1, off0);
        yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
            vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
            vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
        cbf = vis_fpack16(vis_fpadd16(z0, z1));

        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
            vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
            vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
        crf = vis_fpack16(vis_fpadd16(z0, z1));

/* masked stores keep the writes inside [y, yend) and [cb, cbend) */
        ymask = vis_edge8(py, yend - 1);
        vis_pst_8(yd, py, ymask);
        cmask = vis_edge8(pcb, cbend - 1);

/*
 * The chroma word is 4 bytes but the partial store works on 8 bytes.
 * If the mask selects bytes in the upper half, the data goes in the high
 * half of a word at pcb; otherwise it goes in the low half of the word
 * that starts 4 bytes earlier.
 */
        if (cmask & 0xf0) {
            vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
            vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
        } else {
            vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1, cmask);
            vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1, cmask);
        }
    }

    return (MLIB_SUCCESS);
}
/*
 * Convert interleaved 4-byte CMYK pixels into four planar outputs
 * Y, Cb, Cr and K.
 *
 *   y, cb, cr, k  output planes, one byte per pixel each, written through
 *                 mlib_f32 pointers
 *   cmyk          input, 4 bytes per pixel, read as 8-byte words
 *   n             number of pixels
 *
 * The input is handled in chunks of at most SIZE pixels.  For each chunk
 * mlib_channel_separate (defined elsewhere) fills buff_arr; presumably
 * each pair of 8-byte words then holds the C, M, Y and K bytes of four
 * pixels, since that is how the loop below consumes them -- TODO confirm.
 * CHANNELWEIGHT_U8 is an external macro; dr, dg, db and dd are not
 * referenced here directly and are presumably its scratch variables.
 */
mlib_status
__mlib_VideoColorCMYK2JFIFYCCK444(
    mlib_u8 *y,
    mlib_u8 *cb,
    mlib_u8 *cr,
    mlib_u8 *k,
    const mlib_u8 *cmyk,
    mlib_s32 n)
{
/* separated-channel staging buffer for one chunk */
    mlib_d64 buff_arr[(SIZE / 2) + 2];
    mlib_f32 *py, *pcb, *pcr, *pk;
    mlib_d64 *buff;
    mlib_d64 sdh, sdl, dr, dg, db, dd;
    mlib_s32 i, m, size, num;

/* all nine weights are scaled by 8192 */
    mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
    mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
    mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
    mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 8192));
    mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 8192));
    mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 8192));
    mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 8192));
    mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 8192));
    mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 8192));
    mlib_d64 off128 = vis_to_double_dup(0x10101010);
    mlib_d64 off255 = vis_to_double_dup(0x1ff01ff0);

    vis_write_gsr(2 << 3);

/*
 * 4-pixel loop
 */

    for (size = 0; size < n; size += num) {

        num = n - size;

        if (num > SIZE)
            num = SIZE;

/* separate every 4-pixel group of the chunk, including a partial one */
        m = (num + 3) / 4;
        mlib_channel_separate((mlib_d64 *)cmyk + size / 2, buff_arr, m);
/* the main loop only handles an even number of complete 4-pixel groups */
        m = (num / 4) & ~1;
        py = (mlib_f32 *)y + size / 4;
        pcb = (mlib_f32 *)cb + size / 4;
        pcr = (mlib_f32 *)cr + size / 4;
        pk = (mlib_f32 *)k + size / 4;
        buff = buff_arr;

#pragma pipeloop(0)
        for (i = 0; i < m; i++) {
            sdh = buff[0];
            sdl = buff[1];

            CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
                vis_read_hi(sdl), k11, k12, k13, off255, py[0]);
            CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
                vis_read_hi(sdl), k21, k22, k23, off128, pcb[0]);
            CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
                vis_read_hi(sdl), k31, k32, k33, off128, pcr[0]);

            py++;
            pcb++;
            pcr++;
/* the fourth channel is copied through unchanged */
            (*pk++) = vis_read_lo(sdl);
            buff += 2;
        }
    }

/*
 * Tail: up to 7 pixels are left.  Two more 4-pixel groups are converted
 * into rbuff (one 8-byte word per output plane) and written with a
 * partial store that covers only the first (n & 7) bytes.
 * NOTE(review): buff, py, pcb, pcr and pk are only assigned inside the
 * chunk loop above, so this relies on that loop having run at least once.
 */
    if (n & 7) {
        mlib_s32 emask = (0xFF00 >> (n & 7)) & 0xFF;
        mlib_d64 rbuff[4];
        mlib_f32 *prbuff = (mlib_f32 *)rbuff;

/* first group fills the high half of each rbuff word */
        sdh = (*buff++);
        sdl = (*buff++);

        CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
            vis_read_hi(sdl), k11, k12, k13, off255, prbuff[0]);
        CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
            vis_read_hi(sdl), k21, k22, k23, off128, prbuff[2]);
        CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
            vis_read_hi(sdl), k31, k32, k33, off128, prbuff[4]);
        prbuff[6] = vis_read_lo(sdl);

/* second group fills the low half */
        sdh = (*buff++);
        sdl = (*buff++);

        CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
            vis_read_hi(sdl), k11, k12, k13, off255, prbuff[1]);
        CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
            vis_read_hi(sdl), k21, k22, k23, off128, prbuff[3]);
        CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh),
            vis_read_hi(sdl), k31, k32, k33, off128, prbuff[5]);
        prbuff[7] = vis_read_lo(sdl);

        vis_pst_8(rbuff[0], py, emask);
        vis_pst_8(rbuff[1], pcb, emask);
        vis_pst_8(rbuff[2], pcr, emask);
        vis_pst_8(rbuff[3], pk, emask);
    }
/*
 * NOTE(review): the definition appears to be cut off at this point -- no
 * return statement or closing brace for the function follows before the
 * next definition begins.  Restore the missing tail from the original
 * source.
 */
/*
 * 2x2 convolution of a 16-bit image, applied only to the channels that
 * are selected in cmask; the name suggests unsigned 16-bit data and a
 * "no write" edge policy.
 *
 *   dst, src       destination and source images
 *   kernel         kernel coefficients (consumed by LOAD_KERNEL_INTO_FLOAT)
 *   scalef_expon   scaling exponent; the GSR scale is 32 - scalef_expon
 *   cmask          channel bit mask; bit 0 corresponds to the last channel
 *
 * Returns MLIB_FAILURE if the intermediate buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 *
 * Each selected channel is gathered into two contiguous row buffers
 * (sbuf1 / sbuf2) so that the convolution can work on packed 8-byte
 * words.  While one output row is being computed, the next source row is
 * copied over the buffer row that is no longer needed.
 *
 * GET_SRC_DST_PARAMETERS, LOAD_KERNEL_INTO_FLOAT, CONV_16_BEGIN and
 * CONV_16 are macros defined outside this block.  Presumably the first
 * two set adr_src, adr_dst, slb, dlb, dw, dh and k1..k4, and the CONV_16
 * macros accumulate into out0 / out1 using tmp0..tmp3 and ker_off -- TODO
 * confirm against the macro definitions.
 */
mlib_status
mlib_v_conv2x2_u16nw_mask(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
    mlib_u16 *da, *d_a;

/* pointers to src, dst data */
    mlib_u16 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
    mlib_u16 *sa, *sa1, *sa2, *sa_2;

/* pointers to rows in interm. src buf */
    mlib_u16 *buff_src, *sbuf1, *sbuf2, *prow;
    mlib_u16 *s_buf1;

/* mlib_d64 pointers to rows in interm. src buf */
    mlib_d64 *s1, *s2;

/* src, dst and interm. buf. strides */
    mlib_s32 dlb, slb, buf_slb;
    mlib_s32 dh, dw;
    mlib_d64 out0, out1, tmp0, tmp1, tmp2, tmp3;

/* data */
    mlib_d64 d1, d2, d_1, d_2;

/* shifted data */
    mlib_d64 d21, d22;

/* coefficients */
    mlib_f32 k1, k2, k3, k4;
    int gsr_scale, i, j, nchannel, chan, testchan;
/* t1..t4: next-row samples being staged, t5..t8: results being stored */
    mlib_u16 t1, t2, t3, t4, t5, t6, t7, t8;
/* gives 16-bit access to the four lanes of a packed result */
    type_mlib_d64 str;
/* xor with 0x8000 per lane flips the sign bit of each 16-bit sample */
    mlib_d64 ker_off, mask8000 = vis_to_double_dup(0x80008000);

    nchannel = mlib_ImageGetChannels(src);
    GET_SRC_DST_PARAMETERS();
    LOAD_KERNEL_INTO_FLOAT();
    gsr_scale = 32 - scalef_expon;
    vis_write_gsr((gsr_scale << 3) + 2);

/* buf_slb - 8-byte aligned */
    buf_slb = (2 * dw + 26) & (~7);

/* alloc. interm. src buffer */
    buff_src =
        (mlib_u16 *)__mlib_malloc(2 * buf_slb * sizeof (mlib_u8) + 8);

    if (buff_src == NULL)
        return (MLIB_FAILURE);
/* convert the row length from bytes to 16-bit elements */
    buf_slb >>= 1;
    sbuf1 = (mlib_u16 *)((mlib_addr)(buff_src + 8) & (~7));
    sbuf2 = sbuf1 + buf_slb;
    dw -= 1;
/* edge - no write */
    dh -= 1;

    testchan = 1;
    for (chan = nchannel - 1; chan >= 0; chan--) {
/* skip channels that are not selected in cmask */
        if ((cmask & testchan) == 0) {
            testchan <<= 1;
            continue;
        }

        testchan <<= 1;
        sa = adr_src + chan;
        sa1 = sa + slb;
        sa_2 = sa2 = sa1 + slb;
        d_a = adr_dst + chan;

/* load interm. src buff */
/* rows are stored swapped because every row pass starts with a swap */
        for (i = 0, j = 0; j < (dw + 1); i += nchannel, j++) {
            sbuf1[j] = sa1[i];
            sbuf2[j] = sa[i];
        }

        for (j = 0; j < dh - 1; j++) {
            da = d_a;
            prow = sbuf1;
            sbuf1 = sbuf2;
            sbuf2 = prow;
            s1 = (mlib_d64 *)sbuf1;
            s2 = (mlib_d64 *)sbuf2;
/* last destination sample of this row that may be written */
            dend = da + (dw - 1) * nchannel;
            s_buf1 = sbuf1;
            d1 = *s1;
            d2 = *s2;
            d1 = vis_fxor(d1, mask8000);
            d2 = vis_fxor(d2, mask8000);
            d_1 = *(s1 + 1);
            d_2 = *(s2 + 1);
            d_1 = vis_fxor(d_1, mask8000);
            d_2 = vis_fxor(d_2, mask8000);
/* first four results of the row, computed ahead of the loop */
            CONV_16_BEGIN(d1, k1);
            CONV_16(d2, k3);
            d21 = vis_faligndata(d1, d_1);
            d22 = vis_faligndata(d2, d_2);
            CONV_16(d21, k2);
            CONV_16(d22, k4);
            str.value = vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
            d1 = d_1;
            d2 = d_2;
            s1++;
            s2++;

/*
 * in each iteration store result from prev. iterat.
 * and load data for processing next row
 */
/*
 * The statements below are deliberately interleaved (loads, arithmetic,
 * stores) -- keep their order when editing.
 */
#pragma pipeloop(0)
            for (i = 0; i < dw - 4; i += 4) {
                t1 = *sa_2;
                sa_2 += nchannel;
                t2 = *sa_2;
                sa_2 += nchannel;
                d_1 = *(s1 + 1);
                d_2 = *(s2 + 1);
                d_1 = vis_fxor(d_1, mask8000);
                d_2 = vis_fxor(d_2, mask8000);
                CONV_16_BEGIN(d1, k1);
                t3 = *sa_2;
                sa_2 += nchannel;
                t4 = *sa_2;
                sa_2 += nchannel;
                CONV_16(d2, k3);
                t5 = str.forshort.ushort0;
                t6 = str.forshort.ushort1;
                d21 = vis_faligndata(d1, d_1);
                t7 = str.forshort.ushort2;
                d22 = vis_faligndata(d2, d_2);
                t8 = str.forshort.ushort3;
                CONV_16(d21, k2);
                (*s_buf1++) = t1;
                (*s_buf1++) = t2;
                CONV_16(d22, k4);
                (*s_buf1++) = t3;
                (*s_buf1++) = t4;
                *da = t5;
                da += nchannel;
                str.value =
                    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
                *da = t6;
                da += nchannel;
                d1 = d_1;
                d2 = d_2;
                *da = t7;
                da += nchannel;
                s1++;
                s2++;
                *da = t8;
                da += nchannel;
            }

/* finish copying the next source row into the buffer */
            for (; i < dw + 1; i++) {
                (*s_buf1++) = *sa_2;
                sa_2 += nchannel;
            }

/* store the pending results, stopping at the end of the row */
            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = str.forshort.ushort0;
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = str.forshort.ushort1;
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = str.forshort.ushort2;
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = str.forshort.ushort3;
            }

            sa_2 = sa2 = sa2 + slb;
            d_a += dlb;
        }

/* process last row - no need to load data */
        da = d_a;
        prow = sbuf1;
        sbuf1 = sbuf2;
        sbuf2 = prow;
        s1 = (mlib_d64 *)sbuf1;
        s2 = (mlib_d64 *)sbuf2;
        dend = da + (dw - 1) * nchannel;
        d1 = *s1;
        d2 = *s2;
        d1 = vis_fxor(d1, mask8000);
        d2 = vis_fxor(d2, mask8000);
        d_1 = *(s1 + 1);
        d_2 = *(s2 + 1);
        d_1 = vis_fxor(d_1, mask8000);
        d_2 = vis_fxor(d_2, mask8000);
        CONV_16_BEGIN(d1, k1);
        CONV_16(d2, k3);
        d21 = vis_faligndata(d1, d_1);
        d22 = vis_faligndata(d2, d_2);
        CONV_16(d21, k2);
        CONV_16(d22, k4);
        d1 = d_1;
        d2 = d_2;
        s1++;
        s2++;

/* here the packed result is formed at the top of each iteration */
#pragma pipeloop(0)
        for (i = 4; i < dw; i += 4) {
            str.value = vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
            d_1 = *(s1 + 1);
            d_2 = *(s2 + 1);
            d_1 = vis_fxor(d_1, mask8000);
            d_2 = vis_fxor(d_2, mask8000);
            CONV_16_BEGIN(d1, k1);
            t5 = str.forshort.ushort0;
            CONV_16(d2, k3);
            d21 = vis_faligndata(d1, d_1);
            t6 = str.forshort.ushort1;
            d22 = vis_faligndata(d2, d_2);
            CONV_16(d21, k2);
            t7 = str.forshort.ushort2;
            CONV_16(d22, k4);
            t8 = str.forshort.ushort3;
            *da = t5;
            da += nchannel;
            *da = t6;
            da += nchannel;
            *da = t7;
            da += nchannel;
            d1 = d_1;
            d2 = d_2;
            *da = t8;
            da += nchannel;
            s1++;
            s2++;
        }

        str.value = vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = str.forshort.ushort0;
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = str.forshort.ushort1;
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = str.forshort.ushort2;
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = str.forshort.ushort3;
        }
    }

    __mlib_free(buff_src);
    return (MLIB_SUCCESS);
}
static mlib_status mlib_v_VideoColorYUV2RGB411_nonalign( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 rgb_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* pointers to src address */ mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3; /* pointers to dst address */ mlib_u8 *dp, *dl; /* all. pointer to y */ mlib_d64 *spy; /* all. pointers to u, v */ mlib_d64 *dfu, *dfv; /* u, v data */ mlib_f32 fu, fv; /* y data */ mlib_d64 dy0, dy1, dy2, dy3; mlib_d64 ddy1, ddy2, ddy3, ddy4; mlib_d64 du0, du1, fu0, fu1; mlib_d64 dv1, dv2, fv0, fv1; mlib_d64 dr, dr1, dr2, dr3, dr4; mlib_d64 dg, dg1, dg2, dg3, dg4; mlib_d64 db, db1, db2, db3, db4; mlib_d64 dtmp; /* 1.1644 * 4096 */ mlib_f32 f0 = vis_to_float(0x12a1); /* 2.0184 * 8192 */ mlib_f32 f1 = vis_to_float(0x4097); /* -0.3920 * 8192 */ mlib_f32 f4 = vis_to_float(0xf375); /* -0.8132 * 8192 */ mlib_f32 f5 = vis_to_float(0xe5fa); /* 1.5966 * 8192 */ mlib_f32 f8 = vis_to_float(0x3317); /* -276.9856 * 32 */ mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60); /* 135.6352 * 32 */ mlib_d64 doff1 = vis_to_double_dup(0x10f410f4); /* -222.9952 * 32 */ mlib_d64 doff2 = vis_to_double_dup(0xe420e420); mlib_f32 fscale = vis_to_float(0x80808080); /* loop variable */ mlib_s32 i, j; mlib_d64 *buf, BUFF[16 * 1024]; mlib_d64 *ddp, dd01, dd11, dd21, dd02, dd12, dd22; mlib_u8 *tmp; if (width * 3 > 16 * 1024) { tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7); } else { buf = (mlib_d64 *)BUFF; } /* * initialize GSR scale factor */ vis_write_gsr(3 << 3); sp1 = sl1 = (mlib_u8 *)y; sp2 = sl2 = (mlib_u8 *)u; sp3 = sl3 = (mlib_u8 *)v; dp = (mlib_u8 *)buf; dl = rgb; ddp = (mlib_d64 *)dp; /* * row loop */ for (j = 0; j < height; j++) { spy = (mlib_d64 *)vis_alignaddr(sp1, 0); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = (*dfu++); fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 *)vis_alignaddr(sp3, 
0); fv0 = (*dfv++); fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; dy0 = (*spy++); dy3 = vis_ld_d64_nf(spy); spy++; vis_alignaddr(sp1, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = vis_ld_d64_nf(spy); spy++; dy2 = vis_faligndata(dy3, dy0); du0 = vis_fmul8x16al(fu, f1); db = vis_fpadd16(du0, doff0); du1 = vis_fmul8x16al(fu, f4); dv1 = vis_fmul8x16al(fv, f5); dtmp = vis_fpadd16(du1, dv1); dg = vis_fpadd16(dtmp, doff1); dv2 = vis_fmul8x16al(fv, f8); dr = vis_fpadd16(dv2, doff2); ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0); ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0); ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0); ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0); db1 = vis_fmul8x16au(fscale, vis_read_hi(db)); db1 = vis_fpadd16(ddy1, db1); db2 = vis_fmul8x16al(fscale, vis_read_hi(db)); db2 = vis_fpadd16(ddy2, db2); db3 = vis_fmul8x16au(fscale, vis_read_lo(db)); db3 = vis_fpadd16(ddy3, db3); db4 = vis_fmul8x16al(fscale, vis_read_lo(db)); db4 = vis_fpadd16(ddy4, db4); dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg)); dg1 = vis_fpadd16(ddy1, dg1); dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg)); dg2 = vis_fpadd16(ddy2, dg2); dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg)); dg3 = vis_fpadd16(ddy3, dg3); dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg)); dg4 = vis_fpadd16(ddy4, dg4); dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr)); dr1 = vis_fpadd16(ddy1, dr1); dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr)); dr2 = vis_fpadd16(ddy2, dr2); dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr)); dr3 = vis_fpadd16(ddy3, dr3); dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr)); dr4 = vis_fpadd16(ddy4, dr4); db = vis_fpack16_pair(db1, db2); db1 = vis_fpack16_pair(db3, db4); dr = vis_fpack16_pair(dr1, dr2); dr1 = vis_fpack16_pair(dr3, dr4); dg = vis_fpack16_pair(dg1, dg2); dg1 = vis_fpack16_pair(dg3, dg4); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = vis_ld_d64_nf(dfu); dfu++; fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 
*)vis_alignaddr(sp3, 0); fv0 = vis_ld_d64_nf(dfv); dfv++; fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; /* * 16-pixel column loop */ #pragma pipeloop(0) for (i = 0; i <= width - 16; i += 16) { vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(dr, dg); dd02 = vis_bshuffle(dr1, dg1); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(dr, dg); dd12 = vis_bshuffle(dr1, dg1); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(dr, dg); dd22 = vis_bshuffle(dr1, dg1); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd01, db); ddp[3] = vis_bshuffle(dd02, db1); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd11, db); ddp[4] = vis_bshuffle(dd12, db1); vis_write_bmask(0x0D23E56F, 0); ddp[2] = vis_bshuffle(dd21, db); ddp[5] = vis_bshuffle(dd22, db1); dy3 = vis_ld_d64_nf(spy); spy++; vis_alignaddr(sp1, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = vis_ld_d64_nf(spy); spy++; dy2 = vis_faligndata(dy3, dy0); du0 = vis_fmul8x16al(fu, f1); db = vis_fpadd16(du0, doff0); du1 = vis_fmul8x16al(fu, f4); dv1 = vis_fmul8x16al(fv, f5); dtmp = vis_fpadd16(du1, dv1); dg = vis_fpadd16(dtmp, doff1); dv2 = vis_fmul8x16al(fv, f8); dr = vis_fpadd16(dv2, doff2); ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0); ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0); ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0); ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0); db1 = vis_fmul8x16au(fscale, vis_read_hi(db)); db1 = vis_fpadd16(ddy1, db1); db2 = vis_fmul8x16al(fscale, vis_read_hi(db)); db2 = vis_fpadd16(ddy2, db2); db3 = vis_fmul8x16au(fscale, vis_read_lo(db)); db3 = vis_fpadd16(ddy3, db3); db4 = vis_fmul8x16al(fscale, vis_read_lo(db)); db4 = vis_fpadd16(ddy4, db4); dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg)); dg1 = vis_fpadd16(ddy1, dg1); dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg)); dg2 = vis_fpadd16(ddy2, dg2); dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg)); dg3 = vis_fpadd16(ddy3, dg3); dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg)); dg4 = vis_fpadd16(ddy4, dg4); 
dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr)); dr1 = vis_fpadd16(ddy1, dr1); dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr)); dr2 = vis_fpadd16(ddy2, dr2); dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr)); dr3 = vis_fpadd16(ddy3, dr3); dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr)); dr4 = vis_fpadd16(ddy4, dr4); db = vis_fpack16_pair(db1, db2); db1 = vis_fpack16_pair(db3, db4); dr = vis_fpack16_pair(dr1, dr2); dr1 = vis_fpack16_pair(dr3, dr4); dg = vis_fpack16_pair(dg1, dg2); dg1 = vis_fpack16_pair(dg3, dg4); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = vis_ld_d64_nf(dfu); dfu++; fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 *)vis_alignaddr(sp3, 0); fv0 = vis_ld_d64_nf(dfv); dfv++; fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; ddp += 6; } if (i <= width - 8) { vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(dr, dg); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(dr, dg); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(dr, dg); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd01, db); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd11, db); vis_write_bmask(0x0D23E56F, 0); ddp[2] = vis_bshuffle(dd21, db); db = db1; dr = dr1; dg = dg1; ddp += 3; i += 8; } dp = (mlib_u8 *)ddp; vis_alignaddr((void *)(width - i), 0); db = vis_faligndata(db, db); dg = vis_faligndata(dg, dg); dr = vis_faligndata(dr, dr); dp += ((width - i - 1) * 3); vis_alignaddr((void *)7, 0); for (; i < width; i++) { STORE_PIXEL(0, 1, 2); dp -= 3; } sp1 = sl1 = sl1 + y_stride; sp2 = sl2 = sl2 + uv_stride; sp3 = sl3 = sl3 + uv_stride; __mlib_VectorCopy_U8(dl, (mlib_u8 *)buf, width * 3); dl = dp = dl + rgb_stride; dp = (mlib_u8 *)buf; ddp = (mlib_d64 *)dp; } if (width * 3 > 16 * 1024) __mlib_free(tmp); return (MLIB_SUCCESS); }
/*
 * Widen a vector of signed 8-bit values into signed 32-bit values.
 *
 *   z   destination, n elements of mlib_s32
 *   x   source, n elements of mlib_s8
 *   n   element count
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * Elements are handled eight at a time with VIS instructions; at most one
 * leading element and up to seven trailing elements are converted with
 * plain scalar assignments.
 */
mlib_status
__mlib_VectorConvert_S32_S8_Mod(
    mlib_s32 *z,
    const mlib_s8 *x,
    mlib_s32 n)
{
    mlib_s8 *in = (mlib_s8 *)x;
    mlib_s32 *out = (mlib_s32 *)z;
    mlib_f32 ones16 = vis_to_float(0x10001);
    mlib_d64 c0100 = vis_to_double_dup(0x1000100);

    /* 8-byte view of the source and of the destination */
    mlib_d64 *in8, *out8;

    /* current eight source bytes, plus the neighbours used to realign */
    mlib_d64 cur, prev, next;

    /* 16-bit intermediates for the upper / lower four source bytes */
    mlib_d64 w_hi, w_lo;

    /* the four 8-byte results (two 32-bit elements each) */
    mlib_d64 q0, q1, q2, q3;
    mlib_s32 k = 0;

    if (n <= 0)
        return (MLIB_FAILURE);

    /* peel one element when the destination is not on an 8-byte boundary */
    if ((mlib_addr)out & 7) {
        *out = *in;
        out++;
        in++;
        k = 1;
    }

    in8 = (mlib_d64 *)vis_alignaddr(in, 0);
    cur = vis_ld_d64_nf(in8);

    /* byte selection used by every vis_bshuffle() below */
    vis_write_bmask(0x00012223, 0);

    if (((mlib_addr)in & 7) != 0) {
        /* source not 8-byte aligned: every group spans two doubles */
        next = vis_ld_d64_nf(in8 + 1);
        cur = vis_faligndata(cur, next);

#pragma pipeloop(1)
#pragma unroll(1)
        for (; k <= (n - 8); k += 8) {
            w_hi = vis_fmul8sux16(
                vis_fpmerge(vis_read_hi(cur), vis_read_hi(cur)),
                c0100);
            q0 = vis_bshuffle(w_hi, w_hi);
            q1 = vis_fmuld8ulx16(ones16, vis_read_lo(w_hi));

            w_lo = vis_fmul8sux16(
                vis_fpmerge(vis_read_lo(cur), vis_read_lo(cur)),
                c0100);
            q2 = vis_fmuld8ulx16(ones16, vis_read_hi(w_lo));
            q3 = vis_fmuld8ulx16(ones16, vis_read_lo(w_lo));

            /* fetch and realign the next group before storing */
            prev = next;
            next = vis_ld_d64_nf(in8 + 2);
            cur = vis_faligndata(prev, next);

            out8 = (mlib_d64 *)out;
            out8[0] = q0;
            out8[1] = q1;
            out8[2] = q2;
            out8[3] = q3;

            out += 8;
            in += 8;
            in8++;
        }
    } else {
        /* source 8-byte aligned: each double is one complete group */
#pragma pipeloop(1)
#pragma unroll(1)
        for (; k <= (n - 8); k += 8) {
            w_hi = vis_fmul8sux16(
                vis_fpmerge(vis_read_hi(cur), vis_read_hi(cur)),
                c0100);
            q0 = vis_bshuffle(w_hi, w_hi);
            q1 = vis_fmuld8ulx16(ones16, vis_read_lo(w_hi));

            w_lo = vis_fmul8sux16(
                vis_fpmerge(vis_read_lo(cur), vis_read_lo(cur)),
                c0100);
            q2 = vis_bshuffle(w_lo, w_lo);
            q3 = vis_fmuld8ulx16(ones16, vis_read_lo(w_lo));

            /* fetch the next group before storing */
            cur = vis_ld_d64_nf(in8 + 1);

            out8 = (mlib_d64 *)out;
            out8[0] = q0;
            out8[1] = q1;
            out8[2] = q2;
            out8[3] = q3;

            out += 8;
            in += 8;
            in8++;
        }
    }

    /* scalar tail: fewer than eight elements remain */
    while (k < n) {
        *out = *in;
        out++;
        in++;
        k++;
    }

    return (MLIB_SUCCESS);
}
/*
 * 2x2 convolution of an unsigned 16-bit image, 4-channel variant.
 *
 *   dst           destination image
 *   src           source image
 *   kernel        convolution coefficients (consumed by
 *                 LOAD_KERNEL_INTO_FLOAT)
 *   scalef_expon  scaling exponent; the GSR scale is set to
 *                 32 - scalef_expon
 *
 * Returns MLIB_SUCCESS at the end of the function.
 * NOTE(review): the helper macros are defined elsewhere and may contain
 * their own early returns (e.g. on allocation failure) - confirm.
 *
 * Two source rows are kept in intermediate buffers (sbuf1/sbuf2); each
 * output row is computed into an intermediate destination buffer (ddst)
 * and then copied out by the COPY_* macros.
 *
 * Most locals below are not referenced directly in this body; they are
 * presumably read and written by the macros, so their names must not be
 * changed.
 */
mlib_status
mlib_v_conv2x2_u16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
/* pointers to dst row */
    mlib_u16 *da, *d_a;

/* pointers to src, dst data */
    mlib_u16 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
    mlib_u16 *sa, *sa1;

/* pointers to rows in interm. src buf */
    mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointer to row in interm. dst buf */
    mlib_d64 *dbuf;

/* mlib_d64 pointers to rows in interm. src buf */
    mlib_d64 *s1, *s2;

/* mlib_d64 pointer to row in interm. dst buf */
    mlib_d64 *ddst;

/* data */
    mlib_d64 d1, d2, d_1, d_2;
    mlib_f32 k1, k2, k3, k4;

/* src, dst and interm. buf. strides */
    mlib_s32 dlb, slb, buf_slb;
    mlib_s32 dh, dw;
    mlib_d64 out0, out1, tmp0, tmp1, tmp2, tmp3;
    mlib_d64 *dsa, *dp;
    mlib_d64 sd0, sd1;
    mlib_s32 emask;
    int gsr_scale, i, j;

/* mask8000 toggles the top bit of every 16-bit lane */
    mlib_d64 ker_off, mask8000 = vis_to_double_dup(0x80008000);

    GET_SRC_DST_PARAMETERS();
    LOAD_KERNEL_INTO_FLOAT();

    gsr_scale = 32 - scalef_expon;
    vis_write_gsr((gsr_scale << 3));

/* intermediate row stride, in mlib_d64 units: (8 * dw + 16) bytes / 8 */
    buf_slb = (8 * dw + 16) >> 3;
    PREPARE_INTERM_BUFFERS();

/*
 * one column and one row fewer are produced than the source has;
 * dw is then counted in 16-bit samples (4 per pixel)
 */
    dw -= 1;
    dw *= 4;
    dh -= 1;

    sa = adr_src;
    sa1 = sa + slb;
    d_a = adr_dst;

/* load interm. src buff */
#pragma pipeloop(0)
    LOAD_LINE_INTO_BUFFER(sbuf2, sa, 4);

#pragma pipeloop(0)
    for (j = 0; j < dh; j++) {
        /*
         * NOTE(review): LOOP_INI presumably swaps sbuf1/sbuf2 and
         * resets s1, s2, ddst and da for this row - confirm.
         */
        LOOP_INI();

/* bring in the next source row (sa1) */
#pragma pipeloop(0)
        LOAD_LINE_INTO_BUFFER(sbuf2, sa1, 4);

        d1 = *s1;
        d2 = *s2;
        /*
         * toggle the top bit of each lane before the arithmetic
         * (presumably biasing u16 into the signed 16-bit range)
         */
        d1 = vis_fxor(d1, mask8000);
        d2 = vis_fxor(d2, mask8000);

/* one mlib_d64 (four 16-bit samples, i.e. one 4-channel pixel) per step */
#pragma pipeloop(0)
        for (i = 0; i < dw; i += 4) {
            /* the next pixel of each of the two rows */
            d_1 = *(s1 + 1);
            d_2 = *(s2 + 1);
            d_1 = vis_fxor(d_1, mask8000);
            d_2 = vis_fxor(d_2, mask8000);

            /* the four taps: current/next pixel of both rows */
            CONV_16_BEGIN(d1, k1);
            CONV_16(d2, k3);
            CONV_16(d_1, k2);
            CONV_16(d_2, k4);

            /* pack out0/out1 to 16 bits and toggle the top bits back */
            (*ddst++) = vis_fxor(vis_fpackfix_pair(out0, out1),
                mask8000);

            /* slide the window one pixel to the right */
            d1 = d_1;
            d2 = d_2;
            s1++;
            s2++;
        }

        /* copy the computed row from the interm. buffer to dst */
        PREPARE_TO_COPY_INTERM_BUF_TO_DST();

#pragma pipeloop(0)
        COPY_INTERM_BUF_TO_DST();
        COPY_TAIL();

        /* advance to the next source and destination rows */
        sa1 = sa1 + slb;
        d_a += dlb;
    }

    __mlib_free(buff_src);
    return (MLIB_SUCCESS);
}
mlib_status mlib_ImageMulAlpha_U8( mlib_u8 *sl, mlib_u8 *dl, mlib_s32 sstride, mlib_s32 dstride, mlib_s32 width, mlib_s32 height, mlib_s32 channel, mlib_s32 alpha) { mlib_f32 fzeros = vis_fzeros(); mlib_d64 dmask = vis_to_double_dup(0x00FF00FF); mlib_d64 done = vis_to_double_dup(0x01000100); mlib_d64 *buffs, *buffd; mlib_d64 *sp, *dp; mlib_d64 ss, s1, rr, d0, d1; mlib_d64 amask0, amask1, amask2; mlib_s32 ww, dflag, cmask, i, j; vis_write_gsr(7 << 3); width *= channel; ww = (width + 7) / 8; if (channel == 3) { ww = 3 * ((ww + 2) / 3); } buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww); if (buffs == NULL) { return (MLIB_FAILURE); } buffd = buffs + ww; if (channel == 4) { cmask = 1 << (3 - alpha); cmask |= (cmask << 4); } else if (channel == 3) { amask0 = ((mlib_d64 *)mlib_amask3_arr)[alpha]; amask1 = ((mlib_d64 *)mlib_amask3_arr)[alpha + 1]; amask2 = ((mlib_d64 *)mlib_amask3_arr)[alpha + 2]; } for (j = 0; j < height; j++) { if (((int)sl & 7)) { MEM_COPY(sl, buffs, width); sp = buffs; } else { sp = (mlib_d64 *)sl; } dflag = 0; if (((int)dl | width) & 7) { dp = buffd; dflag = 1; } else { dp = (mlib_d64 *)dl; } if (channel == 4) { mlib_d64 a0, a1; if (alpha == 0) { #pragma pipeloop(0) for (i = 0; i < ww; i++) { MUL_ALPHA_4CH(hi, au); } } else if (alpha == 1) { #pragma pipeloop(0) for (i = 0; i < ww; i++) { MUL_ALPHA_4CH(hi, al); } } else if (alpha == 2) { #pragma pipeloop(0) for (i = 0; i < ww; i++) { MUL_ALPHA_4CH(lo, au); } } else { /* if (alpha == 3) */ #pragma pipeloop(0) for (i = 0; i < ww; i++) { MUL_ALPHA_4CH(lo, al); } } } else if (channel == 3) { mlib_d64 s0, s1, s2; mlib_d64 a0, a1, a2; mlib_s32 cmask0, cmask1, cmask2; cmask0 = 0x492 >> alpha; cmask1 = 0x492 >> (alpha + 1); cmask2 = 0x492 >> (alpha + 2); if (alpha == 0) { vis_alignaddr((void *)0, 7); #pragma pipeloop(0) for (i = 0; i < ww - 3; i += 3) { LOAD_3CH_0(); MUL_ALPHA_3CH(); } if (i < ww) { LOAD_3CH_0_NF(); MUL_ALPHA_3CH(); } } else if (alpha == 1) { mlib_d64 b0, b1, b2; #pragma pipeloop(0) for (i = 
0; i < ww - 3; i += 3) { LOAD_3CH_1(); MUL_ALPHA_3CH(); } if (i < ww) { LOAD_3CH_1_NF(); MUL_ALPHA_3CH(); } } else { /* if (alpha == 2) */ vis_alignaddr((void *)0, 1); #pragma pipeloop(0) for (i = 0; i < ww - 3; i += 3) { LOAD_3CH_2(); MUL_ALPHA_3CH(); } if (i < ww) { LOAD_3CH_2_NF(); MUL_ALPHA_3CH(); } } } else { /* if (channel == 2) */ if (alpha == 0) {
#endif /* ! defined(__MEDIALIB_OLD_NAMES) */ /* *********************************************************** */ mlib_status __mlib_VideoP64Decimate_U8_U8( mlib_u8 *dst, const mlib_u8 *src, mlib_s32 width, mlib_s32 height, mlib_s32 dst_stride, mlib_s32 src_stride) { mlib_s32 x, y, x4 = width >> 2; mlib_d64 *sl1, *sl2, s1hi, s1lo, s2hi, s2lo, s1, s2; mlib_d64 done = vis_to_double_dup(0x1000100); mlib_d64 dmask; mlib_f32 *dp; mlib_f32 frnd = vis_to_float(0x40404040); mlib_s32 src_stride2 = 2 * src_stride; dmask = vis_fpadd16(done, vis_fone()); vis_write_gsr(7 << 3); sl1 = (mlib_d64 *)src; sl2 = (mlib_d64 *)(src + src_stride); dp = (mlib_f32 *)dst; for (y = 0; y < height; y++) { #pragma pipeloop(0) for (x = 0; x < x4; x++) { s1 = sl1[x];