/*
 * Affine-transform line kernel built on helper macros (DECLAREVAR,
 * CALC_*, LOAD_*, FILTER_MERGE, MAKE_2x2) that are defined elsewhere.
 *
 * The first loop produces output two pixels per iteration through the
 * macro pipeline and stores each result through buff1.  The second loop
 * handles whatever the first loop left over, one pixel at a time, and
 * stores a 16-bit result through dstPixelPtr.
 *
 * NOTE(review): i, size, buff1, res1, sPtr, srcStride, xFilter, vis_scale,
 * dstPixelPtr and the other temporaries are not declared in this body;
 * presumably DECLAREVAR / DECLAREVAR2 declare and initialise them --
 * confirm against the macro definitions.  filterX, filterY, lineAddr and
 * ws are not referenced directly here and are presumably consumed by the
 * macros.
 */
void
mlib_v_ImageAffineTableLine_8nw_2_2_1(
	mlib_d64 *buff,
	const mlib_d64 *filterX,
	const mlib_d64 *filterY,
	const mlib_u8 **lineAddr,
	mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;

	/* GSR image: 0x0145ABEF in the upper 32 bits, 4 in the lower 32 bits */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;

	/* main loop: two output pixels per iteration, one store through buff1 */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_2x2(row00, row10);
		FILTER_MERGE;
		MAKE_2x2;
		*buff1 = res1;
		buff1++;
	}

	/* the 16-bit tail stores continue where the buff1 stores stopped */
	dstPixelPtr = (mlib_s16 *)buff1;

	/* tail loop: remaining pixels, one per iteration */
#pragma pipeloop(0)
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, fy0);
		xFilter = vis_write_lo(xFilter, fx0);
		/*
		 * Pair up the samples at offsets 0/1 of the current row and at
		 * offsets srcStride/srcStride + 1 (presumably the next row).
		 */
		row00 = vis_fpmerge(LD_U8(sPtr, 0), LD_U8(sPtr, 1));
		row10 = vis_fpmerge(LD_U8(sPtr, srcStride),
		    LD_U8(sPtr, srcStride + 1));
		/* weight the two rows with the two halves of fy0 and add them */
		v0 = vis_fmul8x16au(vis_read_lo(row00), fy0);
		v1 = vis_fmul8x16al(vis_read_lo(row10), fy0);
		sum = vis_fpadd16(v0, v1);
		/* 16x16 multiply by xFilter, built from its sux and ulx halves */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);
		/* widen to 32 bits via vis_scale and add the two partial sums */
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_lo(v3));
		res = vis_write_lo(res, vis_fpadd32s(vis_read_hi(v2),
		    vis_read_lo(v2)));
		vis_st_u16(res, dstPixelPtr++);
	}
}
/*
 * Nearest-neighbour 4:2:0 upsampling of one source row.
 *
 * Every source byte is written twice in a row, and the same doubled row is
 * written to both dst0 and dst1, so n source bytes yield 2 * n bytes in
 * each destination.
 *
 *   dst0, dst1  destination rows, accessed through mlib_d64 pointers
 *   src         source row, accessed through an mlib_d64 pointer
 *   n           number of source bytes
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): the mlib_d64 accesses presumably require all three
 * pointers to be 8-byte aligned -- confirm against the caller.
 */
mlib_status
__mlib_VideoUpSample420_Nearest(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out_a = (mlib_d64 *)dst0;
	mlib_d64 *out_b = (mlib_d64 *)dst1;
	/* address of the last byte that belongs to the dst0 row */
	mlib_u8 *last_a = dst0 + 2 * n - 1;
	mlib_d64 pix, twice;
	mlib_s32 mask, done;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* full groups: 8 source bytes become 16 bytes in each output row */
#pragma pipeloop(0)
	for (done = 0; done <= (n - 8); done += 8) {
		pix = in[0];

		twice = vis_fpmerge(vis_read_hi(pix), vis_read_hi(pix));
		out_b[0] = twice;
		out_a[0] = twice;

		twice = vis_fpmerge(vis_read_lo(pix), vis_read_lo(pix));
		out_b[1] = twice;
		out_a[1] = twice;

		in++;
		out_a += 2;
		out_b += 2;
	}

	if (done >= n)
		return (MLIB_SUCCESS);

	/*
	 * Tail of 1..7 source bytes: non-faulting load, then edge-masked
	 * partial stores so nothing past the end of the rows is written.
	 * The mask is derived from the dst0 row and reused for dst1.
	 */
	pix = vis_ld_d64_nf(in);

	twice = vis_fpmerge(vis_read_hi(pix), vis_read_hi(pix));
	mask = vis_edge8(out_a, last_a);
	vis_pst_8(twice, out_a, mask);
	vis_pst_8(twice, out_b, mask);

	done += 4;
	if (done < n) {
		/* more than four bytes were left: emit the lower half too */
		out_a++;
		out_b++;
		twice = vis_fpmerge(vis_read_lo(pix), vis_read_lo(pix));
		mask = vis_edge8(out_a, last_a);
		vis_pst_8(twice, out_a, mask);
		vis_pst_8(twice, out_b, mask);
	}

	return (MLIB_SUCCESS);
}
/*
 * 4:2:2 horizontal downsampling: every pair of adjacent source bytes is
 * replaced by (a + b + bias) >> 1, where bias alternates 0, 1, 0, 1, ...
 * from one output sample to the next.
 *
 *   dst  destination, receives one byte per source pair
 *   src  source samples
 *   n    number of source samples
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): src and dst are accessed through mlib_d64 / mlib_f32
 * pointers in the vector loop, which presumably requires suitable
 * alignment; the scalar tail reads src[i + 1], so n is presumably
 * expected to be even -- confirm both against the callers.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in64 = (mlib_d64 *)src;
	mlib_f32 *out32 = (mlib_f32 *)dst;
	mlib_u8 *out8;
	mlib_d64 raw, mix, split;
	mlib_d64 even16, odd16, sum16;
	/* per-lane rounding term: 0, 1, 0, 1 across the four 16-bit lanes */
	mlib_d64 lane_bias = vis_to_double_dup(0x1);
	/* multiplier that widens u8 lanes to 16 bits without scaling */
	mlib_f32 unit = vis_to_float(0x1000000);
	mlib_s32 k;
	mlib_s32 toggle = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* scale factor 6: the pack below halves each 16-bit sum */
	vis_write_gsr(6 << 3);

	/* vector part: 8 source bytes in, 4 averaged bytes out */
#pragma pipeloop(0)
	for (k = 0; k <= n - 8; k += 8) {
		raw = *in64;
		in64++;

		/* two merges leave even bytes in the upper half, odd in the lower */
		mix = vis_fpmerge(vis_read_hi(raw), vis_read_lo(raw));
		split = vis_fpmerge(vis_read_hi(mix), vis_read_lo(mix));

		even16 = vis_fmul8x16au(vis_read_hi(split), unit);
		odd16 = vis_fmul8x16au(vis_read_lo(split), unit);

		sum16 = vis_fpadd16(even16, odd16);
		sum16 = vis_fpadd16(sum16, lane_bias);

		*out32 = vis_fpack16(sum16);
		out32++;
	}

	/* scalar tail continues right after the last vector store */
	out8 = (mlib_u8 *)out32;
	while (k < n) {
		*out8 = (src[k] + src[k + 1] + toggle) >> 1;
		out8++;
		/* alternate the rounding term between 0 and 1 */
		toggle ^= 1;
		k += 2;
	}

	return (MLIB_SUCCESS);
}
/*
 * De-interleaves a 3-component S16 pixel array into three separate arrays.
 *
 *   color1, color2, color3  destinations, written 4 elements (one
 *                           mlib_d64) at a time
 *   colors                  interleaved source, read 3 mlib_d64 words
 *                           (12 elements = 4 pixels) at a time
 *   n                       number of pixels
 *
 * NOTE(review): the mlib_d64 accesses presumably require 8-byte aligned
 * pointers -- confirm against the caller.
 * NOTE(review): the text of this definition stops after the tail block;
 * no return statement or closing brace follows it in the file.
 */
mlib_status
__mlib_VideoColorSplit3_S16(
	mlib_s16 *color1,
	mlib_s16 *color2,
	mlib_s16 *color3,
	const mlib_s16 *colors,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 *dp2 = (mlib_d64 *)color3;
	mlib_d64 sd0, sd1, sd2, dd0, dd1, dd2, dd3;
	mlib_s32 i;

	/* alignment offset 4 for vis_faligndata, byte pattern for vis_bshuffle */
	vis_write_gsr(4);
	vis_write_bmask(0x02CE13DF, 0);

	/* main loop: 4 pixels (3 source words) -> one word per destination */
#pragma pipeloop(0)
#pragma unroll(4)
	for (i = 0; i <= (n - 4); i += 4) {
		sd0 = sp[0];
		sd1 = sp[1];
		sd2 = sp[2];
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		(*dp0++) = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		(*dp1++) = vis_bshuffle(dd3, dd3);
		(*dp2++) = vis_bshuffle(dd1, dd2);
		sp += 3;
	}

	/*
	 * tail: fewer than 4 pixels remain
	 */
	if (i < n) {
		/* low four mask bits select the leading (n & 3) 16-bit elements */
		mlib_s32 emask = 0xF0 >> (n & 3);
		mlib_d64 st0, st1, st2;

		sd0 = sp[0];
		/* non-faulting loads: these words may lie past the source end */
		sd1 = vis_ld_d64_nf(sp + 1);
		sd2 = vis_ld_d64_nf(sp + 2);
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		st0 = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		st1 = vis_bshuffle(dd3, dd3);
		st2 = vis_bshuffle(dd1, dd2);
		/* partial stores write only the elements selected by emask */
		vis_pst_16(st0, dp0, emask);
		vis_pst_16(st1, dp1, emask);
		vis_pst_16(st2, dp2, emask);
	}
/*
 * Adds a block of 16-bit values to an 8x8 block of unsigned bytes in
 * place, packing each sum back to a byte with vis_fpack16_pair.
 *
 *   curr_block  8 rows of 8 bytes, rows `stride` bytes apart; updated
 *               in place
 *   mc_block    64 consecutive mlib_s16 values, 8 per row
 *   stride      distance in bytes between rows of curr_block
 *
 * Always returns MLIB_SUCCESS.
 *
 * NOTE(review): both blocks are accessed through mlib_d64 pointers, so
 * 8-byte alignment of curr_block, stride and mc_block is presumably
 * required -- confirm against the caller.
 */
mlib_status
__mlib_VideoAddBlock_U8_S16(
	mlib_u8 *curr_block,
	const mlib_s16 *mc_block,
	mlib_s32 stride)
{
	mlib_f32 zero4 = vis_fzeros();
	/* multiplier that widens u8 lanes to 16 bits without scaling */
	mlib_f32 unit = vis_to_float(0x100);
	mlib_d64 *delta = (mlib_d64 *)mc_block;
	mlib_d64 *row;
	mlib_d64 pix, add_hi, add_lo, wide_hi, wide_lo;
	mlib_s32 r;

	/* scale factor 7 for the pack below */
	vis_write_gsr(7 << 3);

#pragma pipeloop(0)
	for (r = 0; r < 8; r++) {
		row = (mlib_d64 *)(curr_block + r * stride);

		pix = row[0];
		add_hi = delta[2 * r];
		add_lo = delta[2 * r + 1];

		/* widen the eight bytes of the row to two groups of four s16 */
		wide_hi = vis_fpmerge(zero4, vis_read_hi(pix));
		wide_lo = vis_fmul8x16al(vis_read_lo(pix), unit);

		add_hi = vis_fpadd16(add_hi, wide_hi);
		add_lo = vis_fpadd16(add_lo, wide_lo);

		row[0] = vis_fpack16_pair(add_hi, add_lo);
	}

	return (MLIB_SUCCESS);
}
/*
 * Converts interleaved YUV 4:4:4 (3 bytes per pixel) to packed UYVY
 * 4:2:2, one 32-bit word per pair of pixels.  As the scalar path shows,
 * each output word is (U << 24) | (Y0 << 16) | (V << 8) | Y1, where U and
 * V are (a + b) >> 1 of the two pixels' chroma samples.
 *
 *   uyvy  destination image, 32-bit words
 *   yuv   source image, bytes in Y, U, V order per pixel
 *   w     width in pixels (halved below to a count of output words)
 *   h     number of rows
 *   dlb   destination row stride in bytes (converted to words below)
 *   slb   source row stride in bytes
 *
 * Each row takes one of two paths: a direct path when both row pointers
 * are 8-byte aligned, and a vis_faligndata-based path otherwise.
 *
 * NOTE(review): the text of this definition stops at the end of the
 * unaligned branch; the closing braces of the row loop and of the function
 * do not follow it in the file.
 */
void
__mlib_VideoColorYUV444int_to_UYVY422int(
	mlib_u32 *uyvy,
	const mlib_u8 *yuv,
	mlib_s32 w,
	mlib_s32 h,
	mlib_s32 dlb,
	mlib_s32 slb)
{
	mlib_s32 i, val_y0, val_y1, val_u0, val_v0, count, left;

	/* uyvy is advanced in 32-bit words, so turn the byte stride into words */
	dlb >>= 2;
	/* from here on w counts output words (pixel pairs) */
	w >>= 1;
	/* count: groups of four output words; left: words after those groups */
	count = w >> 2;
	left = w - (count << 2);

	if (w == 0 || h == 0)
		return;

	/* scale factor 6 for the vis_fpack16 calls that average the chroma */
	vis_write_gsr(6 << 3);

	for (i = 0; i < h; i++, yuv += slb, uyvy += dlb) {
		if ((((mlib_addr)yuv | (mlib_addr)uyvy) & 7) == 0) {
			/* both row pointers 8-byte aligned: direct loads and stores */
			mlib_d64 w_y, w_u, w_v, w_uv, w_tmp0, w_tmp1, w_acc0, w_acc1;
			mlib_d64 w_ld0, w_ld1, w_ld2;
			mlib_f32 v_one = vis_to_float(0x1000000);
			mlib_f32 v_u, v_v;
			mlib_s32 j;

			/* 24 source bytes (8 pixels) -> 16 output bytes (4 words) */
#pragma pipeloop(0)
			for (j = 0; j < count; j++) {
				w_ld0 = ((mlib_d64 *)yuv)[3 * j];
				w_ld1 = ((mlib_d64 *)yuv)[3 * j + 1];
				w_ld2 = ((mlib_d64 *)yuv)[3 * j + 2];
				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1, w_ld2);
				/*
				 * U: two merges separate even and odd samples, which
				 * are widened, added pairwise and packed.
				 */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_u), vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0));
				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one);
				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				/* V: same sequence as for U */
				w_tmp0 = vis_fpmerge(vis_read_hi(w_v), vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0));
				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one);
				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				/* interleave U with V, then the chroma pairs with Y */
				w_uv = vis_fpmerge(v_u, v_v);
				((mlib_d64 *)uyvy)[2 * j] = VIS_FPMERGE_HI(w_uv, w_y);
				((mlib_d64 *)uyvy)[2 * j + 1] = VIS_FPMERGE_LO(w_uv, w_y);
			}

			if (left) {
				/*
				 * Row tail: compute a full group into a local buffer
				 * from non-faulting loads and copy out only `left`
				 * 32-bit words.
				 */
				mlib_d64 res_buf[2];

				w_ld0 = vis_ld_d64_nf((mlib_d64 *)yuv + 3 * count);
				w_ld1 = vis_ld_d64_nf((mlib_d64 *)yuv + 3 * count + 1);
				w_ld2 = vis_ld_d64_nf((mlib_d64 *)yuv + 3 * count + 2);
				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1, w_ld2);
				w_tmp0 = vis_fpmerge(vis_read_hi(w_u), vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0));
				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one);
				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_tmp0 = vis_fpmerge(vis_read_hi(w_v), vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0));
				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one);
				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);
				res_buf[0] = VIS_FPMERGE_HI(w_uv, w_y);
				res_buf[1] = VIS_FPMERGE_LO(w_uv, w_y);
				for (j = 0; j < left; j++) {
					((mlib_f32 *)uyvy)[4 * count + j] = ((mlib_f32 *)res_buf)[j];
				}
			}
		} else {
			/* at least one row pointer is not 8-byte aligned */
			mlib_d64 w_y, w_u, w_v, w_uv, w_tmp0, w_tmp1, w_acc0, w_acc1;
			mlib_d64 w_ld0, w_ld1, w_ld2;
			mlib_f32 v_one = vis_to_float(0x1000000);
			mlib_f32 v_u, v_v;
			mlib_s32 j;
			mlib_d64 *al_addr;
			mlib_d64 l0, l1, l2, l3;
			const mlib_u8 *pyuv = yuv;
			mlib_u32 *puyvy = uyvy;

			if ((mlib_addr)puyvy & 7) {
				/*
				 * Destination is off an 8-byte boundary: emit the first
				 * output word with scalar code, which moves puyvy on by
				 * one 32-bit word, and shrink the row by one word.
				 */
				val_y0 = yuv[0];
				val_y1 = yuv[3];
				val_u0 = (yuv[1] + yuv[4]) >> 1;
				val_v0 = (yuv[2] + yuv[5]) >> 1;
				puyvy[0] = (val_u0 << 24) | (val_y0 << 16) | (val_v0 << 8) | val_y1;
				pyuv += 6;
				puyvy++;
				count = (w - 1) >> 2;
				left = (w - 1) - (count << 2);
			} else {
				count = w >> 2;
				left = w - (count << 2);
			}

			/* aligned base of the source; GSR keeps the byte offset */
			al_addr = vis_alignaddr((void *)pyuv, 0);
			l0 = vis_ld_d64_nf(al_addr);
			al_addr++;

#pragma pipeloop(0)
			for (j = 0; j < count; j++) {
				l1 = (*al_addr++);
				l2 = (*al_addr++);
				l3 = vis_ld_d64_nf(al_addr);
				al_addr++;
				/* rebuild three source words from four aligned loads */
				w_ld0 = vis_faligndata(l0, l1);
				w_ld1 = vis_faligndata(l1, l2);
				w_ld2 = vis_faligndata(l2, l3);
				/* the last aligned word starts the next group */
				l0 = l3;
				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1, w_ld2);
				w_tmp0 = vis_fpmerge(vis_read_hi(w_u), vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0));
				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one);
				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_tmp0 = vis_fpmerge(vis_read_hi(w_v), vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0));
				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one);
				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);
				((mlib_d64 *)puyvy)[2 * j] = VIS_FPMERGE_HI(w_uv, w_y);
				((mlib_d64 *)puyvy)[2 * j + 1] = VIS_FPMERGE_LO(w_uv, w_y);
			}

			if (left) {
				/* row tail, same local-buffer scheme as the aligned path */
				mlib_d64 res_buf[2];

				l1 = vis_ld_d64_nf(al_addr);
				al_addr++;
				l2 = vis_ld_d64_nf(al_addr);
				al_addr++;
				l3 = vis_ld_d64_nf(al_addr);
				w_ld0 = vis_faligndata(l0, l1);
				w_ld1 = vis_faligndata(l1, l2);
				w_ld2 = vis_faligndata(l2, l3);
				MLIB_SPLIT3_U8(w_y, w_u, w_v, w_ld0, w_ld1, w_ld2);
				w_tmp0 = vis_fpmerge(vis_read_hi(w_u), vis_read_lo(w_u));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0));
				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one);
				v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_tmp0 = vis_fpmerge(vis_read_hi(w_v), vis_read_lo(w_v));
				w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0));
				w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one);
				w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one);
				v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1));
				w_uv = vis_fpmerge(v_u, v_v);
				res_buf[0] = VIS_FPMERGE_HI(w_uv, w_y);
				res_buf[1] = VIS_FPMERGE_LO(w_uv, w_y);
				for (j = 0; j < left; j++) {
					((mlib_f32 *)puyvy)[4 * count + j] = ((mlib_f32 *)res_buf)[j];
				}
			}

			/* restore the full-row counts for the next row */
			count = w >> 2;
			left = w - (count << 2);
		}
/*
 * Converts a vector of signed 8-bit values to signed 32-bit values.
 *
 *   z  destination, n mlib_s32 elements
 *   x  source, n mlib_s8 elements
 *   n  number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * At most one leading element is converted with scalar code to bring the
 * destination to an 8-byte boundary; then eight elements are converted
 * per vector iteration (four mlib_d64 stores), and the remainder is
 * converted with scalar code.
 */
mlib_status
__mlib_VectorConvert_S32_S8_Mod(
	mlib_s32 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *in = (mlib_s8 *)x;
	mlib_s32 *out = (mlib_s32 *)z;
	/* multiplier operands for the 16 -> 32 and 8 -> 16 bit widening */
	mlib_f32 unit32 = vis_to_float(0x10001);
	mlib_d64 unit16 = vis_to_double_dup(0x1000100);
	mlib_d64 *in64, *out64;
	mlib_d64 cur, prev, next;
	mlib_d64 wide, q0, q1, q2, q3;
	mlib_s32 k = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* one scalar element puts the destination on an 8-byte boundary */
	if ((mlib_addr)out & 7) {
		*out = *in;
		out++;
		in++;
		k = 1;
	}

	in64 = (mlib_d64 *)vis_alignaddr(in, 0);
	cur = vis_ld_d64_nf(in64);
	/* byte pattern used by the vis_bshuffle calls below */
	vis_write_bmask(0x00012223, 0);

	if ((mlib_addr)in & 7) {
		/* source not 8-byte aligned: build each word from two loads */
		next = vis_ld_d64_nf(in64 + 1);
		cur = vis_faligndata(cur, next);

#pragma pipeloop(1)
#pragma unroll(1)
		for (; k <= (n - 8); k += 8) {
			/* upper four bytes -> q0, q1 */
			wide = vis_fpmerge(vis_read_hi(cur), vis_read_hi(cur));
			wide = vis_fmul8sux16(wide, unit16);
			q0 = vis_bshuffle(wide, wide);
			q1 = vis_fmuld8ulx16(unit32, vis_read_lo(wide));

			/* lower four bytes -> q2, q3 */
			wide = vis_fpmerge(vis_read_lo(cur), vis_read_lo(cur));
			wide = vis_fmul8sux16(wide, unit16);
			q2 = vis_fmuld8ulx16(unit32, vis_read_hi(wide));
			q3 = vis_fmuld8ulx16(unit32, vis_read_lo(wide));

			/* fetch and realign the next eight source bytes */
			prev = next;
			next = vis_ld_d64_nf(in64 + 2);
			cur = vis_faligndata(prev, next);

			out64 = (mlib_d64 *)out;
			out64[0] = q0;
			out64[1] = q1;
			out64[2] = q2;
			out64[3] = q3;

			out += 8;
			in += 8;
			in64++;
		}
	} else {
		/* source 8-byte aligned: one load per eight elements */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; k <= (n - 8); k += 8) {
			/* upper four bytes -> q0, q1 */
			wide = vis_fpmerge(vis_read_hi(cur), vis_read_hi(cur));
			wide = vis_fmul8sux16(wide, unit16);
			q0 = vis_bshuffle(wide, wide);
			q1 = vis_fmuld8ulx16(unit32, vis_read_lo(wide));

			/* lower four bytes -> q2, q3 */
			wide = vis_fpmerge(vis_read_lo(cur), vis_read_lo(cur));
			wide = vis_fmul8sux16(wide, unit16);
			q2 = vis_bshuffle(wide, wide);
			q3 = vis_fmuld8ulx16(unit32, vis_read_lo(wide));

			cur = vis_ld_d64_nf(in64 + 1);

			out64 = (mlib_d64 *)out;
			out64[0] = q0;
			out64[1] = q1;
			out64[2] = q2;
			out64[3] = q3;

			out += 8;
			in += 8;
			in64++;
		}
	}

	/* scalar tail */
	while (k < n) {
		*out = *in;
		out++;
		in++;
		k++;
	}

	return (MLIB_SUCCESS);
}
/*
 * Converts a vector of unsigned 8-bit values to signed 32-bit values.
 *
 *   z  destination, n mlib_s32 elements
 *   x  source, n mlib_u8 elements
 *   n  number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * At most one leading element is converted with scalar code to bring the
 * destination to an 8-byte boundary; then eight elements are converted
 * per vector iteration (four mlib_d64 stores), and the remainder is
 * converted with scalar code.
 */
mlib_status
__mlib_VectorConvert_S32_U8_Mod(
	mlib_s32 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *psrc = (mlib_u8 *)x;
	mlib_s32 *pdst = (mlib_s32 *)z;
	/* vis_fzeros() is the single-precision zero that matches mlib_f32 */
	mlib_f32 fzero = vis_fzeros(), fone1 = vis_to_float(0x100), fone2 =
	    vis_to_float(0x10001);
	mlib_d64 *dpsrc, dsrc0, dsrc1, dsrc, dst0, dst1, dst2, dst3;
	mlib_s32 i = 0, off;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* one scalar element puts the destination on an 8-byte boundary */
	if ((mlib_addr)pdst & 7) {
		(*pdst++) = (*psrc++);
		i = 1;
	}

	dpsrc = (mlib_d64 *)vis_alignaddr(psrc, 0);
	/*
	 * Non-faulting load: when n == 1 and the element above was consumed,
	 * psrc already points one past the source, so the aligned word at
	 * dpsrc may lie entirely outside the array.
	 */
	dsrc = vis_ld_d64_nf(dpsrc);
	off = (mlib_addr)psrc & 7;

	if (off) {
		/* source not 8-byte aligned */
		dsrc1 = dsrc;
		/* alignment offset 7 for the vis_faligndata in the loop */
		vis_alignaddr((void *)0, 7);
		/*
		 * The shuffle both applies the source offset and interleaves
		 * the bytes so that the low byte of each 16-bit lane is the
		 * next element to widen.
		 */
		vis_write_bmask(0x11111111 * off, 0x40516273);

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i <= (n - 8); i += 8) {
			dsrc0 = dsrc1;
			dsrc1 = vis_ld_d64_nf(dpsrc + 1);
			dsrc = vis_bshuffle(dsrc0, dsrc1);
			/* elements 0..3 sit in the low bytes of the lanes */
			dst0 = vis_fmuld8ulx16(vis_read_hi(dsrc), fone2);
			dst1 = vis_fmuld8ulx16(vis_read_lo(dsrc), fone2);
			/* shift by 7 bytes so elements 4..7 reach the low bytes */
			dsrc = vis_faligndata(dsrc, dsrc);
			dst2 = vis_fmuld8ulx16(vis_read_hi(dsrc), fone2);
			dst3 = vis_fmuld8ulx16(vis_read_lo(dsrc), fone2);
			((mlib_d64 *)pdst)[0] = dst0;
			((mlib_d64 *)pdst)[1] = dst1;
			((mlib_d64 *)pdst)[2] = dst2;
			((mlib_d64 *)pdst)[3] = dst3;
			pdst += 8;
			psrc += 8;
			dpsrc++;
		}
	} else {
		/* source 8-byte aligned: one load per eight elements */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; i <= (n - 8); i += 8) {
			/* upper four bytes: widen to 16 bits, then zero-extend */
			dst1 = vis_fmul8x16al(vis_read_hi(dsrc), fone1);
			dst0 = vis_fpmerge(fzero, vis_read_hi(dst1));
			dst1 = vis_fpmerge(fzero, vis_read_lo(dst1));
			/* lower four bytes: duplicate, then widen the low bytes */
			dst3 = vis_fpmerge(vis_read_lo(dsrc), vis_read_lo(dsrc));
			dst2 = vis_fmuld8ulx16(vis_read_hi(dst3), fone2);
			dst3 = vis_fmuld8ulx16(vis_read_lo(dst3), fone2);
			dsrc = vis_ld_d64_nf(dpsrc + 1);
			((mlib_d64 *)pdst)[0] = dst0;
			((mlib_d64 *)pdst)[1] = dst1;
			((mlib_d64 *)pdst)[2] = dst2;
			((mlib_d64 *)pdst)[3] = dst3;
			pdst += 8;
			psrc += 8;
			dpsrc++;
		}
	}

	/* scalar tail */
	for (; i < n; i++)
		(*pdst++) = (*psrc++);

	return (MLIB_SUCCESS);
}
void mlib_v_VideoColorYUV2RGB444_all_align( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4]; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f375); mlib_d64 k02 = vis_to_double_dup(0x3317e5fa); mlib_d64 k11 = vis_to_double_dup(0xf3754097); mlib_d64 k12 = vis_to_double_dup(0xe5fa0000); mlib_d64 k21 = vis_to_double_dup(0x40970000); mlib_d64 k22 = vis_to_double_dup(0x00003317); mlib_d64 c_0 = vis_to_double_dup(0xe42010f4); mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60); mlib_d64 c_2 = vis_to_double_dup(0xdd60e420); mlib_d64 k_0 = vis_to_double_dup(0x25432543); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = n >> 2; buff2 = pbuff_arr2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)u; sf2 = (mlib_f32 *)v; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; mlib_d64 d_0235, d_xx14, d_23xx, d_0145; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); /* * merge buff values to 3-channel array */ d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); pfd[0] = vis_read_hi(d_0145); pfd[1] = vis_read_hi(d_23xx); pfd[2] = vis_read_lo(d_0145); buff2 += 
2; pfd += 3; } if ((mlib_u8 *)pfd <= dend) { mlib_d64 d_0235, d_xx14, d_23xx, d_0145; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_23xx); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; u += n; v += n; rgb += 3 * n; size -= n; } while (size); }
/*
 * Converts n ARGB pixels (4 bytes each) to n bytes in y and n >> 1 bytes
 * in each of cb and cr.
 *
 *   y, cb, cr  destination arrays
 *   argb       source pixels, read as mlib_d64 words (2 pixels per word)
 *   n          number of pixels
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The main loop consumes eight pixels per iteration and writes eight y
 * bytes, four cb bytes and four cr bytes.  Leftover pixels are computed
 * from non-faulting loads and written with edge-masked partial stores.
 *
 * NOTE(review): K11..K33 and the CHANNELSEPARATE_U8_422 /
 * CHANNELWEIGHT_U8_2p macros are defined elsewhere.  sd04, sd26, sd15 and
 * sd37 are not used directly in this body; presumably the macros use them
 * as scratch variables -- confirm before renaming or removing them.
 * NOTE(review): the mlib_d64 / mlib_f32 accesses presumably require
 * aligned pointers -- confirm against the caller.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
	/* one-past-the-end addresses, taken before n is rescaled below */
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;

	/* first weight row scaled by 8192, second and third rows by 4096 */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
	/* offsets handed to the weighting macro */
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

	/* scale factor 2 for the vis_fpack16 calls */
	vis_write_gsr(2 << 3);

	/* from here on n counts complete groups of eight pixels */
	n = n >> 3;

#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1);
		/* first weight row -> eight y bytes */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));
		/* second weight row -> four cb bytes (z0 + z1 per output) */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));
		/* third weight row -> four cr bytes */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));
		py++;
		pcb++;
		pcr++;
	}

	if ((mlib_u8 *)pcb < cbend) {
		/* leftover pixels: same arithmetic, masked stores */
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

		sd01 = (*sp++);
		/* non-faulting loads: these words may lie past the source end */
		sd23 = vis_ld_d64_nf(sp);
		sp++;
		sd45 = vis_ld_d64_nf(sp);
		sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);

		/*
		 * The chroma mask is computed from pcb and reused for pcr.
		 * Its upper four bits are set when pcb is in the first half of
		 * an 8-byte word; otherwise the data goes in the second half
		 * of the word that starts one mlib_f32 earlier.
		 */
		cmask = vis_edge8(pcb, cbend - 1);

		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1, cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1, cmask);
		}
	}

	return (MLIB_SUCCESS);
}
/*
 * Scaled conversion from 8-bit gray source pixels to 32-bit ARGB
 * destination pixels.
 *
 * For every destination row the source row is selected with
 * (syloc >> shift) and each destination pixel reads the source byte at
 * (tmpsxloc >> shift), with tmpsxloc stepping by sxinc per pixel and syloc
 * by syinc per row.
 *
 * NOTE(review): srcBase, dstBase, width, height, sxloc, syloc, sxinc,
 * syinc, shift, pSrcInfo and pDstInfo are not declared in this body;
 * presumably they come from the SCALE_PARAMS macro -- confirm.
 * LOAD_NEXT_U8, PTR_ADD and Gray2Argb are defined elsewhere.
 */
void
ADD_SUFF(ByteGrayToIntArgbScaleConvert)(SCALE_PARAMS)
{
	mlib_s32 dstScan = pDstInfo->scanStride;
	mlib_s32 srcScan = pSrcInfo->scanStride;
	mlib_d64 d0, d1, d2, d3, dd;
	/* aa: four 0xFF bytes, merged in as the alpha channel */
	mlib_f32 ff, aa = vis_fones();
	mlib_s32 i, j, x;

	/* narrow images: plain scalar conversion, no VIS state touched */
	if (width < 16) {
		for (j = 0; j < height; j++) {
			mlib_u8 *src = srcBase;
			mlib_s32 *dst = dstBase;
			mlib_s32 tmpsxloc = sxloc;

			PTR_ADD(src, (syloc >> shift) * srcScan);

			for (i = 0; i < width; i++) {
				x = src[tmpsxloc >> shift];
				tmpsxloc += sxinc;
				dst[i] = Gray2Argb(x);
			}

			PTR_ADD(dstBase, dstScan);
			syloc += syinc;
		}
		return;
	}

	/* alignment offset 7, presumably consumed by LOAD_NEXT_U8 */
	vis_alignaddr(NULL, 7);

	for (j = 0; j < height; j++) {
		mlib_u8 *src = srcBase;
		mlib_s32 *dst = dstBase;
		mlib_s32 *dst_end;
		mlib_s32 tmpsxloc = sxloc;

		PTR_ADD(src, (syloc >> shift) * srcScan);

		dst_end = dst + width;

		/* four destination pixels per iteration */
#pragma pipeloop(0)
		for (; dst <= (dst_end - 4); dst += 4) {
			/* gather the four source bytes, last pixel first */
			LOAD_NEXT_U8(dd, src + ((tmpsxloc + 3*sxinc) >> shift));
			LOAD_NEXT_U8(dd, src + ((tmpsxloc + 2*sxinc) >> shift));
			LOAD_NEXT_U8(dd, src + ((tmpsxloc + sxinc) >> shift));
			LOAD_NEXT_U8(dd, src + ((tmpsxloc ) >> shift));
			tmpsxloc += 4*sxinc;
			ff = vis_read_hi(dd);
			/* d0: alpha/gray byte pairs, d1: gray/gray byte pairs */
			d0 = vis_fpmerge(aa, ff);
			d1 = vis_fpmerge(ff, ff);
			/* interleave into alpha, gray, gray, gray per pixel */
			d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1));
			d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1));
			/* 32-bit stores: dst is only known to be a mlib_s32 pointer */
			((mlib_f32*)dst)[0] = vis_read_hi(d2);
			((mlib_f32*)dst)[1] = vis_read_lo(d2);
			((mlib_f32*)dst)[2] = vis_read_hi(d3);
			((mlib_f32*)dst)[3] = vis_read_lo(d3);
		}

		/* up to three remaining pixels of the row */
		while (dst < dst_end) {
			x = src[tmpsxloc >> shift];
			tmpsxloc += sxinc;
			*dst++ = Gray2Argb(x);
		}

		PTR_ADD(dstBase, dstScan);
		syloc += syinc;
	}
}
static mlib_status mlib_v_VideoColorYUV2RGB420_nonalign( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 rgb_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* pointers to src address */ mlib_u8 *sp2, *sp3, *sl2, *sl3; /* pointers to src address */ mlib_u8 *sp11, *sp12, *sl11, *sl12; /* pointers to dst address */ mlib_u8 *dp1, *dl1; /* pointers to dst address */ mlib_u8 *dp2, *dl2; /* all. pointer to y */ mlib_d64 *spy1, *spy2; /* all. pointers to u, v */ mlib_f32 *dfu, *dfv; /* y data */ mlib_d64 dy0, dy1, dy2, dy3, dy4, dy5; /* u, v data */ mlib_f32 fu0, fu1, fv0, fv1; mlib_d64 du, dv, du0, du1, dv0, dv1; /* (1.1644, 1.5966)*8192 */ mlib_f32 k12 = vis_to_float(0x25433317); /* (-.3920, -.8132)*8192 */ mlib_f32 k34 = vis_to_float(0xf375e5fa); /* 2.0184*8192 */ mlib_f32 k5 = vis_to_float(0x1004097); mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0); mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4); mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0); mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi; mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo; mlib_d64 y_11644_hi, y_11644_lo; mlib_d64 z_11644_hi, z_11644_lo; mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo; mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi, temp_b_lo; /* loop variables */ mlib_s32 i, j; mlib_s32 y_stride2 = 2 * y_stride; mlib_s32 rgb_stride2 = 2 * rgb_stride; mlib_s32 off2, off3; mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21; mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22; mlib_d64 *buf1, BUFF1[16 * 1024]; mlib_d64 *buf2, BUFF2[16 * 1024]; mlib_u8 *tmp1, *tmp2; if (width * 3 > 16 * 1024) { tmp1 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); tmp2 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); buf1 = (mlib_d64 *)((mlib_addr)(tmp1 + 7) & ~7); buf2 = (mlib_d64 *)((mlib_addr)(tmp2 + 7) & ~7); } else { buf1 = (mlib_d64 *)BUFF1; buf2 = (mlib_d64 *)BUFF2; } /* * 
initialize GSR scale factor */ vis_write_gsr(2 << 3); sp11 = sl11 = (mlib_u8 *)y; sp12 = sl12 = (mlib_u8 *)y + y_stride; sp2 = sl2 = (mlib_u8 *)u; sp3 = sl3 = (mlib_u8 *)v; dp1 = (mlib_u8 *)buf1; dp2 = (mlib_u8 *)buf2; dl1 = (mlib_u8 *)rgb; dl2 = (mlib_u8 *)(rgb + rgb_stride); ddp1 = (mlib_d64 *)dp1; ddp2 = (mlib_d64 *)dp2; /* * row loop */ for (j = 0; j < height / 2; j++) { spy1 = (mlib_d64 *)vis_alignaddr(sp11, 0); spy2 = (mlib_d64 *)vis_alignaddr(sp12, 0); dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3); off2 = (sp2 - (mlib_u8 *)dfu) * 2; dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3); off3 = (sp3 - (mlib_u8 *)dfv) * 2; vis_alignaddr((void *)off2, 0); fu0 = (*dfu++); fu1 = vis_ld_f32_nf(dfu); dfu++; du0 = vis_fpmerge(fu0, fu0); du1 = vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; vis_alignaddr((void *)off3, 0); fv0 = (*dfv++); fv1 = vis_ld_f32_nf(dfv); dfv++; dv0 = vis_fpmerge(fv0, fv0); dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); /* U*(-0.3920); */ u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); /* V*(-0.8132); */ v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); dy0 = (*spy1++); dy4 = (*spy2++); dy3 = vis_ld_d64_nf(spy1); spy1++; vis_alignaddr(sp11, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; dy5 = vis_ld_d64_nf(spy2); spy2++; vis_alignaddr(sp12, 0); dy2 = vis_faligndata(dy4, dy5); dy4 = dy5; /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); /* 
Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* Z*1.1644 */ z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* Z*1.1644 */ z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, y_11644_hi); green1 = vis_fpack16_to_hi(green1, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue1 = vis_fpack16_to_hi(blue1, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); vis_alignaddr((void *)off2, 0); fu1 = vis_ld_f32_nf(dfu); dfu++; du1 = vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; red1 = vis_fpack16_to_hi(red1, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); vis_alignaddr((void *)off3, 0); fv1 = vis_ld_f32_nf(dfv); dfv++; dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; green1 = vis_fpack16_to_lo(green1, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue1 = vis_fpack16_to_lo(blue1, temp_b_lo); red1 = vis_fpack16_to_lo(red1, temp_r_lo); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); temp_g_hi = vis_fpadd16(g_hi, z_11644_hi); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); temp_b_hi = vis_fpadd16(b_hi, z_11644_hi); green2 = vis_fpack16_to_hi(green2, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, z_11644_hi); blue2 = vis_fpack16_to_hi(blue2, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, z_11644_lo); red2 = vis_fpack16_to_hi(red2, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, z_11644_lo); green2 = vis_fpack16_to_lo(green2, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, z_11644_lo); blue2 = vis_fpack16_to_lo(blue2, temp_b_lo); red2 = vis_fpack16_to_lo(red2, temp_r_lo); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); /* * 16-pixel column loop */ #pragma pipeloop(0) for (i = 0; i <= width - 8; i += 8) { 
vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(red1, green1); dd02 = vis_bshuffle(red2, green2); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(red1, green1); dd12 = vis_bshuffle(red2, green2); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(red1, green1); dd22 = vis_bshuffle(red2, green2); vis_write_bmask(0x01834967, 0); ddp1[0] = vis_bshuffle(dd01, blue1); ddp2[0] = vis_bshuffle(dd02, blue2); vis_write_bmask(0xA12B45C7, 0); ddp1[1] = vis_bshuffle(dd11, blue1); ddp2[1] = vis_bshuffle(dd12, blue2); vis_write_bmask(0x0D23E56F, 0); ddp1[2] = vis_bshuffle(dd21, blue1); ddp2[2] = vis_bshuffle(dd22, blue2); dy3 = vis_ld_d64_nf(spy1); spy1++; vis_alignaddr(sp11, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; dy5 = vis_ld_d64_nf(spy2); spy2++; vis_alignaddr(sp12, 0); dy2 = vis_faligndata(dy4, dy5); dy4 = dy5; /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); /* Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* Z*1.1644 */ z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* Z*1.1644 */ z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, y_11644_hi); green1 = vis_fpack16_to_hi(green1, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue1 = vis_fpack16_to_hi(blue1, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); vis_alignaddr((void *)off2, 0); fu1 = vis_ld_f32_nf(dfu); dfu++; du1 = 
vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; red1 = vis_fpack16_to_hi(red1, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); vis_alignaddr((void *)off3, 0); fv1 = vis_ld_f32_nf(dfv); dfv++; dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; green1 = vis_fpack16_to_lo(green1, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue1 = vis_fpack16_to_lo(blue1, temp_b_lo); red1 = vis_fpack16_to_lo(red1, temp_r_lo); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); temp_g_hi = vis_fpadd16(g_hi, z_11644_hi); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); temp_b_hi = vis_fpadd16(b_hi, z_11644_hi); green2 = vis_fpack16_to_hi(green2, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, z_11644_hi); blue2 = vis_fpack16_to_hi(blue2, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, z_11644_lo); red2 = vis_fpack16_to_hi(red2, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, z_11644_lo); green2 = vis_fpack16_to_lo(green2, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, z_11644_lo); blue2 = vis_fpack16_to_lo(blue2, temp_b_lo); red2 = vis_fpack16_to_lo(red2, temp_r_lo); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); ddp1 += 3; ddp2 += 3; } dp1 = (mlib_u8 *)ddp1; dp2 = (mlib_u8 *)ddp2; vis_alignaddr((void *)(width - i), 0); blue1 = vis_faligndata(blue1, blue1); green1 = vis_faligndata(green1, green1); red1 = vis_faligndata(red1, red1); dp1 += ((width - i - 1) * 3); blue2 = vis_faligndata(blue2, blue2); green2 = vis_faligndata(green2, green2); red2 = vis_faligndata(red2, red2); dp2 += ((width - i - 1) * 3); vis_alignaddr((void *)7, 0); for (; i < width; i++) { STORE_PIXEL1(0, 1, 2); STORE_PIXEL2(0, 1, 2); dp1 -= 3; dp2 -= 3; } sp11 = sl11 = sl11 + y_stride2; sp12 = sl12 = sl12 + y_stride2; sp2 = sl2 = sl2 + uv_stride; sp3 = sl3 = sl3 + uv_stride; __mlib_VectorCopy_U8(dl1, (mlib_u8 *)buf1, width * 3); __mlib_VectorCopy_U8(dl2, (mlib_u8 *)buf2, width * 3); dl1 = dp1 = dl1 + 
rgb_stride2; dl2 = dp2 = dl2 + rgb_stride2; dp1 = (mlib_u8 *)buf1; dp2 = (mlib_u8 *)buf2; ddp1 = (mlib_d64 *)dp1; ddp2 = (mlib_d64 *)dp2; } if (width * 3 > 16 * 1024) { __mlib_free(tmp1); __mlib_free(tmp2); } return (MLIB_SUCCESS); }
/*
 * Converts a vector of unsigned 8-bit values to signed 8-bit values with
 * saturation: values above MLIB_S8_MAX become MLIB_S8_MAX.
 *
 *   z  destination, n mlib_s8 elements
 *   x  source, n mlib_u8 elements
 *   n  number of elements
 *
 * Returns MLIB_SUCCESS from this body.
 *
 * NOTE(review): vectors shorter than 16 elements are handed to the
 * PACK_U_S macro, which is defined elsewhere; presumably it converts the
 * whole vector, validates n and returns from the function -- confirm.
 * The macro may reference the locals declared here by name, so their
 * names are kept.
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *src = (void *)x;
	mlib_s8 *dst = z;
	/* single-precision zero: only ever used as an mlib_f32 operand */
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_u8 c;
	/* 0x0080 in every 16-bit lane: bias added before the pack */
	mlib_d64 dsp = vis_to_double_dup(0x800080);
	/* 0x80 in every byte: flips the top bit after the pack */
	mlib_d64 rst = vis_to_double_dup(0x80808080);
	/* multiplier that widens u8 lanes to 16 bits without scaling */
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
	}

	/*
	 * First, try to align destination address for 8 bytes .
	 */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
		length--;
	}

	/* len_64 full 8-byte words, then rest_64 trailing bytes */
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;

	/* scale factor 7 for vis_fpack16_pair */
	vis_write_gsr(7 << 3);

	/*
	 * Per word: widen to 16 bits, add 128, pack back to bytes (which
	 * clamps at 255), then flip the top bit.  The net effect matches the
	 * scalar min(value, MLIB_S8_MAX) used for the head and the tail.
	 */

	/*
	 * Now analyze source address alignment.
	 */
	if (((mlib_addr)src & 7) == 0) {
		/*
		 * Source address is also 8-byte aligned.
		 */
		dsrc = (mlib_d64 *)src;

		/*
		 * Peeling the 1st iteration.
		 */
		i = len_64 & 1;
		if (i) {
			d1 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
		}

		/*
		 * Then loop with step==2. Unroll for 2 iterations.
		 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d4 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			d2 = vis_fpmerge(fzero, vis_read_hi(d4));
			d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
			(*ddst++) = vis_fxor(d4, rst);
		}
	} else {
		/*
		 * Source address has arbitrary alignment. Use vis_alignaddr() and
		 * vis_faligndata() functions.
		 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

		/*
		 * Peeling of 1 iteration.
		 */
		i = len_64 & 1;
		if (i) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d1 = vis_faligndata(d1, d2);
			d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
			d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpadd16(dsp, d4);
			d1 = vis_fpack16_pair(d3, d4);
			(*ddst++) = vis_fxor(d1, rst);
		}

		/*
		 * Then loop with step==2.
		 */
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d6 = vis_faligndata(d1, d2);
			d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d3 = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d6 = vis_fpack16_pair(d4, d5);
			(*ddst++) = vis_fxor(d3, rst);
			(*ddst++) = vis_fxor(d6, rst);
		}
	}

	/* scalar tail: the bytes after the last full 8-byte word */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c =
		    src[even_length + i]) > MLIB_S8_MAX ? MLIB_S8_MAX : c;

	return (MLIB_SUCCESS);
}
/*
 * mlib_v_VideoColorYUV2ABGR422_nonalign
 *
 * YUV 4:2:2 to ABGR conversion that makes no alignment assumption about
 * any of its four pointers: Y and the destination go through
 * vis_alignaddr()/vis_faligndata(), U and V are read from the enclosing
 * 4-byte words and realigned the same way, and all destination writes
 * are masked partial stores (vis_pst_8).
 *
 *   abgr         destination image, 4 bytes per pixel; the alpha byte of
 *                each pixel is not written (see emask)
 *   y            luma plane, one byte per pixel
 *   u, v         chroma planes; every chroma byte is duplicated with
 *                vis_fpmerge(f, f), i.e. one U/V sample serves two
 *                horizontally adjacent pixels
 *   width        row width in pixels
 *   height       number of rows
 *   abgr_stride  byte distance between destination rows
 *   y_stride     byte distance between luma rows
 *   uv_stride    byte distance between chroma rows (same for U and V)
 *
 * Always returns MLIB_SUCCESS.
 *
 * The statement order inside the loops is hand-scheduled and depends on
 * the GSR alignment value that the interleaved vis_alignaddr() calls set
 * up for the following vis_faligndata(); do not reorder without care.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR422_nonalign(
    mlib_u8 *abgr,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 abgr_stride,
    mlib_s32 y_stride,
    mlib_s32 uv_stride)
{
    /* current / row-start pointers into the u and v planes */
    mlib_u8 *sp2, *sp3, *sl2, *sl3;
    /* current / row-start pointers into the y plane */
    mlib_u8 *sp1, *sl1;
    /* current / row-start / last-byte pointers into the destination */
    mlib_u8 *dp, *dl, *dend;
    /* aligned pointer to y */
    mlib_d64 *spy;
    /* aligned pointer to dst */
    mlib_d64 *dpp;
    /* u, v data */
    mlib_f32 fu0, fu1, fv0, fv1;
    /* y data */
    mlib_d64 dy0, dy1, dy3;
    mlib_d64 du, dv;
    /* (1.1644, 1.5966)*8192 */
    mlib_f32 k12 = vis_to_float(0x25433317);
    /* (-.3920, -.8132)*8192 */
    mlib_f32 k34 = vis_to_float(0xf375e5fa);
    /* low half: 2.0184*8192; the high half (0x0100) is used for green */
    mlib_f32 k5 = vis_to_float(0x1004097);
    /* additive/subtractive offsets for the red, green and blue sums */
    mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
    mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
    mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
    mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
    mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
    mlib_d64 y_11644_hi, y_11644_lo;
    mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
    mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi, temp_b_lo;
    mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo;
    mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo;
    /* dd0/dd1 carry the previous/current 8 output bytes between stores */
    mlib_d64 dd, dd0, dd1;
    /* loop variables */
    mlib_s32 i, j;
    /* store mask pattern: the alpha channel is not written */
    mlib_s32 emask = 0x7777;
    /* mask for the first (possibly partial) store of a row */
    mlib_s32 emask1;
    /* alignment value used when shifting output data into place */
    mlib_s32 off;
    mlib_f32 *dfu, *dfv;
    mlib_d64 du0, du1, dv0, dv1;
    /* alignment values for the duplicated u and v data */
    mlib_s32 off2, off3;
    /* destination advance after the first store of a row (0 or 1) */
    mlib_s32 inc;

    /*
     * initialize GSR scale factor
     */
    vis_write_gsr(2 << 3);

    sp1 = sl1 = (mlib_u8 *)y;
    sp2 = sl2 = (mlib_u8 *)u;
    sp3 = sl3 = (mlib_u8 *)v;
    dl = dp = (mlib_u8 *)abgr;

    /*
     * row loop
     */
    for (j = 0; j < height; j++) {
        spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
        dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
        /* round the chroma pointers down to a 4-byte boundary */
        dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
        /* misalignment is doubled because each chroma byte is duplicated */
        off2 = (sp2 - (mlib_u8 *)dfu) * 2;
        dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
        off3 = (sp3 - (mlib_u8 *)dfv) * 2;
        /* last destination byte of this row */
        dend = dp + width * 4 - 1;
        emask1 = vis_edge8(dp, dend);
        /* i = destination misalignment within its 8-byte word */
        i = dp - (mlib_u8 *)dpp;
        /* shift the alpha-skip pattern to match that misalignment */
        emask >>= i;
        /* advance after the first store only if the edge mask is partial */
        inc = (emask1 != 0xff);
        emask1 &= emask;
        off = 8 - i;

        /* prime the u pipeline: two words, duplicated, then realigned */
        vis_alignaddr((void *)off2, 0);
        fu0 = vis_ld_f32_nf(dfu);
        dfu++;
        fu1 = vis_ld_f32_nf(dfu);
        dfu++;
        du0 = vis_fpmerge(fu0, fu0);
        du1 = vis_fpmerge(fu1, fu1);
        du = vis_faligndata(du0, du1);
        du0 = du1;

        /* prime the v pipeline the same way */
        vis_alignaddr((void *)off3, 0);
        fv0 = vis_ld_f32_nf(dfv);
        dfv++;
        fv1 = vis_ld_f32_nf(dfv);
        dfv++;
        dv0 = vis_fpmerge(fv0, fv0);
        dv1 = vis_fpmerge(fv1, fv1);
        dv = vis_faligndata(dv0, dv1);
        dv0 = dv1;

        /* U*(-0.3920); */
        u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
        /* V*(-0.8132); */
        v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
        /* U*(-0.3920); */
        u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
        /* V*(-0.8132); */
        v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

        /* prime the y pipeline */
        vis_alignaddr(sp1, 0);
        dy0 = vis_ld_d64_nf(spy);
        spy++;
        dy3 = vis_ld_d64_nf(spy);
        spy++;
        dy1 = vis_faligndata(dy0, dy3);
        dy0 = dy3;

        /*
         * column loop: 8 pixels (four 8-byte stores) per iteration
         */
#pragma pipeloop(0)
        for (i = 0; i <= width - 8; i += 8) {
            /* U*2.0184 */
            u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
            g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
            u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
            g_hi = vis_fpadd16(g_hi, k_135_6352);
            /* V*1.5966 */
            v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
            g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
            v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
            g_lo = vis_fpadd16(g_lo, k_135_6352);
            /* Y*1.1644 */
            y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
            b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
            /* Y*1.1644 */
            y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
            b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
            r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
            r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
            temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
            temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
            green_hi = vis_fpack16(temp_g_hi);
            temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
            blue_hi = vis_fpack16(temp_b_hi);
            temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
            red_hi = vis_fpack16(temp_r_hi);
            temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

            /* fetch and realign the next group of u samples */
            vis_alignaddr((void *)off2, 0);
            fu1 = vis_ld_f32_nf(dfu);
            dfu++;
            du1 = vis_fpmerge(fu1, fu1);
            du = vis_faligndata(du0, du1);
            du0 = du1;

            green_lo = vis_fpack16(temp_g_lo);
            temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);
            blue_lo = vis_fpack16(temp_b_lo);
            /* widen green to 16-bit lanes using the high half of k5 */
            x_green_hi = vis_fmul8x16au(green_hi, k5);
            red_lo = vis_fpack16(temp_r_lo);
            blue_red_hi = vis_fpmerge(blue_hi, red_hi);
            x_green_lo = vis_fmul8x16au(green_lo, k5);
            blue_red_lo = vis_fpmerge(blue_lo, red_lo);

            /* fetch and realign the next group of v samples */
            vis_alignaddr((void *)off3, 0);
            fv1 = vis_ld_f32_nf(dfv);
            dfv++;
            dv1 = vis_fpmerge(fv1, fv1);
            dv = vis_faligndata(dv0, dv1);
            dv0 = dv1;

            /* switch the alignment to the destination offset for stores */
            vis_alignaddr((void *)off, 0);
            /* U*(-0.3920); */
            u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
            dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
                vis_read_hi(blue_red_hi));
            dd = vis_faligndata(dd0, dd1);
            /* first store of the row uses the edge mask emask1 */
            vis_pst_8(dd, dpp, emask1);
            dpp += inc;
            inc = 1;
            /* V*(-0.8132); */
            v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
            dd0 = vis_fpmerge(vis_read_lo(x_green_hi),
                vis_read_lo(blue_red_hi));
            dd = vis_faligndata(dd1, dd0);
            vis_pst_8(dd, dpp++, emask);
            u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
            dd1 = vis_fpmerge(vis_read_hi(x_green_lo),
                vis_read_hi(blue_red_lo));
            dd = vis_faligndata(dd0, dd1);
            vis_pst_8(dd, dpp++, emask);
            v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
            dd0 = vis_fpmerge(vis_read_lo(x_green_lo),
                vis_read_lo(blue_red_lo));
            dd = vis_faligndata(dd1, dd0);
            vis_pst_8(dd, dpp++, emask);

            /* fetch and realign the next 8 y samples */
            vis_alignaddr(sp1, 0);
            dy3 = vis_ld_d64_nf(spy);
            spy++;
            dy1 = vis_faligndata(dy0, dy3);
            dy0 = dy3;
            emask1 = emask;
        }

        /*
         * Row tail: fewer than 8 pixels remain. The colour math is the
         * same as above (without further loads); output is stored two
         * pixels (one 8-byte word) at a time.
         */
        if (i < width) {
            vis_alignaddr((void *)off, 0);
            /* U*2.0184 */
            u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
            g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
            u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
            g_hi = vis_fpadd16(g_hi, k_135_6352);
            /* V*1.5966 */
            v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
            g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
            v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
            g_lo = vis_fpadd16(g_lo, k_135_6352);
            /* Y*1.1644 */
            y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
            b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
            /* Y*1.1644 */
            y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
            b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
            r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
            r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
            temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
            temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
            green_hi = vis_fpack16(temp_g_hi);
            temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
            blue_hi = vis_fpack16(temp_b_hi);
            temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
            red_hi = vis_fpack16(temp_r_hi);
            temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
            green_lo = vis_fpack16(temp_g_lo);
            temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);
            blue_lo = vis_fpack16(temp_b_lo);
            x_green_hi = vis_fmul8x16au(green_hi, k5);
            red_lo = vis_fpack16(temp_r_lo);
            blue_red_hi = vis_fpmerge(blue_hi, red_hi);
            x_green_lo = vis_fmul8x16au(green_lo, k5);
            blue_red_lo = vis_fpmerge(blue_lo, red_lo);
            dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
                vis_read_hi(blue_red_hi));
            dd = vis_faligndata(dd0, dd1);
            vis_pst_8(dd, dpp, emask1);
            dd0 = dd1;
            dpp += inc;
            i += 2;

            if (i < width) {
                dd1 = vis_fpmerge(vis_read_lo(x_green_hi),
                    vis_read_lo(blue_red_hi));
                dd = vis_faligndata(dd0, dd1);
                vis_pst_8(dd, dpp++, emask);
                dd0 = dd1;
                i += 2;

                if (i < width) {
                    dd1 = vis_fpmerge(vis_read_hi (x_green_lo),
                        vis_read_hi(blue_red_lo));
                    dd = vis_faligndata(dd0, dd1);
                    vis_pst_8(dd, dpp++, emask);
                    dd0 = dd1;
                }
            }
        }

        /*
         * Flush the bytes still held in dd0/dd1, limited by the edge
         * mask computed against the last byte of the row.
         */
        vis_alignaddr((void *)off, 0);
        emask1 = vis_edge8(dpp, dend);
        emask1 &= emask;
        dd = vis_faligndata(dd0, dd1);
        vis_pst_8(dd, dpp, emask1);

        /* advance all planes to the next row and reset the mask pattern */
        sp1 = sl1 = sl1 + y_stride;
        sp2 = sl2 = sl2 + uv_stride;
        sp3 = sl3 = sl3 + uv_stride;
        dl = dp = dl + abgr_stride;
        emask = 0x7777;
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoUpSample420
 *
 * Produces two output rows of 2*n samples each from three source rows of
 * n samples each. dst0 is computed from src0 and src1, dst1 from src2 and
 * src1; in both cases src1 carries weight 3 and the other row weight 1
 * (see the scalar formulas for the edge samples at the end of the
 * function, and the fone/fthree multipliers). Horizontally, neighbouring
 * column sums are again combined with weights 3 and 1 (fone1/fthree1),
 * rounding constants 8 and 7 are added (deight/dseven), and the result is
 * packed back to bytes.
 *
 *   dst0, dst1        destination rows, 2*n bytes each
 *   src0, src1, src2  source rows, n bytes each
 *   n                 number of source samples per row
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): all five pointers are cast directly to mlib_d64 * and
 * dereferenced, so they are presumably required to be 8-byte aligned --
 * confirm against the public API description.
 */
mlib_status
__mlib_VideoUpSample420(
    mlib_u8 *dst0,
    mlib_u8 *dst1,
    const mlib_u8 *src0,
    const mlib_u8 *src1,
    const mlib_u8 *src2,
    mlib_s32 n)
{
    /* last byte of the first destination row; used for edge masks */
    mlib_u8 *dend0 = dst0 + 2 * n - 1;
    mlib_d64 *dp0 = (mlib_d64 *)dst0;
    mlib_d64 *dp1 = (mlib_d64 *)dst1;
    mlib_d64 *sp0 = (mlib_d64 *)src0;
    mlib_d64 *sp1 = (mlib_d64 *)src1;
    mlib_d64 *sp2 = (mlib_d64 *)src2;
    mlib_d64 d00, d01, d10, d11, d20, d21;
    /*
     * Vertical column sums in 16-bit lanes. Suffix 0 belongs to the
     * src0/src1 pair (dst0), suffix 1 to the src2/src1 pair (dst1);
     * "last" is the group being output, "this" the group just loaded,
     * "shift" the last group shifted by one lane (GSR align is 2).
     */
    mlib_d64 thiscolsum0_hi, thiscolsum0_lo, lastcolsum0_hi, lastcolsum0_lo;
    mlib_d64 shiftcolsum0_hi, shiftcolsum0_lo;
    mlib_d64 thiscolsum1_hi, thiscolsum1_lo, lastcolsum1_hi, lastcolsum1_lo;
    mlib_d64 shiftcolsum1_hi, shiftcolsum1_lo;
    mlib_d64 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
    mlib_d64 ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7;
    mlib_d64 data0, data1, data2, data3, tmp0, tmp1;
    /* vertical weights (1 and 3) for vis_fmul8x16au */
    mlib_f32 fone = vis_to_float(0x4000000);
    mlib_f32 fthree = vis_to_float(0xC000000);
    /* horizontal weights (1 and 3) for vis_fmul8x16 */
    mlib_f32 fone1 = vis_to_float(0x40404040);
    mlib_f32 fthree1 = vis_to_float(0xC0C0C0C0);
    /* rounding constants */
    mlib_d64 dseven = vis_to_double_dup(0x70007);
    mlib_d64 deight = vis_to_double_dup(0x80008);
    mlib_s32 i, emask;

    if (n <= 0)
        return (MLIB_FAILURE);

    /* GSR: scale factor 3 for the packs, align offset 2 for faligndata */
    vis_write_gsr((3 << 3) + 2);

    /* prologue: column sums of the first 8 source samples */
    d00 = vis_ld_d64_nf(sp0);
    d10 = vis_ld_d64_nf(sp1);
    d20 = vis_ld_d64_nf(sp2);
    sp0++;
    sp1++;
    sp2++;
    lastcolsum0_hi = vis_fmul8x16au(vis_read_hi(d00), fone);
    lastcolsum0_lo = vis_fmul8x16au(vis_read_lo(d00), fone);
    lastcolsum1_hi = vis_fmul8x16au(vis_read_hi(d20), fone);
    lastcolsum1_lo = vis_fmul8x16au(vis_read_lo(d20), fone);
    tmp0 = vis_fmul8x16au(vis_read_hi(d10), fthree);
    tmp1 = vis_fmul8x16au(vis_read_lo(d10), fthree);
    lastcolsum0_hi = vis_fpadd16(lastcolsum0_hi, tmp0);
    lastcolsum0_lo = vis_fpadd16(lastcolsum0_lo, tmp1);
    lastcolsum1_hi = vis_fpadd16(lastcolsum1_hi, tmp0);
    lastcolsum1_lo = vis_fpadd16(lastcolsum1_lo, tmp1);

    /*
     * Main loop: each iteration loads the next 8 source samples of every
     * row and emits the 16 output samples belonging to the previous 8.
     * The bound is n - 8 (strict) so that the final group is always left
     * to the tail code below.
     */
#pragma pipeloop(0)
    for (i = 0; i < n - 8; i += 8) {
        d01 = *sp0;
        d11 = *sp1;
        d21 = *sp2;
        sp0++;
        sp1++;
        sp2++;
        /* vertical pass for the newly loaded group */
        thiscolsum0_hi = vis_fmul8x16au(vis_read_hi(d01), fone);
        thiscolsum0_lo = vis_fmul8x16au(vis_read_lo(d01), fone);
        thiscolsum1_hi = vis_fmul8x16au(vis_read_hi(d21), fone);
        thiscolsum1_lo = vis_fmul8x16au(vis_read_lo(d21), fone);
        tmp0 = vis_fmul8x16au(vis_read_hi(d11), fthree);
        tmp1 = vis_fmul8x16au(vis_read_lo(d11), fthree);
        thiscolsum0_hi = vis_fpadd16(thiscolsum0_hi, tmp0);
        thiscolsum0_lo = vis_fpadd16(thiscolsum0_lo, tmp1);
        thiscolsum1_hi = vis_fpadd16(thiscolsum1_hi, tmp0);
        thiscolsum1_lo = vis_fpadd16(thiscolsum1_lo, tmp1);
        /* horizontal pass: weighted contribution of the current column */
        acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
        acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
        acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
        acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
        acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
        acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
        acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
        acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);
        /* column sums shifted by one lane, pulling in the next group */
        shiftcolsum0_hi = vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
        shiftcolsum0_lo = vis_faligndata(lastcolsum0_lo, thiscolsum0_hi);
        shiftcolsum1_hi = vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
        shiftcolsum1_lo = vis_faligndata(lastcolsum1_lo, thiscolsum1_hi);
        /* rounding */
        acc0 = vis_fpadd16(acc0, deight);
        acc1 = vis_fpadd16(acc1, deight);
        acc2 = vis_fpadd16(acc2, dseven);
        acc3 = vis_fpadd16(acc3, dseven);
        acc4 = vis_fpadd16(acc4, deight);
        acc5 = vis_fpadd16(acc5, deight);
        acc6 = vis_fpadd16(acc6, dseven);
        acc7 = vis_fpadd16(acc7, dseven);
        /* weighted contribution of the neighbouring column */
        ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
        ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
        ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
        ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
        ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
        ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
        ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
        ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);
        acc0 = vis_fpadd16(acc0, ac0);
        acc1 = vis_fpadd16(acc1, ac1);
        acc2 = vis_fpadd16(acc2, ac2);
        acc3 = vis_fpadd16(acc3, ac3);
        acc4 = vis_fpadd16(acc4, ac4);
        acc5 = vis_fpadd16(acc5, ac5);
        acc6 = vis_fpadd16(acc6, ac6);
        acc7 = vis_fpadd16(acc7, ac7);
        /* back to bytes, then interleave the two output phases */
        data0 = vis_fpack16_pair(acc0, acc1);
        data1 = vis_fpack16_pair(acc2, acc3);
        data2 = vis_fpack16_pair(acc4, acc5);
        data3 = vis_fpack16_pair(acc6, acc7);
        dp0[0] = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
        dp0[1] = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0));
        dp1[0] = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));
        dp1[1] = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2));
        dp0 += 2;
        dp1 += 2;
        lastcolsum0_hi = thiscolsum0_hi;
        lastcolsum0_lo = thiscolsum0_lo;
        lastcolsum1_hi = thiscolsum1_hi;
        lastcolsum1_lo = thiscolsum1_lo;
    }

    /*
     * Tail: output for the final (possibly partial) group. No further
     * source data exists, so the shifted sums reuse the last group, and
     * the stores are limited with an edge mask against dend0.
     */
    if (i < n) {
        acc0 = vis_fmul8x16(fone1, lastcolsum0_hi);
        acc1 = vis_fmul8x16(fone1, lastcolsum0_lo);
        acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi);
        acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo);
        acc4 = vis_fmul8x16(fone1, lastcolsum1_hi);
        acc5 = vis_fmul8x16(fone1, lastcolsum1_lo);
        acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi);
        acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo);
        shiftcolsum0_hi = vis_faligndata(lastcolsum0_hi, lastcolsum0_lo);
        shiftcolsum0_lo = vis_faligndata(lastcolsum0_lo, lastcolsum0_lo);
        shiftcolsum1_hi = vis_faligndata(lastcolsum1_hi, lastcolsum1_lo);
        shiftcolsum1_lo = vis_faligndata(lastcolsum1_lo, lastcolsum1_lo);
        acc0 = vis_fpadd16(acc0, deight);
        acc1 = vis_fpadd16(acc1, deight);
        acc2 = vis_fpadd16(acc2, dseven);
        acc3 = vis_fpadd16(acc3, dseven);
        acc4 = vis_fpadd16(acc4, deight);
        acc5 = vis_fpadd16(acc5, deight);
        acc6 = vis_fpadd16(acc6, dseven);
        acc7 = vis_fpadd16(acc7, dseven);
        ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi);
        ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo);
        ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi);
        ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo);
        ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi);
        ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo);
        ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi);
        ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo);
        acc0 = vis_fpadd16(acc0, ac0);
        acc1 = vis_fpadd16(acc1, ac1);
        acc2 = vis_fpadd16(acc2, ac2);
        acc3 = vis_fpadd16(acc3, ac3);
        acc4 = vis_fpadd16(acc4, ac4);
        acc5 = vis_fpadd16(acc5, ac5);
        acc6 = vis_fpadd16(acc6, ac6);
        acc7 = vis_fpadd16(acc7, ac7);
        data0 = vis_fpack16_pair(acc0, acc1);
        data1 = vis_fpack16_pair(acc2, acc3);
        data2 = vis_fpack16_pair(acc4, acc5);
        data3 = vis_fpack16_pair(acc6, acc7);
        acc0 = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0));
        acc1 = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2));
        /* the same mask is applied to both rows */
        emask = vis_edge8(dp0, dend0);
        vis_pst_8(acc0, dp0, emask);
        vis_pst_8(acc1, dp1, emask);
        i += 4;
        dp0++;
        dp1++;

        if (i < n) {
            acc0 = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0));
            acc1 = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2));
            emask = vis_edge8(dp0, dend0);
            vis_pst_8(acc0, dp0, emask);
            vis_pst_8(acc1, dp1, emask);
        }
    }

    /*
     * Second pass over both destination rows with GSR align set to 7:
     * each stored word is rebuilt from the previous and the current word
     * via vis_faligndata. NOTE(review): this looks like a one-byte shift
     * of the rows produced above -- confirm against the VIS manual.
     */
    vis_write_gsr(7);
    dp0 = (mlib_d64 *)dst0;
    dp1 = (mlib_d64 *)dst1;
    ac0 = *dp0;
    ac2 = *dp1;
#pragma pipeloop(0)
    for (i = 0; i < 2 * n - 8; i += 8) {
        ac1 = *dp0;
        ac3 = *dp1;
        *dp0 = vis_faligndata(ac0, ac1);
        *dp1 = vis_faligndata(ac2, ac3);
        dp0++;
        dp1++;
        ac0 = ac1;
        ac2 = ac3;
    }

    /* last word of the second pass, written through an edge mask */
    if (i < 2 * n) {
        ac1 = vis_ld_d64_nf(dp0);
        ac3 = vis_ld_d64_nf(dp1);
        emask = vis_edge8(dp0, dend0);
        acc0 = vis_faligndata(ac0, ac1);
        acc1 = vis_faligndata(ac2, ac3);
        vis_pst_8(acc0, dp0, emask);
        vis_pst_8(acc1, dp1, emask);
    }

    /*
     * The first and last sample of each output row have no horizontal
     * neighbour on one side and are computed with scalar code.
     */
    dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4;
    dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4;
    dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4;
    dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4;

    return (MLIB_SUCCESS);
}
/*
 * mlib_v_VideoColorYUV2ABGR411_dst_nonalign
 *
 * YUV 4:1:1 to ABGR conversion for destinations of arbitrary alignment.
 * Each row of luma is first realigned into a temporary buffer; chroma is
 * read one byte at a time (four U and four V bytes per 16 pixels), so
 * every U/V sample serves four horizontally adjacent pixels. Output is
 * written with masked partial stores and the alpha byte of each pixel is
 * not written (see emask).
 *
 *   abgr         destination image, 4 bytes per pixel
 *   y            luma plane, one byte per pixel
 *   u, v         chroma planes
 *   width        row width in pixels
 *   height       number of rows
 *   abgr_stride  byte distance between destination rows
 *   y_stride     byte distance between luma rows
 *   uv_stride    byte distance between chroma rows (same for U and V)
 *
 * Returns MLIB_FAILURE if the temporary row buffer cannot be allocated,
 * MLIB_SUCCESS otherwise. The buffer is released before returning.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR411_dst_nonalign(
    mlib_u8 *abgr,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 abgr_stride,
    mlib_s32 y_stride,
    mlib_s32 uv_stride)
{
    /* current (sp*) and row-start (sl*) pointers into y, u and v */
    mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;
    /* current / row-start / last-byte pointers into the destination */
    mlib_u8 *dp, *dl, *dend;
    /* aligned pointer to y */
    mlib_d64 *spy;
    /* aligned pointer to dst */
    mlib_d64 *dpp;
    /* u, v data */
    mlib_f32 fu, fv;
    /* y data */
    mlib_d64 dy0, dy1, dy2;
    mlib_d64 ddy1, ddy2, ddy3, ddy4;
    mlib_d64 du0, du1;
    mlib_d64 dv1, dv2;
    mlib_d64 dr, dr1, dr2, dr3, dr4;
    mlib_d64 dg, dg1, dg2, dg3, dg4;
    mlib_d64 db, db1, db2, db3, db4;
    /* dd0/dd1 carry the previous/current 8 output bytes between stores */
    mlib_d64 dd, dd0, dd1, dtmp;
    /* 4-byte staging words used to load u, v into mlib_f32 */
    mlib_f32 ffu[1], ffv[1];
    /* byte views of ffu/ffv, filled one byte at a time */
    mlib_u8 *ufu, *vfu;
    /* 1.1644 * 4096 */
    mlib_f32 f0 = vis_to_float(0x12a1);
    /* 2.0184 * 8192 */
    mlib_f32 f1 = vis_to_float(0x4097);
    /* -0.3920 * 8192 */
    mlib_f32 f4 = vis_to_float(0xf375);
    /* -0.8132 * 8192 */
    mlib_f32 f5 = vis_to_float(0xe5fa);
    /* 1.5966 * 8192 */
    mlib_f32 f8 = vis_to_float(0x3317);
    /* -276.9856 * 32 */
    mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);
    /* 135.6352 * 32 */
    mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);
    /* -222.9952 * 32 */
    mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
    /* used to broadcast one chroma lane across four pixel lanes */
    mlib_f32 fscale = vis_to_float(0x80808080);
    /* loop variables */
    mlib_s32 i, j;
    /* store mask pattern: the alpha channel is not written */
    mlib_s32 emask = 0x7777;
    /* mask for the first (possibly partial) store of a row */
    mlib_s32 emask1;
    /* temporary buffer holding one realigned row of y */
    mlib_d64 *buf;
    /* destination advance after the first store of a row (0 or 1) */
    mlib_s32 inc;

    ufu = (mlib_u8 *)ffu;
    vfu = (mlib_u8 *)ffv;

    /*
     * initialize GSR scale factor
     */
    vis_write_gsr(3 << 3);

    /* one extra word so that a partial trailing group fits */
    buf = (mlib_d64 *)__mlib_malloc((width / 8 + 1) * sizeof (mlib_d64));

    if (buf == NULL)
        return (MLIB_FAILURE);

    sp1 = sl1 = (mlib_u8 *)y;
    sp2 = sl2 = (mlib_u8 *)u;
    sp3 = sl3 = (mlib_u8 *)v;
    dl = dp = (mlib_u8 *)abgr;

    /*
     * row loop
     */
    for (j = 0; j < height; j++) {
        /* copy this row of y into buf, realigned to 8 bytes */
        spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
        dpp = buf;
        dy0 = vis_ld_d64_nf(spy);
        spy++;
#pragma pipeloop(0)
        for (i = 0; i < width; i += 8) {
            dy1 = vis_ld_d64_nf(spy);
            spy++;
            (*dpp++) = vis_faligndata(dy0, dy1);
            dy0 = dy1;
        }

        /* from here on y is read from the aligned copy */
        spy = buf;
        /* last destination byte of this row */
        dend = dp + width * 4 - 1;
        emask1 = vis_edge8(dp, dend);
        dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
        /* i = destination misalignment within its 8-byte word */
        i = dp - (mlib_u8 *)dpp;
        /* shift the alpha-skip pattern to match that misalignment */
        emask >>= i;
        /* alignment used by every vis_faligndata below in this row */
        vis_alignaddr((void *)(8 - i), 0);
        /* advance after the first store only if the edge mask is partial */
        inc = (emask1 != 0xff);
        emask1 &= emask;

        /* load the first four u and v bytes of the row */
        ufu[0] = vis_ld_u8_nf(sp2);
        ufu[1] = vis_ld_u8_nf(sp2 + 1);
        ufu[2] = vis_ld_u8_nf(sp2 + 2);
        ufu[3] = vis_ld_u8_nf(sp2 + 3);
        vfu[0] = vis_ld_u8_nf(sp3);
        vfu[1] = vis_ld_u8_nf(sp3 + 1);
        vfu[2] = vis_ld_u8_nf(sp3 + 2);
        vfu[3] = vis_ld_u8_nf(sp3 + 3);
        sp2 += 4;
        sp3 += 4;
        fu = ffu[0];
        fv = ffv[0];

        /*
         * 16-pixel column loop (eight 8-byte stores per iteration). The
         * byte loads for the next iteration's chroma are interleaved
         * with the arithmetic.
         */
#pragma pipeloop(0)
        for (i = 0; i <= width - 16; i += 16) {
            dy1 = (*spy++);
            dy2 = (*spy++);
            /* chroma contributions plus offsets for blue, green and red */
            du0 = vis_fmul8x16al(fu, f1);
            db = vis_fpadd16(du0, doff0);
            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);
            dv2 = vis_fmul8x16al(fv, f8);
            dr = vis_fpadd16(dv2, doff2);
            /* scaled luma for the four groups of four pixels */
            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
            ufu[0] = vis_ld_u8_nf(sp2);
            ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
            ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);
            ufu[1] = vis_ld_u8_nf(sp2 + 1);
            /* blue */
            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);
            ufu[2] = vis_ld_u8_nf(sp2 + 2);
            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);
            ufu[3] = vis_ld_u8_nf(sp2 + 3);
            db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
            db3 = vis_fpadd16(ddy3, db3);
            vfu[0] = vis_ld_u8_nf(sp3);
            db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
            db4 = vis_fpadd16(ddy4, db4);
            vfu[1] = vis_ld_u8_nf(sp3 + 1);
            /* green */
            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);
            vfu[2] = vis_ld_u8_nf(sp3 + 2);
            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);
            vfu[3] = vis_ld_u8_nf(sp3 + 3);
            dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
            dg3 = vis_fpadd16(ddy3, dg3);
            dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
            dg4 = vis_fpadd16(ddy4, dg4);
            /* red */
            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);
            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);
            dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
            dr3 = vis_fpadd16(ddy3, dr3);
            dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
            dr4 = vis_fpadd16(ddy4, dr4);
            /* pack each channel back to bytes (8 pixels per register) */
            dr = vis_fpack16_pair(dr1, dr2);
            dr1 = vis_fpack16_pair(dr3, dr4);
            dg = vis_fpack16_pair(dg1, dg2);
            dg1 = vis_fpack16_pair(dg3, dg4);
            db = vis_fpack16_pair(db1, db2);
            db1 = vis_fpack16_pair(db3, db4);
            /*
             * Interleave the channels. The slot that lands on the alpha
             * byte is filled with blue and masked out by emask.
             */
            dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
            dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));
            dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dd = vis_faligndata(dd0, dd1);
            /* first store of the row uses the edge mask emask1 */
            vis_pst_8(dd, dpp, emask1);
            dpp += inc;
            inc = 1;
            dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            dd = vis_faligndata(dd1, dd0);
            vis_pst_8(dd, dpp++, emask);
            dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
            dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
            dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dd = vis_faligndata(dd0, dd1);
            vis_pst_8(dd, dpp++, emask);
            dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            dd = vis_faligndata(dd1, dd0);
            vis_pst_8(dd, dpp++, emask);
            dg2 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dg1));
            dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
            dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dd = vis_faligndata(dd0, dd1);
            vis_pst_8(dd, dpp++, emask);
            dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            dd = vis_faligndata(dd1, dd0);
            vis_pst_8(dd, dpp++, emask);
            dg2 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dg1));
            dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
            dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dd = vis_faligndata(dd0, dd1);
            vis_pst_8(dd, dpp++, emask);
            dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            dd = vis_faligndata(dd1, dd0);
            vis_pst_8(dd, dpp++, emask);
            /* pick up the chroma bytes loaded during this iteration */
            fu = ffu[0];
            fv = ffv[0];
            sp2 += 4;
            sp3 += 4;
            emask1 = emask;
        }

        /*
         * Tail, part 1: at least 8 pixels remain. Uses the first two
         * chroma samples of fu/fv, then moves the third sample into the
         * first slot for part 2.
         */
        if (i <= width - 8) {
            dy1 = (*spy++);
            du0 = vis_fmul8x16al(fu, f1);
            db = vis_fpadd16(du0, doff0);
            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);
            dv2 = vis_fmul8x16al(fv, f8);
            dr = vis_fpadd16(dv2, doff2);
            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
            ufu[0] = ufu[2];
            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);
            vfu[0] = vfu[2];
            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);
            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);
            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);
            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);
            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);
            dr = vis_fpack16_pair(dr1, dr2);
            dg = vis_fpack16_pair(dg1, dg2);
            db = vis_fpack16_pair(db1, db2);
            dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
            dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));
            dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dd = vis_faligndata(dd0, dd1);
            vis_pst_8(dd, dpp, emask1);
            dpp += inc;
            inc = 1;
            dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            dd = vis_faligndata(dd1, dd0);
            vis_pst_8(dd, dpp++, emask);
            dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
            dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
            dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dd = vis_faligndata(dd0, dd1);
            vis_pst_8(dd, dpp++, emask);
            dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            dd = vis_faligndata(dd1, dd0);
            vis_pst_8(dd, dpp++, emask);
            fu = ffu[0];
            fv = ffv[0];
            i += 8;
            emask1 = emask;
        }

        /*
         * Tail, part 2: fewer than 8 pixels remain. Only the first
         * chroma sample and the first four luma samples are converted.
         */
        if (i < width) {
            dy1 = vis_ld_d64_nf(spy);
            du0 = vis_fmul8x16al(fu, f1);
            db = vis_fpadd16(du0, doff0);
            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);
            dv2 = vis_fmul8x16al(fv, f8);
            dr = vis_fpadd16(dv2, doff2);
            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);
            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);
            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);
            /* fu is reused here as a temporary for the packed blue bytes */
            fu = vis_fpack16(db1);
            dg2 = vis_fpmerge(fu, vis_fpack16(dg1));
            dg3 = vis_fpmerge(fu, vis_fpack16(dr1));
            dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dd = vis_faligndata(dd0, dd1);
            vis_pst_8(dd, dpp, emask1);
            dpp += inc;
            dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            dd = vis_faligndata(dd1, dd0);
            vis_pst_8(dd, dpp++, emask);
        }

        /*
         * Flush the bytes still held in dd0/dd1, limited by the edge
         * mask computed against the last byte of the row.
         */
        emask1 = vis_edge8(dpp, dend);
        emask1 &= emask;
        dd = vis_faligndata(dd0, dd1);
        vis_pst_8(dd, dpp, emask1);

        /* advance all planes to the next row and reset the mask pattern */
        sp1 = sl1 = sl1 + y_stride;
        sp2 = sl2 = sl2 + uv_stride;
        sp3 = sl3 = sl3 + uv_stride;
        dl = dp = dl + abgr_stride;
        emask = 0x7777;
    }

    __mlib_free(buf);
    return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoColorJFIFYCC2RGB420_Nearest
 *
 * Converts two rows of luma (y0, y1) that share one row of chroma
 * (cb, cr) into two rows of packed 3-byte RGB (rgb0, rgb1). Every chroma
 * byte is duplicated with vis_fpmerge(f, f), so one Cb/Cr sample serves
 * two horizontally adjacent pixels in both rows.
 *
 *   rgb0, rgb1  destination rows, 3 bytes per pixel
 *   y0, y1      luma rows, n bytes each
 *   cb, cr      chroma rows
 *   n           number of pixels per row
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The code is software-pipelined: the prologue computes the first 8
 * pixels of both rows, and every loop iteration stores the previously
 * computed 8 pixels while computing the next 8.
 *
 * Naming note: the u_3920 / v_8132 / u_20184 / v_15966 / y_11644
 * identifiers do not reflect the coefficients used here; the multipliers
 * actually applied are the k12, k34 and k5 constants declared below.
 *
 * NOTE(review): all pointers are cast to mlib_d64 * / mlib_f32 * and the
 * main loop stores full 8-byte words, so the arguments are presumably
 * required to be 8-byte (y, rgb) and 4-byte (cb, cr) aligned -- confirm
 * against the public API description.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB420_Nearest(
    mlib_u8 *rgb0,
    mlib_u8 *rgb1,
    const mlib_u8 *y0,
    const mlib_u8 *y1,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 n)
{
    /* byte pointers to the destination rows (used by the tail) */
    mlib_u8 *dp1, *dp2;
    /* 8-byte pointers to the two luma rows */
    mlib_d64 *spy1, *spy2;
    /* 4-byte pointers to cb and cr */
    mlib_f32 *dfu, *dfv;
    /* cb, cr data */
    mlib_f32 fu, fv;
    /* luma data of row 0 and row 1 */
    mlib_d64 dy1, dy2;
    /* duplicated cb / cr samples */
    mlib_d64 du, dv;
    /* (1.00000, 1.40200)*8192 */
    mlib_f32 k12 = vis_to_float(0x20002cdd);
    /* (-.34414, -.71414)*8192 */
    mlib_f32 k34 = vis_to_float(0xf4fde926);
    /* low half: 1.77200*8192 */
    mlib_f32 k5 = vis_to_float(0x10038b4);
    /* (179.45600 - 0.5)*32 */
    mlib_d64 k_179_456 = vis_to_double(0x165f165f, 0x165f165f);
    /* (135.45984 + 0.5)*32 */
    mlib_d64 k_135_45984 = vis_to_double(0x10ff10ff, 0x10ff10ff);
    /* (226.81600 - 0.5)*32 */
    mlib_d64 k_226_816 = vis_to_double(0x1c4a1c4a, 0x1c4a1c4a);
    mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
    mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
    /* scaled luma of row 0 (y_*) and row 1 (z_*) */
    mlib_d64 y_11644_hi, y_11644_lo;
    mlib_d64 z_11644_hi, z_11644_lo;
    mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
    mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi, temp_b_lo;
    /* loop variable */
    mlib_s32 i;
    /* packed channels, shuffle temporaries and store pointer for row 0 */
    mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21;
    /* the same for row 1 */
    mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22;

    if (n <= 0)
        return (MLIB_FAILURE);

    /*
     * initialize GSR: scale factor 2, align offset 7
     */
    vis_write_gsr((2 << 3) + 7);

    dp1 = (mlib_u8 *)rgb0;
    dp2 = (mlib_u8 *)rgb1;
    ddp1 = (mlib_d64 *)dp1;
    ddp2 = (mlib_d64 *)dp2;
    spy1 = (mlib_d64 *)y0;
    spy2 = (mlib_d64 *)y1;
    dfu = (mlib_f32 *)cb;
    dfv = (mlib_f32 *)cr;

    /*
     * Prologue: compute the first 8 pixels of both rows.
     */
    fu = vis_ld_f32_nf(dfu);
    dfu++;
    fv = vis_ld_f32_nf(dfv);
    dfv++;
    du = vis_fpmerge(fu, fu);
    dv = vis_fpmerge(fv, fv);
    /* Cb * high half of k34 */
    u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
    /* Cr * low half of k34 */
    v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
    /* Cb * high half of k34 */
    u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
    /* Cr * low half of k34 */
    v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
    dy1 = vis_ld_d64_nf(spy1);
    spy1++;
    dy2 = vis_ld_d64_nf(spy2);
    spy2++;
    /* Cb * low half of k5 */
    u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
    g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
    u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
    g_hi = vis_fpadd16(g_hi, k_135_45984);
    /* Cr * low half of k12 */
    v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
    g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
    v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
    g_lo = vis_fpadd16(g_lo, k_135_45984);
    /* row 0 luma * high half of k12 */
    y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
    b_hi = vis_fpsub16(u_20184_hi, k_226_816);
    /* row 0 luma * high half of k12 */
    y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
    b_lo = vis_fpsub16(u_20184_lo, k_226_816);
    /* row 1 luma * high half of k12 */
    z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
    r_hi = vis_fpsub16(v_15966_hi, k_179_456);
    /* row 1 luma * high half of k12 */
    z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
    r_lo = vis_fpsub16(v_15966_lo, k_179_456);
    /* row 0: add luma to the chroma terms and pack each channel */
    temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
    temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
    green1 = vis_fpack16_to_hi(green1, temp_g_hi);
    temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
    blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
    temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
    /* chroma for the next 8 pixels */
    fu = vis_ld_f32_nf(dfu);
    dfu++;
    red1 = vis_fpack16_to_hi(red1, temp_r_hi);
    temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
    fv = vis_ld_f32_nf(dfv);
    dfv++;
    green1 = vis_fpack16_to_lo(green1, temp_g_lo);
    temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);
    blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
    du = vis_fpmerge(fu, fu);
    red1 = vis_fpack16_to_lo(red1, temp_r_lo);
    dv = vis_fpmerge(fv, fv);
    /* Cb * high half of k34 (next group) */
    u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
    /* row 1: same chroma terms, luma of the second row */
    temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);
    /* Cr * low half of k34 (next group) */
    v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
    temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);
    green2 = vis_fpack16_to_hi(green2, temp_g_hi);
    temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);
    blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
    temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);
    red2 = vis_fpack16_to_hi(red2, temp_r_hi);
    temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);
    green2 = vis_fpack16_to_lo(green2, temp_g_lo);
    temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);
    blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
    red2 = vis_fpack16_to_lo(red2, temp_r_lo);
    u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
    v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
    dy1 = vis_ld_d64_nf(spy1);
    spy1++;
    dy2 = vis_ld_d64_nf(spy2);
    spy2++;

    /*
     * Column loop: 8 pixels per row per iteration (three 8-byte stores
     * = 24 bytes per row).
     */
#pragma pipeloop(0)
    for (i = 0; i <= n - 8; i += 8) {
        /*
         * Store the 8 pixels computed in the previous step. The first
         * three shuffles combine red and green, the last three insert
         * blue and produce the three output words of each row.
         */
        vis_write_bmask(0x0801902A, 0);
        dd01 = vis_bshuffle(red1, green1);
        dd02 = vis_bshuffle(red2, green2);
        vis_write_bmask(0x03B04C05, 0);
        dd11 = vis_bshuffle(red1, green1);
        dd12 = vis_bshuffle(red2, green2);
        vis_write_bmask(0xD06E07F0, 0);
        dd21 = vis_bshuffle(red1, green1);
        dd22 = vis_bshuffle(red2, green2);
        vis_write_bmask(0x01834967, 0);
        ddp1[0] = vis_bshuffle(dd01, blue1);
        ddp2[0] = vis_bshuffle(dd02, blue2);
        vis_write_bmask(0xA12B45C7, 0);
        ddp1[1] = vis_bshuffle(dd11, blue1);
        ddp2[1] = vis_bshuffle(dd12, blue2);
        vis_write_bmask(0x0D23E56F, 0);
        ddp1[2] = vis_bshuffle(dd21, blue1);
        ddp2[2] = vis_bshuffle(dd22, blue2);

        /* Compute the next 8 pixels (same sequence as the prologue). */
        /* Cb * low half of k5 */
        u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
        g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
        u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
        g_hi = vis_fpadd16(g_hi, k_135_45984);
        /* Cr * low half of k12 */
        v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
        g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
        v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
        g_lo = vis_fpadd16(g_lo, k_135_45984);
        /* row 0 luma * high half of k12 */
        y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
        b_hi = vis_fpsub16(u_20184_hi, k_226_816);
        /* row 0 luma * high half of k12 */
        y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
        b_lo = vis_fpsub16(u_20184_lo, k_226_816);
        /* row 1 luma * high half of k12 */
        z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
        r_hi = vis_fpsub16(v_15966_hi, k_179_456);
        /* row 1 luma * high half of k12 */
        z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
        r_lo = vis_fpsub16(v_15966_lo, k_179_456);
        temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
        temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
        green1 = vis_fpack16_to_hi(green1, temp_g_hi);
        temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
        blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
        temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
        fu = vis_ld_f32_nf(dfu);
        dfu++;
        red1 = vis_fpack16_to_hi(red1, temp_r_hi);
        temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
        fv = vis_ld_f32_nf(dfv);
        dfv++;
        green1 = vis_fpack16_to_lo(green1, temp_g_lo);
        temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);
        blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
        du = vis_fpmerge(fu, fu);
        red1 = vis_fpack16_to_lo(red1, temp_r_lo);
        dv = vis_fpmerge(fv, fv);
        /* Cb * high half of k34 (next group) */
        u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
        temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);
        /* Cr * low half of k34 (next group) */
        v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
        temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);
        green2 = vis_fpack16_to_hi(green2, temp_g_hi);
        temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);
        blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
        temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);
        red2 = vis_fpack16_to_hi(red2, temp_r_hi);
        temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);
        green2 = vis_fpack16_to_lo(green2, temp_g_lo);
        temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);
        blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);
        red2 = vis_fpack16_to_lo(red2, temp_r_lo);
        u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
        v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
        dy1 = vis_ld_d64_nf(spy1);
        spy1++;
        dy2 = vis_ld_d64_nf(spy2);
        spy2++;
        ddp1 += 3;
        ddp2 += 3;
    }

    /*
     * Tail: the n - i remaining pixels are already computed and sit in
     * red/green/blue. The channel registers are rotated by n - i bytes,
     * the byte pointers are moved to the last pixel of each row, and the
     * pixels are stored back to front by the STORE_PIXEL1/STORE_PIXEL2
     * macros (defined elsewhere in this file) with GSR align set to 7.
     */
    dp1 = (mlib_u8 *)ddp1;
    dp2 = (mlib_u8 *)ddp2;

    vis_alignaddr((void *)(n - i), 0);
    blue1 = vis_faligndata(blue1, blue1);
    green1 = vis_faligndata(green1, green1);
    red1 = vis_faligndata(red1, red1);
    dp1 += ((n - i - 1) * 3);
    blue2 = vis_faligndata(blue2, blue2);
    green2 = vis_faligndata(green2, green2);
    red2 = vis_faligndata(red2, red2);
    dp2 += ((n - i - 1) * 3);

    vis_alignaddr((void *)7, 0);
    for (; i < n; i++) {
        STORE_PIXEL1(0, 1, 2);
        STORE_PIXEL2(0, 1, 2);
        dp1 -= 3;
        dp2 -= 3;
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorConvert_U8_S8_Sat
 *
 * Saturating conversion of a signed 8-bit vector to an unsigned 8-bit
 * vector: negative elements are stored as 0, all others are copied.
 *
 *   z    destination vector (mlib_u8)
 *   x    source vector (mlib_s8)
 *   n    number of elements
 *
 * Vectors shorter than 16 elements are handed to the PACK_S_U macro,
 * which is defined elsewhere in this file. The vector path returns
 * MLIB_SUCCESS.
 */
mlib_status
__mlib_VectorConvert_U8_S8_Sat(
    mlib_u8 *z,
    const mlib_s8 *x,
    mlib_s32 n)
{
    mlib_s8 *src = (void *)x;
    mlib_u8 *dst = z;
    mlib_d64 *dsrc, *ddst;
    /* in0: aligned input; w_prev/w_next: sliding window of raw words */
    mlib_d64 in0, w_prev, w_next;
    /* sh_a/sh_b: shuffled input; prod_hi/prod_lo: 16-bit products */
    mlib_d64 sh_a, sh_b, prod_hi, prod_lo;
    mlib_s32 n_dwords, vec_len, tail_len, length = n, i, off;
    mlib_s8 c;
    mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
    mlib_f32 zero = vis_fzeros();

    if (length < 16) {
        PACK_S_U(mlib_s8, mlib_u8);
    }

    /* Scalar prologue: run until the destination is 8-byte aligned. */
    while ((mlib_addr)dst & 7) {
        c = (*src++);
        (*dst++) = (c < 0) ? 0 : c;
        length--;
    }

    n_dwords = length >> 3;     /* full 8-byte groups */
    vec_len = n_dwords << 3;    /* elements handled by the vector code */
    tail_len = length & 7;      /* elements left for the scalar epilogue */
    ddst = (mlib_d64 *)dst;

    vis_write_gsr(7 << 3);

    if (((mlib_addr)src & 7) == 0) {
        /* Source and destination are both 8-byte aligned. */
        dsrc = (mlib_d64 *)src;

        /* Odd group count: handle one group up front. */
        i = n_dwords & 1;
        if (i) {
            in0 = (*dsrc++);
            prod_hi = vis_fmul8sux16(
                vis_fpmerge(vis_read_hi(in0), zero), four_16_ones);
            prod_lo = vis_fmul8sux16(
                vis_fpmerge(vis_read_lo(in0), zero), four_16_ones);
            (*ddst++) = vis_fpack16_pair(prod_hi, prod_lo);
        }

        /* Two groups per iteration. */
#pragma pipeloop(0)
#pragma unroll(4)
        for (; i < n_dwords; i += 2) {
            in0 = (*dsrc++);
            prod_hi = vis_fmul8sux16(
                vis_fpmerge(vis_read_hi(in0), zero), four_16_ones);
            prod_lo = vis_fmul8sux16(
                vis_fpmerge(vis_read_lo(in0), zero), four_16_ones);
            (*ddst++) = vis_fpack16_pair(prod_hi, prod_lo);

            in0 = (*dsrc++);
            prod_hi = vis_fmul8sux16(
                vis_fpmerge(vis_read_hi(in0), zero), four_16_ones);
            prod_lo = vis_fmul8sux16(
                vis_fpmerge(vis_read_lo(in0), zero), four_16_ones);
            (*ddst++) = vis_fpack16_pair(prod_hi, prod_lo);
        }
    } else {
        /*
         * Arbitrary source alignment: each group is assembled from two
         * consecutive aligned words with vis_bshuffle(); the byte mask
         * is derived from the source misalignment. GSR align is then
         * fixed at 1 for the vis_faligndata() calls below.
         */
        dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
        off = (mlib_addr)src & 7;
        vis_alignaddr((void *)0, 1);
        vis_write_bmask(0x11111111 * off, 0x04152637);
        w_next = (*dsrc++);

        /* Odd group count: handle one group up front. */
        i = n_dwords & 1;
        if (i) {
            w_prev = w_next;
            w_next = vis_ld_d64_nf(dsrc);
            dsrc++;
            sh_a = vis_bshuffle(w_prev, w_next);
            prod_hi = vis_fmul8sux16(sh_a, four_16_ones);
            sh_a = vis_faligndata(sh_a, sh_a);
            prod_lo = vis_fmul8sux16(sh_a, four_16_ones);
            (*ddst++) = vis_fpack16_pair(prod_hi, prod_lo);
        }

        /* Two groups per iteration. */
#pragma pipeloop(0)
#pragma unroll(4)
        for (; i < n_dwords; i += 2) {
            w_prev = w_next;
            w_next = vis_ld_d64_nf(dsrc);
            dsrc++;
            sh_a = vis_bshuffle(w_prev, w_next);

            w_prev = w_next;
            w_next = vis_ld_d64_nf(dsrc);
            dsrc++;
            sh_b = vis_bshuffle(w_prev, w_next);

            prod_hi = vis_fmul8sux16(sh_a, four_16_ones);
            sh_a = vis_faligndata(sh_a, sh_a);
            prod_lo = vis_fmul8sux16(sh_a, four_16_ones);
            (*ddst++) = vis_fpack16_pair(prod_hi, prod_lo);

            prod_hi = vis_fmul8sux16(sh_b, four_16_ones);
            sh_b = vis_faligndata(sh_b, sh_b);
            prod_lo = vis_fmul8sux16(sh_b, four_16_ones);
            (*ddst++) = vis_fpack16_pair(prod_hi, prod_lo);
        }
    }

    /* Scalar epilogue for the elements past the last full group. */
    for (i = 0; i < tail_len; i++) {
        c = src[vec_len + i];
        dst[vec_len + i] = (c < 0) ? 0 : c;
    }

    return (MLIB_SUCCESS);
}
void ADD_SUFF(ByteGrayToIntArgbConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 d0, d1, d2, d3; mlib_f32 ff, aa = vis_fones(); mlib_s32 i, j, x; if (width < 8) { for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; for (i = 0; i < width; i++) { x = src[i]; dst[i] = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } return; } if (srcScan == width && dstScan == 4*width) { width *= height; height = 1; } for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; mlib_s32 *dst_end; dst_end = dst + width; while (((mlib_s32)src & 3) && dst < dst_end) { x = *src++; *dst++ = Gray2Argb(x); } #pragma pipeloop(0) for (; dst <= (dst_end - 4); dst += 4) { ff = *(mlib_f32*)src; d0 = vis_fpmerge(aa, ff); d1 = vis_fpmerge(ff, ff); d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1)); d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1)); ((mlib_f32*)dst)[0] = vis_read_hi(d2); ((mlib_f32*)dst)[1] = vis_read_lo(d2); ((mlib_f32*)dst)[2] = vis_read_hi(d3); ((mlib_f32*)dst)[3] = vis_read_lo(d3); src += 4; } while (dst < dst_end) { x = *src++; *dst++ = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } }
/*
 * __mlib_VectorConvert_S8_S16_Sat
 *
 * Converts n signed 16-bit values read from x into signed 8-bit values
 * written to z.  The scalar head and tail loops clamp each value to
 * [MLIB_S8_MIN, MLIB_S8_MAX].
 *
 * Structure:
 *   1. n < 16 is handed to the PACK_S_S macro (defined outside this block;
 *      presumably a scalar clamp loop that returns -- TODO confirm).
 *   2. A scalar head loop runs until dst is 8-byte aligned.
 *   3. The bulk is processed 8 elements (two source words, one 64-bit
 *      store) at a time, with separate paths for an 8-byte aligned src and
 *      for any other src alignment.
 *   4. A scalar tail loop converts the last (length & 7) elements.
 *
 * The code visible here always ends with MLIB_SUCCESS.
 */
mlib_status
__mlib_VectorConvert_S8_S16_Sat(
    mlib_s8 *z,
    const mlib_s16 *x,
    mlib_s32 n)
{
    mlib_s16 *src = (void *)x;
    mlib_s8 *dst = z;
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d1, d2, d3, d4, d5, d6, d7;
    mlib_s32 len_64, even_length, rest_64, length = n, i;
    mlib_s16 c;

    if (n < 16) {
        PACK_S_S(mlib_s16, mlib_s8, MLIB_S8_MAX, MLIB_S8_MIN);
    }

    /*
     * Scalar head until dst is 8-byte aligned.  The address is inspected
     * through mlib_addr, the same way src is inspected further down,
     * rather than through a 32-bit integer that would truncate a 64-bit
     * pointer.
     */
    while ((mlib_addr)dst & 7) {
        c = (*src++);
        (*dst++) = c < MLIB_S8_MIN ? MLIB_S8_MIN :
            (c > MLIB_S8_MAX ? MLIB_S8_MAX : c);
        length--;
    }

    /* Split what is left into 8-element groups plus a scalar remainder. */
    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;
    ddst = (mlib_d64 *)dst;
    /*
     * One 64-bit GSR write: 0x082A4C6E goes to the upper 32 bits (used by
     * vis_bshuffle() below), 8 is placed at bit 3 and 2 in the low bits.
     * Presumably these are the bmask, the pack scale factor and the
     * faligndata offset -- confirm against the VIS GSR layout.
     */
    vis_write_gsr64(((mlib_u64)0x082A4C6E << 32) | (8 << 3) | 2);

    if (((mlib_addr)src & 7) == 0) {
        /* Source is 8-byte aligned: plain 64-bit loads. */
        dsrc = (mlib_d64 *)src;

        /* Peel one group when len_64 is odd; i ends up 0 or 1. */
        i = len_64 & 1;
        if (i) {
            d1 = (*dsrc++);
            d2 = (*dsrc++);
            d3 = vis_fpackfix_pair(d1, d2);
            d1 = vis_faligndata(d1, d1);
            d2 = vis_faligndata(d2, d2);
            d4 = vis_fpackfix_pair(d1, d2);
            (*ddst++) = vis_bshuffle(d3, d4);
        }

        /* Main loop: two 8-element groups per iteration. */
#pragma pipeloop(0)
#pragma unroll(2)
        for (; i < len_64; i += 2) {
            d1 = (*dsrc++);
            d2 = (*dsrc++);
            d3 = vis_fpackfix_pair(d1, d2);
            d1 = vis_faligndata(d1, d1);
            d2 = vis_faligndata(d2, d2);
            d4 = vis_fpackfix_pair(d1, d2);
            (*ddst++) = vis_bshuffle(d3, d4);
            d1 = (*dsrc++);
            d2 = (*dsrc++);
            d3 = vis_fpackfix_pair(d1, d2);
            d1 = vis_faligndata(d1, d1);
            d2 = vis_faligndata(d2, d2);
            d4 = vis_fpackfix_pair(d1, d2);
            (*ddst++) = vis_bshuffle(d3, d4);
        }
    } else {
        /*
         * Source has some other alignment.  vis_alignaddr() is called here
         * with the real source address, and vis_faligndata() is then used
         * only to realign the input words.  The element shuffling that the
         * aligned path does with vis_faligndata() is done here with
         * vis_fpack32()/vis_fpmerge() sequences instead.
         */
        dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
        d2 = (*dsrc++);

        /* Peel one group when len_64 is odd; i ends up 0 or 1. */
        i = len_64 & 1;
        if (i) {
            d1 = d2;
            d2 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d4 = vis_faligndata(d1, d2);
            d1 = d2;
            d2 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d5 = vis_faligndata(d1, d2);
            d3 = vis_fpackfix_pair(d4, d5);
            d4 = vis_fpack32(d4, d4);
            d4 = vis_fpack32(d4, d4);
            d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
            d5 = vis_fpmerge(vis_read_lo(d5), vis_read_hi(d5));
            d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
            d4 = vis_fpackfix_pair(d4, d5);
            (*ddst++) = vis_bshuffle(d3, d4);
        }

        /* Main loop: two 8-element groups (four source words) per pass. */
#pragma pipeloop(0)
#pragma unroll(2)
        for (; i < len_64; i += 2) {
            d1 = d2;
            d2 = (*dsrc++);
            d4 = vis_faligndata(d1, d2);
            d1 = d2;
            d2 = (*dsrc++);
            d5 = vis_faligndata(d1, d2);
            d1 = d2;
            d2 = (*dsrc++);
            d6 = vis_faligndata(d1, d2);
            d1 = d2;
            /* Last look-ahead word of the pass uses the _nf load. */
            d2 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d7 = vis_faligndata(d1, d2);
            d3 = vis_fpackfix_pair(d4, d5);
            d4 = vis_fpack32(d4, d4);
            d4 = vis_fpack32(d4, d4);
            d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
            d5 = vis_fpmerge(vis_read_lo(d5), vis_read_hi(d5));
            d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
            d4 = vis_fpackfix_pair(d4, d5);
            d5 = vis_fpackfix_pair(d6, d7);
            d6 = vis_fpack32(d6, d6);
            d6 = vis_fpack32(d6, d6);
            d7 = vis_fpmerge(vis_read_hi(d7), vis_read_lo(d7));
            d7 = vis_fpmerge(vis_read_lo(d7), vis_read_hi(d7));
            d7 = vis_fpmerge(vis_read_hi(d7), vis_read_lo(d7));
            d6 = vis_fpackfix_pair(d6, d7);
            (*ddst++) = vis_bshuffle(d3, d4);
            (*ddst++) = vis_bshuffle(d5, d6);
        }
    }

    /*
     * Scalar tail.  dst and src still point at the first element handled
     * by the vector code, so the tail starts at index even_length.
     */
    for (i = 0; i < rest_64; i++) {
        c = src[even_length + i];
        dst[even_length + i] = c < MLIB_S8_MIN ? MLIB_S8_MIN :
            (c > MLIB_S8_MAX ? MLIB_S8_MAX : c);
    }

    return (MLIB_SUCCESS);
}
void ADD_SUFF(UshortGrayToByteGrayConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_u8 *dst_end; mlib_d64 s0, s1, ss; mlib_s32 i, j; if (width <= 8) { for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_u8 *dst = dstBase; for (i = 0; i < width; i++) { dst[i] = src[2*i]; } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } return; } if (srcScan == 2*width && dstScan == width) { width *= height; height = 1; } for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_u8 *dst = dstBase; mlib_d64 *sp; dst_end = dst + width; while (((mlib_s32)dst & 3) && dst < dst_end) { *dst++ = *src; src += 2; } if ((mlib_s32)src & 7) { sp = vis_alignaddr(src, 0); s1 = *sp++; #pragma pipeloop(0) for (; dst <= (dst_end - 4); dst += 4) { s0 = s1; s1 = *sp++; ss = vis_faligndata(s0, s1); ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss)); ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss)); *(mlib_f32*)dst = vis_read_hi(ss); src += 2*4; } } else { #pragma pipeloop(0) for (; dst <= (dst_end - 4); dst += 4) { ss = *(mlib_d64*)src; ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss)); ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss)); *(mlib_f32*)dst = vis_read_hi(ss); src += 2*4; } } while (dst < dst_end) { *dst++ = *src; src += 2; } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } }
/*
 * __mlib_VectorConvert_U8_S32_Sat
 *
 * Converts n signed 32-bit values read from x into unsigned 8-bit values
 * written to z.  The scalar head and tail loops clamp each value to
 * [0, MLIB_U8_MAX] (the tail spells the lower bound MLIB_U8_MIN).
 *
 * Structure:
 *   1. n < 8 is handed to the PACK_S_S macro (defined outside this block;
 *      presumably a scalar clamp loop that returns -- TODO confirm).
 *   2. A scalar head loop runs until dst is 8-byte aligned.
 *   3. The bulk is processed 8 elements (four 64-bit source words, one
 *      64-bit store) per iteration, with separate paths for an 8-byte
 *      aligned src and for any other src alignment.
 *   4. A scalar tail loop converts the last (length & 7) elements.
 *
 * The code visible here always ends with MLIB_SUCCESS.
 */
mlib_status
__mlib_VectorConvert_U8_S32_Sat(
    mlib_u8 *z,
    const mlib_s32 *x,
    mlib_s32 n)
{
    mlib_s32 *src = (void *)x;
    mlib_u8 *dst = z;
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d0, d_tmp, d1, d2, d3, d4;
    mlib_s32 len_64, even_length, rest_64, length = n, i;
    mlib_s32 c;

    if (n < 8) {
        PACK_S_S(mlib_s32, mlib_u8, MLIB_U8_MAX, 0);
    }

    /* Scalar head: clamp and store until dst is 8-byte aligned. */
    while ((mlib_addr)dst & 7) {
        (*dst++) = (c = (*src++)) < 0 ? 0 : (c > MLIB_U8_MAX ?
            MLIB_U8_MAX : c);
        length--;
    }

    /* Split what is left into 8-element groups plus a scalar remainder. */
    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;
    ddst = (mlib_d64 *)dst;
    /*
     * Writes 23 at bit 3 of the GSR; presumably the scale factor used by
     * vis_fpack32() below -- confirm against the VIS GSR layout.
     */
    vis_write_gsr(23 << 3);

    /* Choose the path according to the source address alignment. */
    if (((mlib_addr)src & 7) == 0) {
        /* Source is 8-byte aligned as well: plain 64-bit loads. */
        dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < len_64; i++) {
            d1 = (*dsrc++);
            d2 = (*dsrc++);
            d3 = (*dsrc++);
            d4 = (*dsrc++);
            /*
             * Each vis_fpack32() takes the previous result as its first
             * operand, so after the fourth call d4 has accumulated
             * contributions from all four source words.  The order of
             * these four calls must not change.
             */
            d1 = vis_fpack32(d1, d1);
            d2 = vis_fpack32(d1, d2);
            d3 = vis_fpack32(d2, d3);
            d4 = vis_fpack32(d3, d4);
            /* Interleave the bytes of the two halves of d4 for the store. */
            (*ddst++) = vis_fpmerge(vis_read_hi(d4), vis_read_lo(d4));
        }
    } else {
        /*
         * Source has some other alignment: align it with
         * vis_alignaddr()/vis_faligndata().  d0 and d_tmp alternate as the
         * "previous" and "next" raw word of a sliding two-word window.
         */
        dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
        d0 = (*dsrc++);
#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < len_64; i++) {
            d_tmp = (*dsrc++);
            d1 = vis_faligndata(d0, d_tmp);
            d0 = (*dsrc++);
            d2 = vis_faligndata(d_tmp, d0);
            d_tmp = (*dsrc++);
            d3 = vis_faligndata(d0, d_tmp);
            /* Last look-ahead word of the pass uses the _nf load. */
            d0 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d4 = vis_faligndata(d_tmp, d0);
            /* Same order-dependent accumulation as in the aligned path. */
            d1 = vis_fpack32(d1, d1);
            d2 = vis_fpack32(d1, d2);
            d3 = vis_fpack32(d2, d3);
            d4 = vis_fpack32(d3, d4);
            (*ddst++) = vis_fpmerge(vis_read_hi(d4), vis_read_lo(d4));
        }
    }

    /*
     * Scalar tail.  dst and src still point at the first element handled
     * by the vector code, so the tail starts at index even_length.
     */
    for (i = 0; i < rest_64; i++) {
        c = src[even_length + i];
        dst[even_length + i] = c < MLIB_U8_MIN ? MLIB_U8_MIN : (c >
            MLIB_U8_MAX ? MLIB_U8_MAX : c);
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoColorJFIFYCC2RGB444
 *
 * Reads `size` samples from each of the planes y, cb and cr and writes
 * 3 * size bytes to rgb.
 *
 * Returns MLIB_FAILURE when size <= 0, MLIB_SUCCESS otherwise.
 *
 * The input is consumed in chunks of at most 2 * BUFF_SIZE samples
 * (BUFF_SIZE is defined outside this block).  Inside a chunk, four samples
 * of each plane are loaded as one 32-bit word and turned into three 32-bit
 * output words (12 bytes).  The last group of a chunk is written through
 * edge masks so that nothing beyond rgb + 3 * n - 1 is stored.
 *
 * NOTE(review): the k?? constants look like the JFIF coefficients
 * 1.402, -0.34414, -0.71414 and 1.772 scaled by 8192, arranged per output
 * lane, and the c_? constants look like the matching offsets -- confirm
 * against the mediaLib documentation before relying on this.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
    mlib_u8 *dend;
    mlib_f32 *sf0, *sf1, *sf2, *pfd;
    mlib_f32 fzero = vis_fzeros();
    mlib_s32 i, n, m, emask;
    /* 16-byte staging area for the masked stores of the last group. */
    mlib_d64 tmp_arr64[2];
    /* Multipliers applied to the cb word (k?1) and to the cr word (k?2). */
    mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
    mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
    mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
    mlib_d64 k12 = vis_to_double_dup(0xe9260000);
    mlib_d64 k21 = vis_to_double_dup(0x38b40000);
    mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
    /* Additive constants, one per output word of a group. */
    mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
    mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
    mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
    /* Multiplier applied to the y word. */
    mlib_d64 k_0 = vis_to_double_dup(0x20002000);

    if (size <= 0)
        return (MLIB_FAILURE);

    /*
     * GSR and bmask are written once and used by the pack, bshuffle and
     * (presumably) fpack32 operations in both the main loop and the final
     * group.
     */
    vis_write_gsr((2 << 3) + 2);
    vis_write_bmask(0x0489AB37, 0);

    do {
        /* loop on buffer size: n = samples handled in this chunk */
        if (size > 2 * BUFF_SIZE) {
            n = 2 * BUFF_SIZE;
        } else {
            n = size;
        }

        /*
         * m full groups of four samples go through the unmasked loop; the
         * remaining 1..4 samples are handled by the masked code below.
         */
        m = (n - 1) >> 2;
        sf0 = (mlib_f32 *)y;
        sf1 = (mlib_f32 *)cb;
        sf2 = (mlib_f32 *)cr;
        /* Address of the last byte this chunk is allowed to write. */
        dend = rgb + 3 * n - 1;
        pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < m; i++) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0;
            mlib_d64 d_0235, d_0145;
            mlib_f32 x0, x1, x2;

            /* Four samples of each plane. */
            x0 = (*sf0++);
            x1 = (*sf1++);
            x2 = (*sf2++);
            /* Products of each plane with its per-lane multipliers. */
            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);
            /* s?0 = y term + cb term + (cr term + constant). */
            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);
            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);
            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);
            /* Pack to bytes and reorder into three 32-bit output words. */
            d_0235 = vis_fpack16_pair(s00, s10);
            s20 = vis_freg_pair(vis_fpack16(s20), fzero);
            d_0145 = vis_bshuffle(d_0235, s20);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235));
            pfd[0] = vis_read_hi(d_0145);
            pfd[1] = vis_read_hi(d_0235);
            pfd[2] = vis_read_lo(d_0145);
            pfd += 3;
        }

        /*
         * last pixels: same arithmetic as above, but the result is staged
         * in tmp_arr64 and written with masked partial stores.
         */
        if ((mlib_u8 *)pfd <= dend) {
            mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0;
            mlib_d64 d_0235, d_xx14, d_0145;
            mlib_f32 x0, x1, x2;
            mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

            /* Full 32-bit loads even when fewer than four samples remain. */
            x0 = *sf0;
            x1 = *sf1;
            x2 = *sf2;
            s_0 = vis_fmul8x16(x0, k_0);
            s01 = vis_fmul8x16(x1, k01);
            s11 = vis_fmul8x16(x1, k11);
            s21 = vis_fmul8x16(x1, k21);
            s02 = vis_fmul8x16(x2, k02);
            s12 = vis_fmul8x16(x2, k12);
            s22 = vis_fmul8x16(x2, k22);
            s00 = vis_fpadd16(s_0, s01);
            s10 = vis_fpadd16(s_0, s11);
            s20 = vis_fpadd16(s_0, s21);
            s02 = vis_fpadd16(s02, c_0);
            s12 = vis_fpadd16(s12, c_1);
            s22 = vis_fpadd16(s22, c_2);
            s00 = vis_fpadd16(s00, s02);
            s10 = vis_fpadd16(s10, s12);
            s20 = vis_fpadd16(s20, s22);
            d_0235 = vis_fpack16_pair(s00, s10);
            d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero);
            d_0145 = vis_bshuffle(d_0235, d_xx14);
            d_0235 = vis_fpack32(d_0235, d_0235);
            d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235));

            /* Edge mask computed from the current pfd and dend. */
            emask = vis_edge8(pfd, dend);

            /*
             * When pfd is not 8-byte aligned, step it back one 32-bit word
             * and shift the staged data forward by one word so that the
             * 8-byte masked store below still lands on the same bytes.
             */
            if ((mlib_addr)pfd & 7) {
                pfd--;
                tmp_arr32++;
            }

            tmp_arr32[0] = vis_read_hi(d_0145);
            tmp_arr32[1] = vis_read_hi(d_0235);
            tmp_arr32[2] = vis_read_lo(d_0145);

            vis_pst_8(tmp_arr64[0], pfd, emask);

            /* Second 8-byte word, only if it still overlaps the output. */
            pfd += 2;
            emask = vis_edge8(pfd, dend);

            if ((mlib_u8 *)pfd <= dend)
                vis_pst_8(tmp_arr64[1], pfd, emask);
        }

        /* Advance all planes and the output past this chunk. */
        y += n;
        cb += n;
        cr += n;
        rgb += 3 * n;
        size -= n;
    } while (size);

    return (MLIB_SUCCESS);
}
/*
 * mlib_v_VideoYUV2ABGR_aarray_411
 *
 * Converts one run of pixels from separate y, u, v planes plus a per-pixel
 * alpha array into packed 32-bit output at abgr.
 *
 * Per loop iteration the code consumes two 64-bit words of y (16 bytes),
 * one 32-bit word each of u and v (4 bytes each) and two aligned 64-bit
 * words' worth of a_array, and writes eight 64-bit words (16 output
 * pixels) to abgr.
 *
 *   abgr    destination; an 8-byte aligned destination uses 64-bit stores,
 *           any other alignment uses 32-bit stores
 *   y,u,v   source planes, read through the given pointer types
 *   a_array alpha bytes; may have any alignment (realigned with
 *           vis_alignaddr()/vis_faligndata())
 *   count   number of full 16-pixel iterations
 *   left    number of extra 32-bit output pixels (taken from one more
 *           computed group) to store after the full iterations
 *   isrgb   non-zero swaps the coefficient/offset sets used for the first
 *           and third colour channel
 *
 * NOTE(review): this function does not write the GSR itself; the scale
 * factor used by vis_fpack16_pair() must have been set by the caller --
 * confirm.
 */
static void
mlib_v_VideoYUV2ABGR_aarray_411(
    mlib_u32 *abgr,
    const mlib_d64 *y,
    const mlib_f32 *u,
    const mlib_f32 *v,
    const mlib_d64 *a_array,
    mlib_s32 count,
    mlib_s32 left,
    mlib_s32 isrgb)
{
    /* all. pointer to dst */
    mlib_d64 *dpp = (mlib_d64 *)abgr;

    /* u, v data */
    mlib_f32 fu, fv;

    /* y data */
    mlib_d64 dy1, dy2;
    mlib_d64 ddy1, ddy2, ddy3, ddy4;
    mlib_d64 du0, du1;
    mlib_d64 dv1, dv2;
    mlib_d64 dr, dr1, dr2, dr3, dr4;
    mlib_d64 dg, dg1, dg2, dg3, dg4;
    mlib_d64 db, db1, db2, db3, db4;
    /* aligned alpha pointer and its sliding three-word window */
    mlib_d64 *dpa, da0, da1, da2, da3, da4;
    mlib_d64 dtmp;

    /* 1.1644 * 4096 */
    mlib_f32 f0 = vis_to_float(0x12a1);

    /* 2.0184 * 8192 */
    mlib_f32 f1 = vis_to_float(0x4097);

    /* -0.3920 * 8192 */
    mlib_f32 f4 = vis_to_float(0xf375);

    /* -0.8132 * 8192 */
    mlib_f32 f5 = vis_to_float(0xe5fa);

    /* 1.5966 * 8192 */
    mlib_f32 f8 = vis_to_float(0x3317);

    /* -276.9856 * 32 */
    mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

    /* 135.6352 * 32 */
    mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

    /* -222.9952 * 32 */
    mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
    mlib_f32 fscale = vis_to_float(0x80808080);

    /* loop variables */
    mlib_s32 i;

    /*
     * isrgb: exchange the u-side and v-side coefficients (f1 <-> f8,
     * f4 <-> f5) and the first/third offsets (doff0 <-> doff2), so the
     * channels computed as "db" and "dr" below swap roles.
     */
    if (isrgb) {
        f0 = vis_to_float(0x12a1);
        f1 = vis_to_float(0x3317);
        f4 = vis_to_float(0xe5fa);
        f5 = vis_to_float(0xf375);
        f8 = vis_to_float(0x4097);
        doff0 = vis_to_double_dup(0xe420e420);
        doff1 = vis_to_double_dup(0x10f410f4);
        doff2 = vis_to_double_dup(0xdd60dd60);
    }

    /*
     * Prime the pipeline: load the inputs of the first group.  After this
     * point y, u, v and dpa all point at the inputs of the SECOND group,
     * which is why the loops index them with i rather than i + 1.
     */
    dpa = vis_alignaddr((void *)a_array, 0);
    dy1 = (*y++);
    dy2 = vis_ld_d64_nf((mlib_d64 *)y);
    y++;
    fu = (*u++);
    fv = (*v++);
    da2 = (*dpa++);
    da3 = vis_ld_d64_nf(dpa);
    dpa++;
    da4 = vis_ld_d64_nf(dpa);
    dpa++;

    /* Chroma products for the first group. */
    du0 = vis_fmul8x16al(fu, f1);
    du1 = vis_fmul8x16al(fu, f4);
    dv1 = vis_fmul8x16al(fv, f5);
    dv2 = vis_fmul8x16al(fv, f8);

    if (!((mlib_addr)abgr & 7)) {
        /* Destination is 8-byte aligned: 64-bit stores. */
#pragma pipeloop(0)
        for (i = 0; i < count; i++) {
            /* Realign the 16 alpha bytes of this group. */
            da0 = vis_faligndata(da2, da3);
            da1 = vis_faligndata(da3, da4);
            /* Scaled luma, four 16-bit lanes per word. */
            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
            ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
            ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);
            /* Per-chroma-sample contribution of each channel + offset. */
            db = vis_fpadd16(du0, doff0);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);
            dr = vis_fpadd16(dv2, doff2);
            /*
             * Each of the four chroma lanes is spread over one word of
             * four pixels (au/al pick one 16-bit lane of the selected
             * half) and added to the luma of those four pixels.
             */
            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);
            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);
            db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
            db3 = vis_fpadd16(ddy3, db3);
            db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
            db4 = vis_fpadd16(ddy4, db4);
            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);
            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);
            dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
            dg3 = vis_fpadd16(ddy3, dg3);
            dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
            dg4 = vis_fpadd16(ddy4, dg4);
            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);
            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);
            dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
            dr3 = vis_fpadd16(ddy3, dr3);
            dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
            dr4 = vis_fpadd16(ddy4, dr4);
            /* Pack to bytes: (dr, dr1), (dg, dg1), (db, db1) = 16 pixels. */
            dr = vis_fpack16_pair(dr1, dr2);
            dr1 = vis_fpack16_pair(dr3, dr4);
            dg = vis_fpack16_pair(dg1, dg2);
            dg1 = vis_fpack16_pair(dg3, dg4);
            db = vis_fpack16_pair(db1, db2);
            db1 = vis_fpack16_pair(db3, db4);
            /* Start interleaving: (alpha, dg) and (db, dr) byte pairs. */
            dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
            dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));
            /* Fetch the inputs of the next group (non-faulting loads). */
            dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
            dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
            fu = vis_ld_f32_nf((mlib_f32 *)u + i);
            fv = vis_ld_f32_nf((mlib_f32 *)v + i);
            da2 = da4;
            da3 = vis_ld_d64_nf(dpa + 2 * i);
            da4 = vis_ld_d64_nf(dpa + 2 * i + 1);
            /* Finish interleaving and store pixels 0..3. */
            dpp[8 * i] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dpp[8 * i + 1] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            /* Pixels 4..7. */
            dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
            dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
            dpp[8 * i + 2] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dpp[8 * i + 3] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            /* Pixels 8..11. */
            dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
            dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
            dpp[8 * i + 4] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dpp[8 * i + 5] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            /* Pixels 12..15. */
            dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
            dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
            dpp[8 * i + 6] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dpp[8 * i + 7] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            /* Chroma products for the next group. */
            du0 = vis_fmul8x16al(fu, f1);
            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dv2 = vis_fmul8x16al(fv, f8);
        }
    } else {
        /*
         * Destination is not 8-byte aligned: identical arithmetic, but
         * every 64-bit result is written as two 32-bit halves.
         */
        mlib_d64 dd;

#pragma pipeloop(0)
        for (i = 0; i < count; i++) {
            da0 = vis_faligndata(da2, da3);
            da1 = vis_faligndata(da3, da4);
            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
            ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
            ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);
            db = vis_fpadd16(du0, doff0);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);
            dr = vis_fpadd16(dv2, doff2);
            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);
            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);
            db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
            db3 = vis_fpadd16(ddy3, db3);
            db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
            db4 = vis_fpadd16(ddy4, db4);
            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);
            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);
            dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
            dg3 = vis_fpadd16(ddy3, dg3);
            dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
            dg4 = vis_fpadd16(ddy4, dg4);
            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);
            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);
            dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
            dr3 = vis_fpadd16(ddy3, dr3);
            dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
            dr4 = vis_fpadd16(ddy4, dr4);
            dr = vis_fpack16_pair(dr1, dr2);
            dr1 = vis_fpack16_pair(dr3, dr4);
            dg = vis_fpack16_pair(dg1, dg2);
            dg1 = vis_fpack16_pair(dg3, dg4);
            db = vis_fpack16_pair(db1, db2);
            db1 = vis_fpack16_pair(db3, db4);
            dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
            dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));
            /* Fetch the inputs of the next group (non-faulting loads). */
            dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
            dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
            fu = vis_ld_f32_nf((mlib_f32 *)u + i);
            fv = vis_ld_f32_nf((mlib_f32 *)v + i);
            da2 = da4;
            da3 = vis_ld_d64_nf(dpa + 2 * i);
            da4 = vis_ld_d64_nf(dpa + 2 * i + 1);
            /* Pixels 0..3 as four 32-bit stores. */
            dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            ((mlib_f32 *)dpp)[16 * i] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 1] = vis_read_lo(dd);
            dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            ((mlib_f32 *)dpp)[16 * i + 2] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 3] = vis_read_lo(dd);
            /* Pixels 4..7. */
            dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
            dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
            dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            ((mlib_f32 *)dpp)[16 * i + 4] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 5] = vis_read_lo(dd);
            dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            ((mlib_f32 *)dpp)[16 * i + 6] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 7] = vis_read_lo(dd);
            /* Pixels 8..11. */
            dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
            dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
            dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            ((mlib_f32 *)dpp)[16 * i + 8] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 9] = vis_read_lo(dd);
            dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            ((mlib_f32 *)dpp)[16 * i + 10] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 11] = vis_read_lo(dd);
            /* Pixels 12..15. */
            dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
            dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
            dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            ((mlib_f32 *)dpp)[16 * i + 12] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 13] = vis_read_lo(dd);
            dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            ((mlib_f32 *)dpp)[16 * i + 14] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 15] = vis_read_lo(dd);
            /* Chroma products for the next group. */
            du0 = vis_fmul8x16al(fu, f1);
            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dv2 = vis_fmul8x16al(fv, f8);
        }
    }

    /*
     * Remainder: compute one more full group into a local buffer (its
     * inputs were already loaded by the priming code or by the last loop
     * iteration) and copy only the first `left` 32-bit pixels out.
     */
    if (left) {
        mlib_d64 res_buf[8];

        da0 = vis_faligndata(da2, da3);
        da1 = vis_faligndata(da3, da4);
        ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
        ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
        ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
        ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);
        db = vis_fpadd16(du0, doff0);
        dtmp = vis_fpadd16(du1, dv1);
        dg = vis_fpadd16(dtmp, doff1);
        dr = vis_fpadd16(dv2, doff2);
        db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
        db1 = vis_fpadd16(ddy1, db1);
        db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
        db2 = vis_fpadd16(ddy2, db2);
        db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
        db3 = vis_fpadd16(ddy3, db3);
        db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
        db4 = vis_fpadd16(ddy4, db4);
        dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
        dg1 = vis_fpadd16(ddy1, dg1);
        dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
        dg2 = vis_fpadd16(ddy2, dg2);
        dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
        dg3 = vis_fpadd16(ddy3, dg3);
        dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
        dg4 = vis_fpadd16(ddy4, dg4);
        dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
        dr1 = vis_fpadd16(ddy1, dr1);
        dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
        dr2 = vis_fpadd16(ddy2, dr2);
        dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
        dr3 = vis_fpadd16(ddy3, dr3);
        dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
        dr4 = vis_fpadd16(ddy4, dr4);
        dr = vis_fpack16_pair(dr1, dr2);
        dr1 = vis_fpack16_pair(dr3, dr4);
        dg = vis_fpack16_pair(dg1, dg2);
        dg1 = vis_fpack16_pair(dg3, dg4);
        db = vis_fpack16_pair(db1, db2);
        db1 = vis_fpack16_pair(db3, db4);
        dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
        dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));
        res_buf[0] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
        res_buf[1] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
        dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
        dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
        res_buf[2] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
        res_buf[3] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
        dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
        dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
        res_buf[4] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
        res_buf[5] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
        dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
        dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
        res_buf[6] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
        res_buf[7] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

        /* Copy the requested number of 32-bit pixels after the full groups. */
        for (i = 0; i < left; i++)
            ((mlib_f32 *)dpp)[16 * count + i] = ((mlib_f32 *)res_buf)[i];
    }
}
/*
 * __mlib_VectorConvert_S16_U8_Mod
 *
 * Widens n unsigned 8-bit values read from x into signed 16-bit values
 * written to z (the scalar loops are a plain element-wise assignment).
 *
 * Structure:
 *   1. length < 16 is handed to the EXPAND macro (defined outside this
 *      block; presumably a scalar copy loop that returns -- TODO confirm).
 *   2. A scalar head loop runs until dst is 8-byte aligned.
 *   3. The bulk is processed 8 elements (one source word, two 64-bit
 *      stores) at a time, with separate paths for an 8-byte aligned src
 *      and for any other src alignment.
 *   4. A scalar tail loop converts the last (length & 7) elements.
 *
 * The code visible here always ends with MLIB_SUCCESS.
 */
mlib_status
__mlib_VectorConvert_S16_U8_Mod(
    mlib_s16 *z,
    const mlib_u8 *x,
    mlib_s32 n)
{
    mlib_s32 i;
    const mlib_u8 *src = x;
    mlib_s16 *dst = z;
    mlib_d64 *ddsrc, *ddst;
    mlib_s32 len_64, even_length, rest_64, length = n;
    mlib_f32 fzero = vis_fzeros();
    mlib_d64 dd1, dd2, dd3, dd4;
    /*
     * Multiplier 0x100 for vis_fmul8x16al().  The main loops widen the
     * high half of each word with this multiply and the low half with
     * vis_fpmerge(fzero, ...); presumably both yield the same
     * zero-extended values and the mix is there to balance functional
     * units -- confirm.
     */
    mlib_f32 fm = vis_to_float(0x100);

    if (length < 16) {
        EXPAND(mlib_u8, mlib_s16);
    }

    /* Scalar head until the destination is 8-byte aligned. */
    while ((mlib_addr)dst & 7) {
        (*dst++) = (*src++);
        length--;
    }

    ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
    ddst = (mlib_d64 *)dst;
    /* Split what is left into 8-element groups plus a scalar remainder. */
    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;
    /* First raw source word; only the unaligned path below consumes it. */
    dd2 = ddsrc[0];

    if (!((mlib_addr)src & 7)) {

        /*
         * Both vectors are 64-bit aligned. We can process without
         * vis_faligndata.
         * Peel one iteration when len_64 is odd, then loop with step 2.
         */
        if (i = (len_64 & 1)) {
            dd1 = (*ddsrc++);
            (*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd1));
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
        }
#pragma pipeloop(1)
#pragma unroll(1)
        for (; i < len_64; i += 2) {
            dd1 = (*ddsrc++);
            dd2 = (*ddsrc++);
            (*ddst++) = vis_fmul8x16al(vis_read_hi(dd1), fm);
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
            (*ddst++) = vis_fmul8x16al(vis_read_hi(dd2), fm);
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd2));
        }
    } else {

        /*
         * Source vector is not 64-bit aligned. Use vis_faligndata.
         * Peel one iteration when len_64 is odd, then loop with step 2.
         *
         * Here i is the index (relative to ddsrc) of the next raw word to
         * load, not the number of groups done; it starts at 1 because
         * word 0 is already in dd2.  That is why the loop condition is
         * "i <= len_64" rather than "i < len_64".
         */
        i = 1;

        if (len_64 & 1) {
            dd1 = dd2;
            dd2 = vis_ld_d64_nf(ddsrc + 1);
            i++;
            dd3 = vis_faligndata(dd1, dd2);
            (*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd3));
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
        }
#pragma pipeloop(0)
#pragma unroll(2)
        for (; i <= len_64; i += 2) {
            dd1 = dd2;
            dd2 = vis_ld_d64_nf(ddsrc + i);
            dd3 = vis_faligndata(dd1, dd2);
            dd1 = dd2;
            dd2 = vis_ld_d64_nf(ddsrc + i + 1);
            dd4 = vis_faligndata(dd1, dd2);
            (*ddst++) = vis_fmul8x16al(vis_read_hi(dd3), fm);
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
            (*ddst++) = vis_fmul8x16al(vis_read_hi(dd4), fm);
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd4));
        }
    }

    /*
     * Scalar tail.  dst and src still point at the first element handled
     * by the vector code, so the tail starts at index even_length.
     */
    for (i = 0; i < rest_64; i++)
        dst[even_length + i] = src[even_length + i];

    return (MLIB_SUCCESS);
}
/*
 * mlib_MatrixMul_S8xS8
 *
 * Multiplies the m-by-l matrix x by the l-by-n matrix y (both row-major,
 * element type STYPE) and stores the m-by-n result in z.
 *
 *   dst_type == 0  : results are stored as bytes, clamp range S8
 *   dst_type != 0  : results are stored as mlib_s16, clamp range S16
 *
 * Returns MLIB_FAILURE unless m, l and n are all positive.  If the
 * scratch buffer cannot be allocated, the work is delegated to
 * mlib_MatrixMul_type() and its status is returned.  Otherwise returns
 * MLIB_SUCCESS.
 *
 * Scratch layout (in mlib_d64 units): n transposed columns of y, each l4
 * words of four 16-bit entries, followed by one expanded row of x.  The
 * on-stack array is used when it is large enough, otherwise the buffer is
 * heap-allocated and freed before returning.
 *
 * The LOAD, MUL, SUM and SATUR macros are defined outside this block.
 * They presumably operate on the locals px, py0, py1, x0, x1, y0, y1,
 * dr0..dr3, ds0, ds1, vmin and vmax by name -- TODO confirm before
 * renaming or removing any of those variables.
 *
 * NOTE(review): the fallback handles dst_type == 2 (mode_Mod), but the
 * main path only distinguishes zero / non-zero dst_type and always applies
 * SATUR; confirm whether dst_type == 2 can reach this function.
 */
static mlib_status
mlib_MatrixMul_S8xS8(
    void *z,
    const STYPE * x,
    const STYPE * y,
    mlib_s32 m,
    mlib_s32 l,
    mlib_s32 n,
    mlib_s32 dst_type)
{
    mlib_d64 *px, *buff_x, *buff_y, *pbuff_x, *pbuff_y;
    mlib_d64 array[MAX_SIZE];
    mlib_d64 xx, x0, x1, y0, y1, ds0, ds1, dr0, dr1, dr2, dr3;
    mlib_s32 size, i, j, k, l4;
    mlib_s32 vmin, vmax;

    if (!((m > 0) && (l > 0) && (n > 0))) {
        return (MLIB_FAILURE);
    }

    /* Clamp range for the selected destination type. */
    if (!dst_type) {
        vmin = MLIB_S8_MIN;
        vmax = MLIB_S8_MAX;
    } else {
        /* if (dst_type == 1) */
        vmin = MLIB_S16_MIN;
        vmax = MLIB_S16_MAX;
    }

    /* l4 = number of 64-bit words holding l 16-bit entries (rounded up). */
    l4 = (l + 3) / 4;
    size = l4 * n + l4 + 3;

    if (size <= MAX_SIZE) {
        buff_y = array;
    } else {
        buff_y = (mlib_d64 *)__mlib_malloc(size * sizeof (mlib_d64));

        if (buff_y == NULL) {
            /* No scratch memory: fall back to the generic routine. */
            mlib_s32 type_z, mode;

            if (!dst_type) {
                type_z = type_S8;
                mode = mode_Sat;
            } else if (dst_type == 1) {
                type_z = type_S16;
                mode = mode_Sat;
            } else {
                /* if (dst_type == 2) */
                type_z = type_S16;
                mode = mode_Mod;
            }

            return mlib_MatrixMul_type(type_S8, type_z, mode, x, y, m, l, n,
                n, z);
        }
    }

    /* The expanded x row lives right after the transposed y columns. */
    buff_x = buff_y + l4 * n;

    pbuff_y = buff_y;

    /*
     * transpose y matrix: column i of y becomes a contiguous run of 16-bit
     * entries, zero-padded up to a multiple of four.
     */
    for (i = 0; i < n; i++) {
        STYPE *py = (STYPE *) y + i;
        mlib_s16 *pp = (mlib_s16 *)pbuff_y;

        /* Four rows at a time, written as two 32-bit words. */
        for (j = 0; j <= (l - 4); j += 4) {
            ((mlib_s32 *)pp)[0] = (py[0] << 16) | (py[n] & 0xFFFF);
            ((mlib_s32 *)pp)[1] = (py[2 * n] << 16) | (py[3 * n] & 0xFFFF);
            py += 4 * n;
            pp += 4;
        }

        /* Remaining rows one by one. */
        for (; j < l; j++) {
            (*pp++) = *py;
            py += n;
        }

        /* Zero padding. */
        for (; j < 4 * l4; j++) {
            (*pp++) = 0;
        }

        pbuff_y += l4;
    }

    /* One pass per row of x / row of z. */
    for (j = 0; j < m; j++) {
        pbuff_x = buff_x;
        pbuff_y = buff_y;

        /*
         * copy x line: row j starts at byte offset j * l.  Every source
         * byte is merged with itself, so each 8-byte word of x expands to
         * two 64-bit words in the scratch row.
         */
        px = vis_alignaddr((void *)x, j * l);
        x1 = vis_ld_d64_nf(px);
        px++;

        for (i = 0; i < (l + 7) / 8; i++) {
            x0 = x1;
            x1 = vis_ld_d64_nf(px);
            px++;
            xx = vis_faligndata(x0, x1);
            pbuff_x[2 * i] = vis_fpmerge(vis_read_hi(xx), vis_read_hi(xx));
            pbuff_x[2 * i + 1] = vis_fpmerge(vis_read_lo(xx),
                vis_read_lo(xx));
        }

        /* loop on y lines: two output columns per iteration */
        for (i = 0; i < n; i += 2) {
            /* This px shadows the outer px for the macros below. */
            mlib_d64 *px = pbuff_x;
            mlib_d64 *py0 = pbuff_y;
            /* For an odd last column, py1 simply repeats py0. */
            mlib_d64 *py1 = (i + 1 < n) ? (py0 + l4) : py0;
            mlib_s32 s0, s1;

            ds0 = ds1 = vis_fzero();

            /* Pipeline prologue, then l4 accumulate steps. */
            LOAD;
            MUL;
            LOAD;
#pragma pipeloop(0)
            for (k = 0; k < l4; k++) {
                SUM;
                MUL;
                LOAD;
            }

            /* Fold the two 32-bit partial sums of column i and store. */
            s0 = ((mlib_s32 *)&ds0)[0] + ((mlib_s32 *)&ds0)[1];
            SATUR(s0);

            if (dst_type) {
                ((mlib_s16 *)z)[i] = s0;
            } else {
                ((mlib_u8 *)z)[i] = s0;
            }

            /* Same for column i + 1 when it exists. */
            if (i + 1 < n) {
                s1 = ((mlib_s32 *)&ds1)[0] + ((mlib_s32 *)&ds1)[1];
                SATUR(s1);

                if (dst_type) {
                    ((mlib_s16 *)z)[i + 1] = s1;
                } else {
                    ((mlib_u8 *)z)[i + 1] = s1;
                }
            }

            pbuff_y += 2 * l4;
        }

        /* Advance z by one output row (n elements of 1 or 2 bytes). */
        z = (mlib_u8 *)z + ((dst_type) ? (2 * n) : n);
    }

    /* Release the scratch buffer if it came from the heap. */
    if (size > MAX_SIZE) {
        __mlib_free(buff_y);
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorConvert_S16_S8_Mod
 *
 * Widens n signed 8-bit values read from x into signed 16-bit values
 * written to z (the scalar loops are a plain element-wise assignment).
 *
 * Structure:
 *   1. length < 16 is handed to the EXPAND macro (defined outside this
 *      block; presumably a scalar copy loop that returns -- TODO confirm).
 *   2. A scalar head loop runs until dst is 8-byte aligned.
 *   3. The bulk is processed 8 elements (one source word, two 64-bit
 *      stores) at a time, with separate paths for an 8-byte aligned src
 *      and for any other src alignment.
 *   4. A scalar tail loop converts the last (length & 7) elements.
 *
 * The code visible here always ends with MLIB_SUCCESS.
 */
mlib_status
__mlib_VectorConvert_S16_S8_Mod(
    mlib_s16 *z,
    const mlib_s8 *x,
    mlib_s32 n)
{
    mlib_s32 i;
    const mlib_s8 *src = x;
    mlib_s16 *dst = z;
    mlib_d64 *ddsrc, *ddst;
    /* 0x0100 replicated into all four 16-bit lanes. */
    mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
    mlib_f32 fzero = vis_fzeros();
    mlib_s32 len_64, even_length, rest_64, length = n, off;
    mlib_d64 dd0, dd1, dd2, dd4, dd5, dd6, dd7;

    if (length < 16) {
        EXPAND(mlib_s8, mlib_s16);
    }

    /* Scalar head until the destination is 8-byte aligned. */
    while ((mlib_addr)dst & 7) {
        (*dst++) = (*src++);
        length--;
    }

    ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
    ddst = (mlib_d64 *)dst;
    /* Split what is left into 8-element groups plus a scalar remainder. */
    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;
    /* First raw source word; only the unaligned path below consumes it. */
    dd2 = ddsrc[0];
    /* Byte offset of src inside its 8-byte word. */
    off = (mlib_addr)src & 7;

    if (!off) {

        /*
         * Both vectors are 64-bit aligned.
         *
         * Each source byte is placed in the upper byte of a 16-bit lane
         * (vis_fpmerge with zero) and then multiplied by 0x0100 with
         * vis_fmul8sux16().
         */

        /* Peel one iteration when len_64 is odd; i ends up 0 or 1. */
        if (i = (len_64 & 1)) {
            dd1 = (*ddsrc++);
            (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1), fzero),
                four_16_ones);
            (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1), fzero),
                four_16_ones);
        }
#pragma pipeloop(0)
#pragma unroll(4)
        for (; i < len_64; i += 2) {
            dd1 = (*ddsrc++);
            dd2 = (*ddsrc++);
            (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1), fzero),
                four_16_ones);
            (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1), fzero),
                four_16_ones);
            (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd2), fzero),
                four_16_ones);
            (*ddst++) = vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd2), fzero),
                four_16_ones);
        }
    } else {

        /*
         * Source vector is not 64-bit aligned.
         * Peel one iteration when len_64 is odd, then loop with step 2.
         *
         * The bmask written here depends on off, so vis_bshuffle() both
         * realigns and reorders the source bytes.  vis_alignaddr(0, 1)
         * sets the state for the vis_faligndata(v, v) calls below, which
         * produce the second half of each group from the shuffled word.
         *
         * i is the index (relative to ddsrc) of the next raw word to load;
         * it starts at 1 because word 0 is already in dd2, hence the
         * "i <= len_64" loop condition.
         */
        vis_alignaddr((void *)0, 1);
        vis_write_bmask(0x11111111 * off, 0x04152637);
        i = 1;

        if (len_64 & 1) {
            dd1 = dd2;
            dd2 = vis_ld_d64_nf(ddsrc + 1);
            i++;
            dd4 = vis_bshuffle(dd1, dd2);
            dd5 = vis_faligndata(dd4, dd4);
            (*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
            (*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
        }
#pragma pipeloop(0)
#pragma unroll(4)
        for (; i <= len_64; i += 2) {
            dd0 = dd2;
            dd1 = vis_ld_d64_nf(ddsrc + i);
            dd2 = vis_ld_d64_nf(ddsrc + i + 1);
            dd4 = vis_bshuffle(dd0, dd1);
            dd6 = vis_bshuffle(dd1, dd2);
            dd5 = vis_faligndata(dd4, dd4);
            dd7 = vis_faligndata(dd6, dd6);
            (*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
            (*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
            (*ddst++) = vis_fmul8sux16(dd6, four_16_ones);
            (*ddst++) = vis_fmul8sux16(dd7, four_16_ones);
        }
    }

    /*
     * Scalar tail.  dst and src still point at the first element handled
     * by the vector code, so the tail starts at index even_length.
     */
    for (i = 0; i < rest_64; i++)
        dst[even_length + i] = src[even_length + i];

    return (MLIB_SUCCESS);
}
/*
 * __mlib_MatrixMul_S16_S8_Mod
 *
 * Multiplies the m-by-l matrix x by the l-by-n matrix y (both row-major,
 * element type STYPE) and stores the m-by-n result in z as 16-bit values.
 * Results are stored by plain assignment to mlib_s16, i.e. without
 * clamping.
 *
 * Returns MLIB_FAILURE unless m, l and n are all positive.  If the
 * scratch buffer cannot be allocated, the work is delegated to
 * mlib_MatrixMul_type() and its status is returned.  Otherwise returns
 * MLIB_SUCCESS.
 *
 * Method: the bytes of y are stored transposed with their top bit flipped
 * (y ^ 0x80, i.e. y + 128 as an unsigned byte).  Each dot product
 * therefore comes out too large by 128 * sum(x row), which is subtracted
 * again when the result is stored.
 *
 * Scratch layout (in mlib_d64 units): n transposed columns of y, each l8
 * words of eight bytes, followed by one expanded row of x (2 * l8 words)
 * plus slack.  The on-stack array is used when it is large enough,
 * otherwise the buffer is heap-allocated and freed before returning.
 *
 * The LOAD, MUL and SUM macros are defined outside this block.  They
 * presumably operate on the locals px, py0, py1, x0, x1, y0, y1, dr0..dr3,
 * ds0 and ds1 by name -- TODO confirm before renaming or removing any of
 * those variables.
 */
mlib_status
__mlib_MatrixMul_S16_S8_Mod(
    mlib_s16 *z,
    const STYPE * x,
    const STYPE * y,
    mlib_s32 m,
    mlib_s32 l,
    mlib_s32 n)
{
    mlib_d64 *px, *buff_x, *buff_y, *pbuff_x, *pbuff_y;
    mlib_d64 array[MAX_SIZE];
    mlib_d64 xx, x0, x1, y0, y1, ds0, ds1, dr0, dr1, dr2, dr3;
    mlib_s32 size, i, j, k, l8;

    if (!((m > 0) && (l > 0) && (n > 0))) {
        return (MLIB_FAILURE);
    }

    /* l8 = number of 64-bit words holding l bytes (rounded up). */
    l8 = (l + 7) / 8;
    size = l8 * n + 2 * l8 + 4;

    if (size <= MAX_SIZE) {
        buff_y = array;
    } else {
        buff_y = (mlib_d64 *)__mlib_malloc(size * sizeof (mlib_d64));

        if (buff_y == NULL) {
            /*
             * No scratch memory: fall back to the generic routine.  The
             * operands are signed 8-bit, the destination is 16-bit and the
             * overflow mode is Mod, so those are the types that must be
             * passed on.  (Passing U8/U8/Sat would make the generic
             * routine misread the inputs and write byte-sized results
             * into the 16-bit z buffer.)
             */
            return mlib_MatrixMul_type(type_S8, type_S16, mode_Mod,
                x, y, m, l, n, n, z);
        }
    }

    /* The expanded x row lives right after the transposed y columns. */
    buff_x = buff_y + l8 * n;

    pbuff_y = buff_y;

    /*
     * transpose y matrix: column i of y becomes a contiguous run of bytes
     * with the sign bit flipped, zero-padded up to a multiple of eight.
     */
    for (i = 0; i < n; i++) {
        mlib_u8 *py = (mlib_u8 *)y + i;
        mlib_u8 *pp = (mlib_u8 *)pbuff_y;

        /* Four rows at a time, written as two 16-bit words. */
        for (j = 0; j <= (l - 4); j += 4) {
            ((mlib_s16 *)pp)[0] = ((py[0] << 8) | py[n]) ^ 0x8080;
            ((mlib_s16 *)pp)[1] = ((py[2 * n] << 8) | py[3 * n]) ^ 0x8080;
            py += 4 * n;
            pp += 4;
        }

        /* Remaining rows one by one. */
        for (; j < l; j++) {
            (*pp++) = *py ^ 0x80;
            py += n;
        }

        /* Zero padding. */
        for (; j < 8 * l8; j++) {
            (*pp++) = 0;
        }

        pbuff_y += l8;
    }

    /* One pass per row of x / row of z. */
    for (j = 0; j < m; j++) {
        mlib_s32 x_sum = 0;

        /* Correction term for the +128 bias applied to y: 128 * sum(x). */
        for (i = 0; i < l; i++) {
            x_sum += x[i];
        }

        /*
         * Multiply instead of shifting: x_sum may be negative, and
         * left-shifting a negative value is undefined in C.
         */
        x_sum *= 128;

        pbuff_x = buff_x;
        pbuff_y = buff_y;

        /*
         * copy x line: every source byte is merged with a zero byte, so
         * each 8-byte word of x expands to two 64-bit scratch words.
         */
        px = vis_alignaddr((void *)x, 0);
        x1 = vis_ld_d64_nf(px);
        px++;
        xx = 0;
        for (i = 0; i < l8; i++) {
            x0 = x1;
            x1 = vis_ld_d64_nf(px);
            px++;
            xx = vis_faligndata(x0, x1);
            pbuff_x[2 * i] = vis_fpmerge(vis_read_hi(xx), vis_fzeros());
            pbuff_x[2 * i + 1] = vis_fpmerge(vis_read_lo(xx), vis_fzeros());
        }

        /* loop on y lines: two output columns per iteration */
        for (i = 0; i < n; i += 2) {
            /* This px shadows the outer px for the macros below. */
            mlib_d64 *px = pbuff_x;
            mlib_d64 *py0 = pbuff_y;
            /* For an odd last column, py1 simply repeats py0. */
            mlib_d64 *py1 = (i + 1 < n) ? (py0 + l8) : py0;

            ds0 = ds1 = vis_fzero();

            /* Pipeline prologue, then l8 accumulate steps. */
            LOAD;
            MUL;
            LOAD;
#pragma pipeloop(0)
            for (k = 0; k < l8; k++) {
                SUM;
                MUL;
                LOAD;
            }

            /*
             * Fold: the high half of ds0 receives the pairwise sums of the
             * column-i accumulator, the low half those of column i + 1.
             */
            ds0 = vis_freg_pair(
                vis_fpadd16s(vis_read_hi(ds0), vis_read_lo(ds0)),
                vis_fpadd16s(vis_read_hi(ds1), vis_read_lo(ds1)));

            /* Final horizontal add, minus the bias correction. */
            z[i] = ((mlib_s16 *)&ds0)[0] + ((mlib_s16 *)&ds0)[1] - x_sum;

            if (i + 1 < n) {
                z[i + 1] = ((mlib_s16 *)&ds0)[2] + ((mlib_s16 *)&ds0)[3] -
                    x_sum;
            }

            pbuff_y += 2 * l8;
        }

        /* Advance to the next output row and the next row of x. */
        z += n;
        x += l;
    }

    /* Release the scratch buffer if it came from the heap. */
    if (size > MAX_SIZE) {
        __mlib_free(buff_y);
    }

    return (MLIB_SUCCESS);
}
mlib_status mlib_ImageChannelMerge4_S16( mlib_s16 *dst_s16_0, const mlib_s16 *src_s16_0, const mlib_s16 *src_s16_1, const mlib_s16 *src_s16_2, const mlib_s16 *src_s16_3, mlib_s32 height, mlib_s32 width, mlib_s32 dst_stride, mlib_s32 src0_stride, mlib_s32 src1_stride, mlib_s32 src2_stride, mlib_s32 src3_stride) { mlib_s32 i, j, n = width << 2; mlib_s16 *fi_ptr, *se_ptr, *th_ptr, *fo_ptr; mlib_d64 *dp; for (j = 0; j < height; j++) { i = 0; if ((mlib_addr)(dst_s16_0 + i) & 7) { dst_s16_0[i++] = src_s16_0[0]; if ((mlib_addr)(dst_s16_0 + i) & 7) { dst_s16_0[i++] = src_s16_1[0]; if ((mlib_addr)(dst_s16_0 + i) & 7) { dst_s16_0[i++] = src_s16_2[0]; } } } if (i == 0) { fi_ptr = (mlib_s16 *)src_s16_0; se_ptr = (mlib_s16 *)src_s16_1; th_ptr = (mlib_s16 *)src_s16_2; fo_ptr = (mlib_s16 *)src_s16_3; } else if (i == 1) { fi_ptr = (mlib_s16 *)src_s16_1; se_ptr = (mlib_s16 *)src_s16_2; th_ptr = (mlib_s16 *)src_s16_3; fo_ptr = (mlib_s16 *)(src_s16_0 + 1); } else if (i == 2) { fi_ptr = (mlib_s16 *)src_s16_2; se_ptr = (mlib_s16 *)src_s16_3; th_ptr = (mlib_s16 *)(src_s16_0 + 1); fo_ptr = (mlib_s16 *)(src_s16_1 + 1); } else if (i == 3) { fi_ptr = (mlib_s16 *)src_s16_3; se_ptr = (mlib_s16 *)(src_s16_0 + 1); th_ptr = (mlib_s16 *)(src_s16_1 + 1); fo_ptr = (mlib_s16 *)(src_s16_2 + 1); } dp = (mlib_d64 *)(dst_s16_0 + i); if ((n - i) > 16) { if (((mlib_addr)fi_ptr & 7) || ((mlib_addr)se_ptr & 7) || ((mlib_addr)th_ptr & 7) || ((mlib_addr)fo_ptr & 7)) { mlib_d64 sd0, sd1, sd2, sd3; mlib_d64 dd0, dd1, dd2, dd3, dr02, dr13; mlib_d64 s0h, s0l, s1h, s1l, s2h, s2l, s3h, s3l; mlib_d64 *sp0; mlib_d64 *sp1; mlib_d64 *sp2; mlib_d64 *sp3; sp0 = VIS_ALIGNADDR(fi_ptr, 0); s0h = (*sp0++); sp1 = VIS_ALIGNADDR(se_ptr, 0); s1h = (*sp1++); sp2 = VIS_ALIGNADDR(th_ptr, 0); s2h = (*sp2++); sp3 = VIS_ALIGNADDR(fo_ptr, 0); s3h = (*sp3++); #pragma pipeloop(0) for (; i < (n - 15); i += 16) { s0l = vis_ld_d64_nf(sp0); sp0++; s1l = vis_ld_d64_nf(sp1); sp1++; s2l = vis_ld_d64_nf(sp2); sp2++; s3l = vis_ld_d64_nf(sp3); 
sp3++; VIS_ALIGNADDR(fi_ptr, 0); sd0 = vis_faligndata(s0h, s0l); VIS_ALIGNADDR(se_ptr, 0); sd1 = vis_faligndata(s1h, s1l); VIS_ALIGNADDR(th_ptr, 0); sd2 = vis_faligndata(s2h, s2l); VIS_ALIGNADDR(fo_ptr, 0); sd3 = vis_faligndata(s3h, s3l); dr02 = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); dr13 = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); dd0 = vis_fpmerge(vis_read_hi(dr02), vis_read_hi(dr13)); dp[0] = vis_fpmerge(vis_read_hi(dd0), vis_read_lo(dd0)); dd1 = vis_fpmerge(vis_read_lo(dr02), vis_read_lo(dr13)); dp[1] = vis_fpmerge(vis_read_hi(dd1), vis_read_lo(dd1)); dr02 = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); dr13 = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); dd2 = vis_fpmerge(vis_read_hi(dr02), vis_read_hi(dr13)); dp[2] = vis_fpmerge(vis_read_hi(dd2), vis_read_lo(dd2)); dd3 = vis_fpmerge(vis_read_lo(dr02), vis_read_lo(dr13)); dp[3] = vis_fpmerge(vis_read_hi(dd3), vis_read_lo(dd3)); dp += 4; s0h = s0l; s1h = s1l; s2h = s2l; s3h = s3l; fi_ptr += 4; se_ptr += 4; th_ptr += 4; fo_ptr += 4; } } else { mlib_d64 sd0, sd1, sd2, sd3; mlib_d64 dd0, dd1, dd2, dd3, dr02, dr13; #pragma pipeloop(0) for (; i < (n - 15); i += 16) { sd0 = ((mlib_d64 *)fi_ptr)[0]; sd1 = ((mlib_d64 *)se_ptr)[0]; sd2 = ((mlib_d64 *)th_ptr)[0]; sd3 = ((mlib_d64 *)fo_ptr)[0]; dr02 = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd2)); dr13 = vis_fpmerge(vis_read_hi(sd1), vis_read_hi(sd3)); dd0 = vis_fpmerge(vis_read_hi(dr02), vis_read_hi(dr13)); dp[0] = vis_fpmerge(vis_read_hi(dd0), vis_read_lo(dd0)); dd1 = vis_fpmerge(vis_read_lo(dr02), vis_read_lo(dr13)); dp[1] = vis_fpmerge(vis_read_hi(dd1), vis_read_lo(dd1)); dr02 = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd2)); dr13 = vis_fpmerge(vis_read_lo(sd1), vis_read_lo(sd3)); dd2 = vis_fpmerge(vis_read_hi(dr02), vis_read_hi(dr13)); dp[2] = vis_fpmerge(vis_read_hi(dd2), vis_read_lo(dd2)); dd3 = vis_fpmerge(vis_read_lo(dr02), vis_read_lo(dr13)); dp[3] = vis_fpmerge(vis_read_hi(dd3), vis_read_lo(dd3)); dp += 4; fi_ptr += 4; se_ptr += 4; 
th_ptr += 4; fo_ptr += 4; } } } #pragma pipeloop(0) for (; i < (n - 3); i += 4) { dst_s16_0[i + 0] = (*fi_ptr++); dst_s16_0[i + 1] = (*se_ptr++); dst_s16_0[i + 2] = (*th_ptr++); dst_s16_0[i + 3] = (*fo_ptr++); } if (i < (n - 2)) { dst_s16_0[i + 0] = *fi_ptr; dst_s16_0[i + 1] = *se_ptr; dst_s16_0[i + 2] = *th_ptr; } else if (i < (n - 1)) { dst_s16_0[i + 0] = *fi_ptr; dst_s16_0[i + 1] = *se_ptr; } else if (i < n) { dst_s16_0[i + 0] = *fi_ptr; } dst_s16_0 += dst_stride; src_s16_0 += src0_stride; src_s16_1 += src1_stride; src_s16_2 += src2_stride; src_s16_3 += src3_stride; } return (MLIB_SUCCESS); }
void __mlib_VideoColorYUV444seq_to_UYVY422int( mlib_u32 *uyvy, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 w, mlib_s32 h, mlib_s32 dlb, mlib_s32 slb) { mlib_s32 i, j2, val_y0, val_y1, val_u0, val_v0, count, left; dlb >>= 2; w >>= 1; if (w == 0 || h == 0) return; count = w >> 2; left = w - (count << 2); vis_write_gsr(6 << 3); for (i = 0; i < h; i++, y += slb, u += slb, v += slb, uyvy += dlb) { if ((((mlib_addr)u | (mlib_addr)v | (mlib_addr)y | (mlib_addr) uyvy) & 7) == 0) { mlib_d64 w_y, w_u, w_v, w_uv, w_tmp0, w_tmp1, w_acc0, w_acc1; mlib_f32 v_one = vis_to_float(0x1000000); mlib_f32 v_u, v_v; mlib_s32 j; #pragma pipeloop(0) for (j = 0; j < count; j++) { w_y = ((mlib_d64 *)y)[j]; w_u = ((mlib_d64 *)u)[j]; w_v = ((mlib_d64 *)v)[j]; w_tmp0 = vis_fpmerge(vis_read_hi(w_u), vis_read_lo(w_u)); w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0)); w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one); w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one); v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1)); w_tmp0 = vis_fpmerge(vis_read_hi(w_v), vis_read_lo(w_v)); w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0)); w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one); w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one); v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1)); w_uv = vis_fpmerge(v_u, v_v); ((mlib_d64 *)uyvy)[2 * j] = VIS_FPMERGE_HI(w_uv, w_y); ((mlib_d64 *)uyvy)[2 * j + 1] = VIS_FPMERGE_LO(w_uv, w_y); } if (left) { mlib_d64 res_buf[2]; w_y = ((mlib_d64 *)y)[count]; w_u = ((mlib_d64 *)u)[count]; w_v = ((mlib_d64 *)v)[count]; w_tmp0 = vis_fpmerge(vis_read_hi(w_u), vis_read_lo(w_u)); w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0)); w_acc0 = vis_fmul8x16au(vis_read_hi(w_tmp1), v_one); w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one); v_u = vis_fpack16(vis_fpadd16(w_acc0, w_acc1)); w_tmp0 = vis_fpmerge(vis_read_hi(w_v), vis_read_lo(w_v)); w_tmp1 = vis_fpmerge(vis_read_hi(w_tmp0), vis_read_lo(w_tmp0)); w_acc0 = 
vis_fmul8x16au(vis_read_hi(w_tmp1), v_one); w_acc1 = vis_fmul8x16au(vis_read_lo(w_tmp1), v_one); v_v = vis_fpack16(vis_fpadd16(w_acc0, w_acc1)); w_uv = vis_fpmerge(v_u, v_v); res_buf[0] = VIS_FPMERGE_HI(w_uv, w_y); res_buf[1] = VIS_FPMERGE_LO(w_uv, w_y); for (j = 0; j < left; j++) { ((mlib_f32 *)uyvy)[4 * count + j] = ((mlib_f32 *)res_buf)[j]; } } } else { #pragma pipeloop(0) for (j2 = 0; j2 < w; j2++) { mlib_s32 j = 2 * j2; mlib_s32 j1 = j + 1; val_y0 = y[j]; val_y1 = y[j1]; val_u0 = (u[j] + u[j1]) >> 1; val_v0 = (v[j] + v[j1]) >> 1; uyvy[j2] = (val_u0 << 24) | (val_y0 << 16) | (val_v0 << 8) | val_y1; } } } }