/*
 * __mlib_VideoInterpAveX_U8_U8_8x16
 *
 * Fixed-size member of the "InterpAveX" family; the 8x16 in the name and
 * the (16) passed to both helper macros indicate an 8-byte-wide block of
 * 16 rows.
 *
 *   curr_block    destination block; accessed through an mlib_d64 pointer,
 *                 so it is assumed to be 8-byte aligned - TODO confirm
 *   ref_block     reference block; any byte alignment (handled with
 *                 vis_alignaddr / the GSR align field)
 *   frame_stride  not referenced directly in this body
 *   field_stride  not referenced directly in this body
 *
 * Always returns MLIB_SUCCESS.
 *
 * NOTE(review): all per-row work lives in MLIB_V_VIDEOCOPY8 and
 * MLIB_V_VIDEOINTERPAVG8, which are defined outside this block.  They are
 * presumably the users of y, dd, ss0, sp1, sp2, s1hi, s1lo, s2hi, s2lo, s2,
 * mthree, fzero, fexpd2 and of the stride parameters; do not rename or
 * remove any of these without checking the macro definitions.
 */
mlib_status
__mlib_VideoInterpAveX_U8_U8_8x16(
	mlib_u8 *curr_block,
	const mlib_u8 *ref_block,
	mlib_s32 frame_stride,
	mlib_s32 field_stride)
{
	mlib_s32 y;
	mlib_d64 *dd, ss0[16], *sp1, *sp2, s1hi, s1lo, s2hi, s2lo, s2;
	mlib_d64 mthree = vis_fone();
	mlib_f32 fzero = vis_fzeros();
	mlib_f32 fexpd2 = vis_to_float(0x1000200);

	/*
	 * Three times the vis_fone() value, summed per 16-bit lane
	 * (presumably -3 in each lane if vis_fone() is all-ones - confirm).
	 */
	mthree = vis_fpadd16(mthree, vis_fpadd16(mthree, mthree));

	dd = (mlib_d64 *)curr_block;
	/* aligned base of ref_block; also loads its byte offset into the GSR */
	sp1 = (mlib_d64 *)vis_alignaddr((void *)ref_block, 0);

	/* first pass: 16 rows, presumably gathered into ss0[] - see macro */
#pragma pipeloop(0)
	MLIB_V_VIDEOCOPY8(16);

	/*
	 * Second pass works on ref_block + 1 (the right-hand neighbour):
	 * GSR gets scale field 5 and the byte offset of ref_block + 1, and
	 * sp2 is the 8-byte aligned address containing ref_block + 1.
	 */
	vis_write_gsr((5 << 3) + ((mlib_s32)(ref_block + 1) & 7));
	sp2 = (mlib_d64 *)((mlib_addr)(ref_block + 1) & ~7);

#pragma pipeloop(0)
	MLIB_V_VIDEOINTERPAVG8(16);

	return (MLIB_SUCCESS);
}
static void FUNC( m4) ( FUNC_M_ARG) { mlib_s32 i; mlib_d64 k0 = pkern[0]; mlib_d64 k1 = pkern[1]; mlib_d64 k2 = pkern[2]; mlib_d64 k3 = pkern[3]; mlib_d64 a0, a1, a2, a3, sum; mlib_d64 *perror = vis_alignaddr(perror1, 0); a0 = (*perror++); a1 = (*perror++); a2 = (*perror++); for (i = 0; i < sw; i++) { a3 = (*perror++); sum = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0)); sum = vis_fpadd16(sum, FMUL_16x16(k1, a1)); sum = vis_fpadd16(sum, FMUL_16x16(k2, a2)); buffd[i] = vis_fpadd16(sum, FMUL_16x16(k3, a3)); a0 = a1; a1 = a2; a2 = a3; } }
/*
 * Affine table-filter line worker, variant "8nw_3_2_1".
 *
 * Produces `size` mlib_s16 results, stored starting at `buff`.  The main
 * loop handles two output positions per iteration and stores one mlib_d64
 * (res1) through buff1; the tail loop handles a remaining single position
 * and stores 16 bits at a time through dstPixelPtr.
 *
 * The tail loop shows the arithmetic explicitly: three source rows
 * (row00, row10, row20) are each weighted by one 16-bit coefficient taken
 * from yFilter, the weighted rows are summed, the sum is multiplied by
 * xFilter (vis_fmul8sux16 + vis_fmul8ulx16 pair), rescaled with vis_scale,
 * and the two 32-bit halves are added to form the stored value.
 *
 * NOTE(review): size, i, buff1, res, res1, sPtr, fx0, xFilter, yFilter,
 * row00, row10, v0..v3, sum and dstPixelPtr are not declared in this body;
 * presumably DECLAREVAR / DECLAREVAR2 declare them, and the CALC_, LOAD_,
 * FILTER_ and MAKE_ macros use them (and filterX, filterY, lineAddr, ws) by
 * name.  Confirm against the macro definitions before editing.
 */
void
mlib_v_ImageAffineTableLine_8nw_3_2_1(
	mlib_d64 *buff,
	const mlib_d64 *filterX,
	const mlib_d64 *filterY,
	const mlib_u8 **lineAddr,
	mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;
	mlib_d64 yFilter2;
	mlib_d64 yFilter3;
	mlib_d64 row20, row30;
	mlib_d64 *dpSrc;
	mlib_d64 data0, data1, zero;

	/* 64-bit GSR: 0x0145ABEF in the upper word, 4 in the lower word */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;
	zero = vis_to_double_dup(0);

	/* two output positions per iteration */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_3x2;
		FILTER_MERGE_4x2;
		MAKE_4x2;
		*buff1 = res1;
		buff1++;
	}

	/* continue 16-bit stores where the 64-bit stores stopped */
	dstPixelPtr = (mlib_s16 *)buff1;

	/* at most one leftover position */
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, yFilter);
		xFilter = vis_write_hi(xFilter, fx0);
		LOAD_PIXEL_3;
		/* vertical pass: three rows, three coefficients of yFilter */
		v0 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter));
		v1 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter));
		sum = vis_fpadd16(v0, v1);
		v0 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter));
		sum = vis_fpadd16(v0, sum);
		/* horizontal pass: 16x16 multiply split into sux/ulx halves */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_hi(v3));
		res = vis_write_lo(res,
		    vis_fpadd32s(vis_read_hi(v2), vis_read_lo(v2)));
		vis_st_u16(res, dstPixelPtr++);
	}
}
/*
 * __mlib_VideoDCT4x4_S16_S16
 *
 * 4x4 forward DCT on 16-bit data: a 4-point butterfly is applied to the four
 * 8-byte rows of blk, the intermediate is transposed, the same butterfly is
 * applied again, every result is multiplied by the third table constant and
 * the block is transposed once more on its way into coeff.
 *
 *   coeff  output, written as four mlib_d64 words
 *   blk    input, read as four mlib_d64 words
 *
 * Both pointers are accessed as mlib_d64, so they are assumed to be 8-byte
 * aligned - TODO confirm.  Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoDCT4x4_S16_S16(
	mlib_s16 *coeff,
	const mlib_s16 *blk)
{
	/* three multipliers taken from the project table mlib_dct4vtab */
	mlib_f32 rot_a = ((mlib_f32 *)mlib_dct4vtab)[0];
	mlib_f32 rot_b = ((mlib_f32 *)mlib_dct4vtab)[1];
	mlib_f32 norm_c = ((mlib_f32 *)mlib_dct4vtab)[2];
	mlib_d64 *src = (mlib_d64 *)blk;
	mlib_d64 *dst = (mlib_d64 *)coeff;
	mlib_d64 blk0, blk1, blk2, blk3;
	mlib_d64 sum_outer, sum_inner, dif_outer, dif_inner;
	mlib_d64 odd_plus, odd_minus;
	mlib_d64 coef0, coef1, coef2, coef3;
	mlib_d64 tcol0, tcol1, tcol2, tcol3;

	/* byte-shuffle mask, presumably consumed by TRANSPOSE_VIS2 */
	vis_write_bmask(0x018923ab, 0x0);

	blk0 = src[0];
	blk1 = src[1];
	blk2 = src[2];
	blk3 = src[3];

	/* first 1-D pass */
	sum_outer = vis_fpadd16(blk0, blk3);
	dif_outer = vis_fpsub16(blk0, blk3);
	sum_inner = vis_fpadd16(blk1, blk2);
	dif_inner = vis_fpsub16(blk1, blk2);

	odd_plus = vis_fpadd16(dif_outer, dif_inner);
	odd_minus = vis_fpsub16(dif_outer, dif_inner);

	coef0 = vis_fpadd16(sum_outer, sum_inner);
	coef1 = vis_fpadd16(vis_fmul8x16(rot_a, odd_plus),
	    vis_fmul8x16(rot_b, odd_minus));
	coef2 = vis_fpsub16(sum_outer, sum_inner);
	coef3 = vis_fpsub16(vis_fmul8x16(rot_a, odd_minus),
	    vis_fmul8x16(rot_b, odd_plus));

	TRANSPOSE_VIS2(coef0, coef1, coef2, coef3,
	    tcol0, tcol1, tcol2, tcol3);

	/* second 1-D pass on the transposed data */
	sum_outer = vis_fpadd16(tcol0, tcol3);
	dif_outer = vis_fpsub16(tcol0, tcol3);
	sum_inner = vis_fpadd16(tcol1, tcol2);
	dif_inner = vis_fpsub16(tcol1, tcol2);

	odd_plus = vis_fpadd16(dif_outer, dif_inner);
	odd_minus = vis_fpsub16(dif_outer, dif_inner);

	coef0 = vis_fpadd16(sum_outer, sum_inner);
	coef1 = vis_fpadd16(vis_fmul8x16(rot_a, odd_plus),
	    vis_fmul8x16(rot_b, odd_minus));
	coef2 = vis_fpsub16(sum_outer, sum_inner);
	coef3 = vis_fpsub16(vis_fmul8x16(rot_a, odd_minus),
	    vis_fmul8x16(rot_b, odd_plus));

	/* final multiply of every coefficient by the third constant */
	coef0 = vis_fmul8x16(norm_c, coef0);
	coef1 = vis_fmul8x16(norm_c, coef1);
	coef2 = vis_fmul8x16(norm_c, coef2);
	coef3 = vis_fmul8x16(norm_c, coef3);

	TRANSPOSE_VIS2(coef0, coef1, coef2, coef3,
	    dst[0], dst[1], dst[2], dst[3]);

	return (MLIB_SUCCESS);
}
/*
 * Affine table-filter line worker, variant "8nw_2_2_1".
 *
 * Produces `size` mlib_s16 results, stored starting at `buff`.  The main
 * loop handles two output positions per iteration (one mlib_d64 store
 * through buff1); the tail loop handles a leftover position with a 16-bit
 * store through dstPixelPtr.
 *
 * The tail loop shows the arithmetic explicitly: two horizontally adjacent
 * bytes are fetched from each of two rows (sPtr and sPtr + srcStride), the
 * rows are weighted by the two 16-bit halves of fy0 and summed, the sum is
 * multiplied by xFilter, rescaled with vis_scale, and the two 32-bit halves
 * are added to form the stored value.
 *
 * NOTE(review): size, i, buff1, res, res1, sPtr, srcStride, fx0, fy0,
 * xFilter, row00, row10, v0..v3, sum and dstPixelPtr are not declared in
 * this body; presumably DECLAREVAR / DECLAREVAR2 declare them, and the
 * macros use them (and filterX, filterY, lineAddr, ws) by name.  Confirm
 * against the macro definitions before editing.
 */
void
mlib_v_ImageAffineTableLine_8nw_2_2_1(
	mlib_d64 *buff,
	const mlib_d64 *filterX,
	const mlib_d64 *filterY,
	const mlib_u8 **lineAddr,
	mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;

	/* 64-bit GSR: 0x0145ABEF in the upper word, 4 in the lower word */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;

	/* two output positions per iteration */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_2x2(row00, row10);
		FILTER_MERGE;
		MAKE_2x2;
		*buff1 = res1;
		buff1++;
	}

	/* continue 16-bit stores where the 64-bit stores stopped */
	dstPixelPtr = (mlib_s16 *)buff1;

	/* at most one leftover position */
#pragma pipeloop(0)
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, fy0);
		xFilter = vis_write_lo(xFilter, fx0);
		/* pair up the two neighbouring bytes of each row */
		row00 = vis_fpmerge(LD_U8(sPtr, 0), LD_U8(sPtr, 1));
		row10 = vis_fpmerge(LD_U8(sPtr, srcStride),
		    LD_U8(sPtr, srcStride + 1));
		/* vertical pass: upper half of fy0 for row 0, lower for row 1 */
		v0 = vis_fmul8x16au(vis_read_lo(row00), fy0);
		v1 = vis_fmul8x16al(vis_read_lo(row10), fy0);
		sum = vis_fpadd16(v0, v1);
		/* horizontal pass: 16x16 multiply split into sux/ulx halves */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_lo(v3));
		res = vis_write_lo(res,
		    vis_fpadd32s(vis_read_hi(v2), vis_read_lo(v2)));
		vis_st_u16(res, dstPixelPtr++);
	}
}
/*
 * __mlib_VideoDownSample422
 *
 * Halves the horizontal resolution of one line: every output byte is the
 * average of two neighbouring input bytes.  The rounding bias alternates
 * 0, 1, 0, 1, ... from one output to the next ("1=>2, 2=>1" in the original
 * note), in both the vector loop and the scalar tail.
 *
 *   dst  output line, n / 2 bytes are written
 *   src  input line of n bytes
 *   n    number of input bytes; must be positive
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * src is read through an mlib_d64 pointer and dst is written through an
 * mlib_f32 pointer, so 8-byte / 4-byte alignment is assumed - TODO confirm
 * against the public API contract.
 *
 * An odd n leaves the last input byte unpaired.  It is ignored, so exactly
 * n / 2 outputs are produced and src[n] is never read.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *sp0 = (mlib_d64 *)src;
	mlib_f32 *pd = (mlib_f32 *)dst;
	mlib_d64 d0;
	mlib_d64 tmp0, tmp1, data;
	mlib_d64 acc0_hi, acc0_lo;
	/* 16-bit lanes 0, 1, 0, 1: the alternating rounding bias */
	mlib_d64 round = vis_to_double_dup(0x1);
	/* 0x0100 in the upper half: widens bytes to 16 bits via fmul8x16au */
	mlib_f32 fone = vis_to_float(0x1000000);
	mlib_s32 i, bias = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* scale field 6, used by vis_fpack16 for the final divide by two */
	vis_write_gsr(6 << 3);

	/* 8 input bytes -> 4 output bytes per iteration */
#pragma pipeloop(0)
	for (i = 0; i <= n - 8; i += 8) {
		d0 = (*sp0++);
		/*
		 * Two merges regroup the bytes so that the even-indexed
		 * inputs end up in the high half and the odd-indexed inputs
		 * in the low half of tmp1.
		 */
		tmp0 = vis_fpmerge(vis_read_hi(d0), vis_read_lo(d0));
		tmp1 = vis_fpmerge(vis_read_hi(tmp0), vis_read_lo(tmp0));
		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp1), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp1), fone);
		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data = vis_fpadd16(acc0_hi, round);
		(*pd++) = vis_fpack16(data);
	}

	dst = (mlib_u8 *)pd;

	/*
	 * Scalar tail.  Each step consumes src[i] and src[i + 1], so it only
	 * runs while a complete pair is available (n - 1 cannot underflow
	 * because n > 0 here).  The vector loop always emits a multiple of
	 * four outputs, so restarting the bias at 0 keeps the 0/1 pattern.
	 */
	for (; i < n - 1; i += 2) {
		(*dst++) = (src[i] + src[i + 1] + bias) >> 1;
		/* 1=>2, 2=>1 */
		bias ^= 1;
	}

	return (MLIB_SUCCESS);
}
static void FUNC( m2) ( FUNC_M_ARG) { mlib_s32 i; mlib_d64 k0 = pkern[0]; mlib_d64 k1 = pkern[1]; mlib_d64 a0, a1, aa, sum; mlib_d64 *perror = vis_alignaddr(perror1, 0); a0 = (*perror++); for (i = 0; i < (sw + 3) / 4; i++) { aa = (*perror++); a1 = vis_faligndata(a0, aa); sum = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0)); buffd[i] = vis_fpadd16(sum, FMUL_16x16(k1, a1)); a0 = aa; } }
/*
 * __mlib_VideoAddBlock_U8_S16
 *
 * Adds an 8x8 block of 16-bit values to an 8x8 block of bytes in place:
 * each destination row of 8 bytes is widened to 16 bits, added to the
 * matching 8 values of mc_block and packed back to bytes with
 * vis_fpack16_pair.
 *
 *   curr_block  8 rows of 8 bytes, rows `stride` bytes apart; updated
 *   mc_block    64 consecutive 16-bit values (two mlib_d64 per row)
 *   stride      byte distance between rows of curr_block
 *
 * Both blocks are accessed as mlib_d64, so 8-byte alignment is assumed -
 * TODO confirm.  Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoAddBlock_U8_S16(
	mlib_u8 *curr_block,
	const mlib_s16 *mc_block,
	mlib_s32 stride)
{
	mlib_f32 fzero = vis_fzeros();
	/* 0x0100 in the low half: widens bytes to 16 bits via fmul8x16al */
	mlib_f32 widen = vis_to_float(0x100);
	mlib_d64 *delta = (mlib_d64 *)mc_block;
	mlib_u8 *row_addr = curr_block;
	mlib_d64 *row;
	mlib_d64 pix, pix_hi, pix_lo, add_hi, add_lo;
	mlib_s32 rows_left;

	/* scale field 7 for the packing step */
	vis_write_gsr(7 << 3);

#pragma pipeloop(0)
	for (rows_left = 8; rows_left > 0; rows_left--) {
		row = (mlib_d64 *)row_addr;

		pix = *row;
		add_hi = delta[0];
		add_lo = delta[1];
		delta += 2;

		/* two equivalent ways of widening 4 bytes to 4 x 16 bits */
		pix_hi = vis_fpmerge(fzero, vis_read_hi(pix));
		pix_lo = vis_fmul8x16al(vis_read_lo(pix), widen);

		add_hi = vis_fpadd16(add_hi, pix_hi);
		add_lo = vis_fpadd16(add_lo, pix_lo);

		*row = vis_fpack16_pair(add_hi, add_lo);

		row_addr += stride;
	}

	return (MLIB_SUCCESS);
}
static void FUNC( m1) ( FUNC_M_ARG) { mlib_s32 i; mlib_d64 k0 = pkern[0]; mlib_d64 a0, e0, e1; mlib_d64 *perror = vis_alignaddr(perror1, 0); e0 = (*perror++); for (i = 0; i < (sw + 3) / 4; i++) { e1 = (*perror++); a0 = vis_faligndata(e0, e1); buffd[i] = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0)); e0 = e1; } }
/*
 * __mlib_VideoInterpAveX_U8_U8_16x16
 *
 * 16-byte-wide, 16-row member of the "InterpAveX" family.  For every row it
 * builds two views of the reference data - the 16 bytes starting at
 * ref_block and the 16 bytes starting at ref_block + 1 - and hands both,
 * together with the current destination word, to MLIB_V_VIDEOINTERPAVG,
 * which updates the destination word in place.
 *
 *   curr_block    destination block; accessed as mlib_d64, so assumed
 *                 8-byte aligned - TODO confirm
 *   ref_block     reference block, any byte alignment
 *   frame_stride  not referenced in this body
 *   field_stride  byte distance between consecutive rows, used for both
 *                 the reference and the destination block
 *
 * Each loop iteration processes two rows; y counts 8 iterations.
 * Always returns MLIB_SUCCESS.
 *
 * NOTE(review): MLIB_V_VIDEOINTERPAVG / MLIB_V_VIDEOINTERPAVG0 are defined
 * outside this block and presumably use fm2, fzero and rounder by name.
 */
mlib_status
__mlib_VideoInterpAveX_U8_U8_16x16(
	mlib_u8 *curr_block,
	const mlib_u8 *ref_block,
	mlib_s32 frame_stride,
	mlib_s32 field_stride)
{
	mlib_d64 s0, s1, s2, s3, s4, s5, s6;
	mlib_d64 sd0, sd1, sd2, sd3, d0, d1, d2, d3;
	mlib_d64 *sd, *dd;
	mlib_d64 dzero = vis_fzero();
	const mlib_f32 fm2 = vis_to_float(0x1000200);
	mlib_f32 fzero = vis_read_hi(dzero);
	/* 0 - vis_fone() per 16-bit lane ... */
	mlib_d64 rounder = vis_fpsub16(dzero, vis_fone());
	mlib_s32 y;

	/* ... tripled */
	rounder = vis_fpadd16(vis_fpadd16(rounder, rounder), rounder);

	/* scale field 5, align field = byte offset of ref_block */
	vis_write_gsr((5 << 3) + ((mlib_u32)ref_block & 7));

	dd = (mlib_d64 *)curr_block;
	sd = (mlib_d64 *)((mlib_addr)ref_block & ~7);

	y = 8;

	if (((mlib_s32)(ref_block + 1) & 7)) {
		/*
		 * General case: ref_block + 1 is not 8-byte aligned, so the
		 * shifted view needs its own vis_faligndata pass with the
		 * GSR offset switched to that of ref_block + 1.
		 */
		do {
			/* unshifted view (GSR offset = ref_block & 7) */
			s0 = sd[0];
			s1 = sd[1];
			s2 = sd[2];
			sd0 = vis_faligndata(s0, s1);
			sd1 = vis_faligndata(s1, s2);
			sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
			s4 = sd[0];
			s5 = sd[1];
			s6 = sd[2];
			sd2 = vis_faligndata(s4, s5);
			sd3 = vis_faligndata(s5, s6);
			/* switch the GSR offset to that of ref_block + 1 */
			vis_alignaddr((void *)(ref_block + 1), 0);
			sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
			d0 = dd[0];
			d1 = dd[1];
			d2 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[0];
			d3 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[1];
			/* view shifted by one byte, from the same raw words */
			s0 = vis_faligndata(s0, s1);
			s1 = vis_faligndata(s1, s2);
			s2 = vis_faligndata(s4, s5);
			s3 = vis_faligndata(s5, s6);
			MLIB_V_VIDEOINTERPAVG(d0, sd0, s0);
			MLIB_V_VIDEOINTERPAVG(d1, sd1, s1);
			MLIB_V_VIDEOINTERPAVG(d2, sd2, s2);
			MLIB_V_VIDEOINTERPAVG(d3, sd3, s3);
			dd[0] = d0;
			dd[1] = d1;
			dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
			dd[0] = d2;
			dd[1] = d3;
			dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
			/* restore the GSR offset for the next iteration */
			vis_alignaddr((void *)ref_block, 0);
		} while (--y);
	} else {
		/*
		 * ref_block + 1 is 8-byte aligned: the shifted view is simply
		 * the next raw word (s1, s2 / s5, s6), no second alignment
		 * pass is needed.
		 *
		 * NOTE(review): only the first call below uses the
		 * MLIB_V_VIDEOINTERPAVG0 spelling; confirm that this is
		 * intentional and that both macros are interchangeable here.
		 */
		do {
			s0 = sd[0];
			s1 = sd[1];
			s2 = sd[2];
			sd0 = vis_faligndata(s0, s1);
			sd1 = vis_faligndata(s1, s2);
			sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
			s4 = sd[0];
			s5 = sd[1];
			s6 = sd[2];
			sd2 = vis_faligndata(s4, s5);
			sd3 = vis_faligndata(s5, s6);
			sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
			d0 = dd[0];
			d1 = dd[1];
			d2 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[0];
			d3 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[1];
			MLIB_V_VIDEOINTERPAVG0(d0, sd0, s1);
			MLIB_V_VIDEOINTERPAVG(d1, sd1, s2);
			MLIB_V_VIDEOINTERPAVG(d2, sd2, s5);
			MLIB_V_VIDEOINTERPAVG(d3, sd3, s6);
			dd[0] = d0;
			dd[1] = d1;
			dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
			dd[0] = d2;
			dd[1] = d3;
			dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
		} while (--y);
	}

	return (MLIB_SUCCESS);
}
/*
 * mlib_v_VideoColorYUV2ABGR422_nonalign
 *
 * Converts planar Y/U/V rows to packed 4-byte pixels (byte order
 * x, blue, green, red; the first byte of every pixel is masked out of all
 * stores, so the alpha channel in memory is left untouched).  Each U and V
 * byte is duplicated with vis_fpmerge(f, f), i.e. one chroma sample serves
 * two horizontally adjacent luma samples, and the chroma row advances once
 * per luma row.
 *
 *   abgr         destination, height rows of width pixels
 *   y, u, v      source planes; no alignment required (loads go through
 *                vis_alignaddr / vis_faligndata, stores through vis_pst_8
 *                with edge masks)
 *   width        pixels per row
 *   height       number of rows
 *   abgr_stride  byte distance between destination rows
 *   y_stride     byte distance between Y rows
 *   uv_stride    byte distance between U rows and between V rows
 *
 * Always returns MLIB_SUCCESS.
 *
 * The loop is software pipelined: the chroma products for the next group
 * of 8 pixels are computed while the current group is being stored, and
 * the GSR align field is re-programmed (off2 / off3 / off / sp1) before
 * each vis_faligndata that depends on it.  Statement order therefore
 * matters.
 *
 * NOTE(review): dd0 is read by the first vis_faligndata(dd0, dd1) of a row
 * before it has been assigned in that row; the affected bytes appear to be
 * suppressed by emask1, but confirm.  For width <= 0 the trailing store
 * would also read dd0/dd1 unassigned - presumably callers never pass that.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR422_nonalign(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 abgr_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
	/* pointers to src address (u, v: current position / line start) */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;
	/* pointers to src address (y: current position / line start) */
	mlib_u8 *sp1, *sl1;
	/* pointers to dst address (current, line start, last byte of row) */
	mlib_u8 *dp, *dl, *dend;
	/* aligned pointer to y */
	mlib_d64 *spy;
	/* aligned pointer to dst */
	mlib_d64 *dpp;
	/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;
	/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv;
	/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);
	/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);
	/* 2.0184*8192 in the low half; 0x0100 in the high half */
	mlib_f32 k5 = vis_to_float(0x1004097);
	/* offsets 222.9952, 135.6352 and 276.9856, each scaled by 32 */
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
	    temp_b_lo;
	mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo;
	mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo;
	mlib_d64 dd, dd0, dd1;
	/* loop variable */
	mlib_s32 i, j;
	/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_s32 off;
	mlib_f32 *dfu, *dfv;
	mlib_d64 du0, du1, dv0, dv1;
	mlib_s32 off2, off3;
	mlib_s32 inc;

	/*
	 * initialize GSR scale factor
	 */
	vis_write_gsr(2 << 3);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;
	dl = dp = (mlib_u8 *)abgr;

	/*
	 * row loop
	 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
		/*
		 * u / v are fetched as aligned 4-byte words; the byte offset
		 * is doubled because every byte is duplicated by fpmerge
		 * before the alignment shift is applied.
		 */
		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;
		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);
		/* i: byte offset of dp inside its aligned 8-byte word */
		i = dp - (mlib_u8 *)dpp;
		/* rotate the "skip alpha" pattern to match that offset */
		emask >>= i;
		/* inc == 0 when dp is aligned: first store is a full word */
		inc = (emask1 != 0xff);
		emask1 &= emask;
		off = 8 - i;

		/* prime the u pipeline */
		vis_alignaddr((void *)off2, 0);
		fu0 = vis_ld_f32_nf(dfu);
		dfu++;
		fu1 = vis_ld_f32_nf(dfu);
		dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

		/* prime the v pipeline */
		vis_alignaddr((void *)off3, 0);
		fv0 = vis_ld_f32_nf(dfv);
		dfv++;
		fv1 = vis_ld_f32_nf(dfv);
		dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

		/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		/* prime the y pipeline */
		vis_alignaddr(sp1, 0);
		dy0 = vis_ld_d64_nf(spy);
		spy++;
		dy3 = vis_ld_d64_nf(spy);
		spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

		/*
		 * 8-pixel column loop (four 8-byte stores per iteration)
		 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {
			/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);
			/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);
			/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
			/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

			/* fetch u for the next group */
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu);
			dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;

			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);
			blue_lo = vis_fpack16(temp_b_lo);
			/* k5 high half (0x0100) widens green to 16 bits */
			x_green_hi = vis_fmul8x16au(green_hi, k5);
			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);
			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

			/* fetch v for the next group */
			vis_alignaddr((void *)off3, 0);
			fv1 = vis_ld_f32_nf(dfv);
			dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

			/* from here on faligndata shifts for the destination */
			vis_alignaddr((void *)off, 0);
			/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			/* interleave to x,B,G,R - two pixels per 8 bytes */
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
			    vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;
			/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_hi),
			    vis_read_lo(blue_red_hi));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_lo),
			    vis_read_hi(blue_red_lo));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_lo),
			    vis_read_lo(blue_red_lo));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			/* fetch y for the next group */
			vis_alignaddr(sp1, 0);
			dy3 = vis_ld_d64_nf(spy);
			spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;
			emask1 = emask;
		}

		/* fewer than 8 pixels left: same arithmetic, guarded stores */
		if (i < width) {
			vis_alignaddr((void *)off, 0);
			/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);
			/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);
			/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
			/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);
			blue_lo = vis_fpack16(temp_b_lo);
			x_green_hi = vis_fmul8x16au(green_hi, k5);
			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);
			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

			/* store two pixels at a time while any remain */
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
			    vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dd0 = dd1;
			dpp += inc;
			i += 2;

			if (i < width) {
				dd1 = vis_fpmerge(vis_read_lo(x_green_hi),
				    vis_read_lo(blue_red_hi));
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 2;

				if (i < width) {
					dd1 = vis_fpmerge(vis_read_hi
					    (x_green_lo),
					    vis_read_hi(blue_red_lo));
					dd = vis_faligndata(dd0, dd1);
					vis_pst_8(dd, dpp++, emask);
					dd0 = dd1;
				}
			}
		}

		/* flush the bytes still held in dd0/dd1, clipped at dend */
		vis_alignaddr((void *)off, 0);
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

		/* next row; reset the mask that was shifted for this row */
		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;
		dl = dp = dl + abgr_stride;
		emask = 0x7777;
	}

	return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorConvert_S8_U8_Sat
 *
 * Converts n unsigned bytes to signed bytes with saturation: values above
 * MLIB_S8_MAX become MLIB_S8_MAX, everything else is copied unchanged.
 *
 *   z  destination vector (mlib_s8), n elements
 *   x  source vector (mlib_u8), n elements
 *   n  number of elements
 *
 * Returns MLIB_SUCCESS on the path visible here.
 *
 * Vectors shorter than 16 elements are handed to PACK_U_S.
 * NOTE(review): PACK_U_S is defined outside this block; it presumably
 * converts the whole vector element by element and returns - confirm.
 *
 * Longer vectors: leading elements are converted one by one until dst is
 * 8-byte aligned, the bulk is converted 8 bytes at a time, and the last
 * (length & 7) elements are converted one by one again.
 *
 * Vector step: each byte is widened to 16 bits, 0x80 is added, the result
 * is packed back to a byte with vis_fpack16_pair (GSR scale field 7), and
 * the top bit is flipped with vis_fxor(…, 0x80…80).  Assuming the pack
 * clamps at 255 (see the VIS manual), v < 128 gives (v + 128) ^ 0x80 == v
 * and v >= 128 gives 255 ^ 0x80 == 127.
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *src = (void *)x;
	mlib_s8 *dst = z;
	/*
	 * 32-bit zero used as the high operand of vis_fpmerge.  Declared
	 * mlib_f32 to match what vis_fzeros() returns and what vis_fpmerge
	 * takes, instead of round-tripping the value through mlib_d64.
	 */
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_u8 c;
	/* 0x0080 in every 16-bit lane: the +128 bias */
	mlib_d64 dsp = vis_to_double_dup(0x800080);
	/* 0x80 in every byte: removes the bias again after packing */
	mlib_d64 rst = vis_to_double_dup(0x80808080);
	/* 0x0100 in the low half: widens bytes to 16 bits via fmul8x16al */
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
	}

	/*
	 * First, try to align destination address for 8 bytes.
	 * (length >= 16 here, so at most 7 elements are consumed.)
	 */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	vis_write_gsr(7 << 3);

	/*
	 * Now analyze source address alignment.
	 */
	if (((mlib_addr)src & 7) == 0) {

		/*
		 * Source address is also 8-byte aligned.
		 */
		dsrc = (mlib_d64 *)src;

		/*
		 * Peeling the 1st iteration, so that the main loop can
		 * always process two words.  i becomes 1 when a word was
		 * peeled and 0 otherwise.
		 */
		if ((i = (len_64 & 1)) != 0) {
			d1 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
		}

		/*
		 * Then loop with step==2: two words per iteration.
		 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d4 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			d2 = vis_fpmerge(fzero, vis_read_hi(d4));
			d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
			(*ddst++) = vis_fxor(d4, rst);
		}
	} else {

		/*
		 * Source address has arbitrary alignment. Use vis_alignaddr()
		 * and vis_faligndata() functions.  The loads past the first
		 * word use vis_ld_d64_nf because the last aligned word
		 * fetched may extend beyond the source vector.
		 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

		/*
		 * Peeling of 1 iteration.
		 */
		if ((i = (len_64 & 1)) != 0) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d1 = vis_faligndata(d1, d2);
			d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
			d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpadd16(dsp, d4);
			d1 = vis_fpack16_pair(d3, d4);
			(*ddst++) = vis_fxor(d1, rst);
		}

		/*
		 * Then loop with step==2.
		 */
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d6 = vis_faligndata(d1, d2);
			d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d3 = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d6 = vis_fpack16_pair(d4, d5);
			(*ddst++) = vis_fxor(d3, rst);
			(*ddst++) = vis_fxor(d6, rst);
		}
	}

	/*
	 * Scalar tail.  dst and src still point at the first element of the
	 * vectorised region, so the tail starts even_length elements in.
	 */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c = src[even_length + i]) >
		    MLIB_S8_MAX ? MLIB_S8_MAX : c;

	return (MLIB_SUCCESS);
}
/*
 * Affine table-filter line worker, variant "8nw_2_2_4".
 *
 * Stores one mlib_d64 into buff per output position (`size` positions in
 * total).  When at least 6 positions are available a software-pipelined
 * path is used: a prologue starts the first positions, the steady-state
 * loop stores two finished results (res0, res1) and starts two new
 * positions per iteration, and an epilogue drains what is still in flight
 * before i is advanced by 6.  Any remaining positions are handled one at a
 * time by the final loop.
 *
 * NOTE(review): size, i, res0, res1, d00 and d10 are not declared in this
 * body; presumably DECLAREVAR declares them, and CALC_SRC_PTR,
 * LOAD_1PIXEL_2x2, MAKE_2x2, FADD_4x2 and RESULT_1PIXEL_2x2 use them (and
 * filterX, filterY, lineAddr, ws, yFilter) by name.  The exact order of the
 * macro invocations is what implements the pipeline - do not reorder
 * without reading the macro definitions.
 */
void
mlib_v_ImageAffineTableLine_8nw_2_2_4(
	mlib_d64 *buff,
	const mlib_d64 *filterX,
	const mlib_d64 *filterY,
	const mlib_u8 **lineAddr,
	mlib_affine_workspace *ws)
{
	DECLAREVAR;
	mlib_f32 yFilter;

	i = 0;

	if (i <= size - 6) {
		/* prologue: fill the pipeline */
		CALC_SRC_PTR;
		LOAD_1PIXEL_2x2;

		CALC_SRC_PTR;
		MAKE_2x2(0);
		MAKE_2x2(1);

		FADD_4x2;
		MAKE_2x2(0);
		MAKE_2x2(1);

		/* steady state: two stores and two new positions per pass */
#pragma pipeloop(0)
		for (; i <= size - 8; i += 2) {
			*buff = res0;
			buff++;
			*buff = res1;
			buff++;
			FADD_4x2;
			MAKE_2x2(0);
			MAKE_2x2(1);
		}

		/* epilogue: drain the six positions still in flight */
		*buff = res0;
		buff++;
		*buff = res1;
		buff++;
		FADD_4x2;

		*buff = res0;
		buff++;
		*buff = res1;
		buff++;

		RESULT_1PIXEL_2x2(0);
		LOAD_1PIXEL_2x2;
		RESULT_1PIXEL_2x2(1);
		FADD_4x2;

		*buff = res0;
		buff++;
		*buff = res1;
		buff++;
		i += 6;
	}

	/* leftover positions, one per iteration */
#pragma pipeloop(0)
	for (; i < size; i++) {
		CALC_SRC_PTR;
		LOAD_1PIXEL_2x2;
		RESULT_1PIXEL_2x2(0);
		res0 = vis_fpadd16(d00, d10);
		*buff = res0;
		buff++;
	}
}
/*
 * __mlib_VideoDownSample422 (vis_bshuffle variant)
 *
 * Halves the horizontal resolution of one line: every output byte is the
 * average of two neighbouring input bytes, with a rounding bias that
 * alternates 0, 1, 0, 1, ... across outputs.
 *
 *   dst  output line; n / 2 bytes are written
 *   src  input line of n bytes
 *   n    number of input bytes; must be positive
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * Both lines are accessed through mlib_d64 pointers, so 8-byte alignment is
 * assumed - TODO confirm against the public API contract.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out = (mlib_d64 *)dst;
	mlib_d64 word_a, word_b;
	mlib_d64 perm_a, perm_b;
	mlib_d64 even_w, odd_w, avg_a, avg_b;
	/* 16-bit lanes 0, 1, 0, 1: the alternating rounding bias */
	mlib_d64 rnd = vis_to_double_dup(0x1);
	/* 0x0100 in the upper half: widens bytes to 16 bits */
	mlib_f32 unit = vis_to_float(0x1000000);
	mlib_s32 pos, tail_mask;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* scale field 6 for the packing step */
	vis_write_gsr(6 << 3);
	/* shuffle pattern: even-indexed bytes first, then odd-indexed */
	vis_write_bmask(0x02461357, 0);

	/* 16 input bytes -> 8 output bytes per iteration */
#pragma pipeloop(0)
	for (pos = 0; pos <= n - 16; pos += 16) {
		word_a = in[0];
		word_b = in[1];
		in += 2;

		perm_a = vis_bshuffle(word_a, word_a);
		even_w = vis_fmul8x16au(vis_read_hi(perm_a), unit);
		odd_w = vis_fmul8x16au(vis_read_lo(perm_a), unit);
		avg_a = vis_fpadd16(even_w, odd_w);
		avg_a = vis_fpadd16(avg_a, rnd);

		perm_b = vis_bshuffle(word_b, word_b);
		even_w = vis_fmul8x16au(vis_read_hi(perm_b), unit);
		odd_w = vis_fmul8x16au(vis_read_lo(perm_b), unit);
		avg_b = vis_fpadd16(even_w, odd_w);
		avg_b = vis_fpadd16(avg_b, rnd);

		*out = vis_fpack16_pair(avg_a, avg_b);
		out++;
	}

	/*
	 * Fewer than 16 input bytes left: compute a full output word (the
	 * second input word is fetched with the non-faulting load) and store
	 * only the bytes up to dst + n / 2 - 1.
	 */
	if (pos < n) {
		word_a = in[0];
		word_b = vis_ld_d64_nf(in + 1);

		perm_a = vis_bshuffle(word_a, word_a);
		even_w = vis_fmul8x16au(vis_read_hi(perm_a), unit);
		odd_w = vis_fmul8x16au(vis_read_lo(perm_a), unit);
		avg_a = vis_fpadd16(even_w, odd_w);
		avg_a = vis_fpadd16(avg_a, rnd);

		perm_b = vis_bshuffle(word_b, word_b);
		even_w = vis_fmul8x16au(vis_read_hi(perm_b), unit);
		odd_w = vis_fmul8x16au(vis_read_lo(perm_b), unit);
		avg_b = vis_fpadd16(even_w, odd_w);
		avg_b = vis_fpadd16(avg_b, rnd);

		tail_mask = vis_edge8(out, (dst + (n / 2) - 1));
		vis_pst_8(vis_fpack16_pair(avg_a, avg_b), out, tail_mask);
	}

	return (MLIB_SUCCESS);
}
/*
 * mlib_v_VideoColorYUV2RGB444_nonalign
 *
 * Converts planar Y/U/V rows with one chroma sample per pixel to packed
 * 3-byte RGB pixels.  Each row is first converted into an 8-byte aligned
 * scratch buffer and then copied to the destination with
 * __mlib_VectorCopy_U8, so rgb itself needs no particular alignment; the
 * sources are read through vis_alignaddr / vis_faligndata.
 *
 *   rgb         destination, height rows of width * 3 bytes
 *   y, u, v     source planes
 *   width       pixels per row
 *   height      number of rows
 *   rgb_stride  byte distance between destination rows
 *   yuv_stride  byte distance between rows of each source plane
 *
 * Returns MLIB_FAILURE if the scratch buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 *
 * The scratch buffer is the on-stack BUFF when width * 3 <= 16 * 1024 and
 * a heap block otherwise; the heap block is freed before returning.
 *
 * The loop is software pipelined: red/green/blue for the first 8 pixels
 * are computed before the column loop, and every iteration stores the
 * previously computed 8 pixels and then computes the next 8.  The GSR
 * align field and the bmask are re-programmed between dependent
 * operations, so statement order matters.
 *
 * NOTE(review): vis_fpack16_pair depends on the GSR scale field, which is
 * not written anywhere in this function; presumably the caller sets it
 * before calling - confirm.
 * NOTE(review): BUFF holds 16 * 1024 mlib_d64 elements while the size test
 * is in bytes, so the stack array looks larger than strictly required -
 * confirm whether that is intentional before shrinking it.
 * NOTE(review): STORE_PIXEL is defined outside this block and presumably
 * uses dp, red, green and blue by name.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB444_nonalign(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 rgb_stride,
	mlib_s32 yuv_stride)
{
	/* aligned pointers to y, u, v */
	mlib_d64 *spy, *dfu, *dfv;
	/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv, du0, du1, dv0, dv1;
	/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);
	/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);
	/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
	/* offsets 222.9952, 135.6352 and 276.9856, each scaled by 32 */
	mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2;
	/* loop variable */
	mlib_s32 i, j;
	mlib_d64 *buf, BUFF[16 * 1024];
	mlib_u8 *tmp, *dp;

	if (width * 3 > 16 * 1024) {
		/* +7 leaves room to round the pointer up to 8 bytes */
		tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

		if (tmp == NULL)
			return (MLIB_FAILURE);
		buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
	} else {
		buf = (mlib_d64 *)BUFF;
	}

	dp = (mlib_u8 *)buf;
	ddp = (mlib_d64 *)dp;

	for (j = 0; j < height; j++) {
		/* prime u */
		dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0);
		du0 = (*dfu++);
		du1 = vis_ld_d64_nf(dfu);
		dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

		/* prime v */
		dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0);
		dv0 = (*dfv++);
		dv1 = vis_ld_d64_nf(dfv);
		dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

		/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		/* prime y */
		spy = (mlib_d64 *)vis_alignaddr((void *)y, 0);
		dy0 = (*spy++);
		dy3 = vis_ld_d64_nf(spy);
		spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

		/* --- pipeline prologue: colours for the first 8 pixels --- */

		/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);
		/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);

		vis_alignaddr((void *)u, 0);
		du1 = vis_ld_d64_nf(dfu);
		dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;

		/* Y*1.1644 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

		vis_alignaddr((void *)v, 0);
		dv1 = vis_ld_d64_nf(dfv);
		dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

		/* Y*1.1644 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
		/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
		/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		g_hi = vis_fpadd16(g_hi, y_11644_hi);
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
		g_lo = vis_fpadd16(g_lo, y_11644_lo);
		green = vis_fpack16_pair(g_hi, g_lo);
		b_hi = vis_fpadd16(b_hi, y_11644_hi);
		b_lo = vis_fpadd16(b_lo, y_11644_lo);
		blue = vis_fpack16_pair(b_hi, b_lo);
		r_hi = vis_fpadd16(r_hi, y_11644_hi);
		r_lo = vis_fpadd16(r_lo, y_11644_lo);
		red = vis_fpack16_pair(r_hi, r_lo);

		vis_alignaddr((void *)y, 0);
		dy3 = vis_ld_d64_nf(spy);
		spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

		/* 8 pixels (24 output bytes) per iteration */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {
			/*
			 * Interleave the three colour planes into 24 packed
			 * bytes: first red with green (dd0..dd2), then the
			 * result with blue, using one byte-shuffle mask per
			 * output word.
			 */
			vis_write_bmask(0x0801902A, 0);
			dd0 = vis_bshuffle(red, green);
			vis_write_bmask(0x03B04C05, 0);
			dd1 = vis_bshuffle(red, green);
			vis_write_bmask(0xD06E07F0, 0);
			dd2 = vis_bshuffle(red, green);
			vis_write_bmask(0x01834967, 0);
			ddp[0] = vis_bshuffle(dd0, blue);
			vis_write_bmask(0xA12B45C7, 0);
			ddp[1] = vis_bshuffle(dd1, blue);
			vis_write_bmask(0x0D23E56F, 0);
			ddp[2] = vis_bshuffle(dd2, blue);

			/* colours for the next 8 pixels */

			/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);
			/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

			vis_alignaddr((void *)u, 0);
			du1 = vis_ld_d64_nf(dfu);
			dfu++;
			du = vis_faligndata(du0, du1);
			du0 = du1;

			/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

			vis_alignaddr((void *)v, 0);
			dv1 = vis_ld_d64_nf(dfv);
			dfv++;
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

			/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
			/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			g_hi = vis_fpadd16(g_hi, y_11644_hi);
			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			g_lo = vis_fpadd16(g_lo, y_11644_lo);
			green = vis_fpack16_pair(g_hi, g_lo);
			b_hi = vis_fpadd16(b_hi, y_11644_hi);
			b_lo = vis_fpadd16(b_lo, y_11644_lo);
			blue = vis_fpack16_pair(b_hi, b_lo);
			r_hi = vis_fpadd16(r_hi, y_11644_hi);
			r_lo = vis_fpadd16(r_lo, y_11644_lo);
			red = vis_fpack16_pair(r_hi, r_lo);

			vis_alignaddr((void *)y, 0);
			dy3 = vis_ld_d64_nf(spy);
			spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;

			ddp += 3;
		}

		/*
		 * Tail: width - i (< 8) pixels remain, already converted in
		 * red/green/blue.  The colour words are rotated by width - i
		 * bytes and the pixels are written from the last one
		 * backwards (dp starts at the last pixel and steps by -3).
		 */
		dp = (mlib_u8 *)ddp;

		vis_alignaddr((void *)(width - i), 0);
		blue = vis_faligndata(blue, blue);
		green = vis_faligndata(green, green);
		red = vis_faligndata(red, red);
		dp += ((width - i - 1) * 3);

		/* alignment state used inside STORE_PIXEL - see its definition */
		vis_alignaddr((void *)spy, 7);
		for (; i < width; i++) {
			STORE_PIXEL(0, 1, 2);
			dp -= 3;
		}

		/* copy the finished row out of the scratch buffer */
		__mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3);

		/* next row */
		rgb += rgb_stride;
		dp = (mlib_u8 *)buf;
		ddp = (mlib_d64 *)dp;

		y += yuv_stride;
		u += yuv_stride;
		v += yuv_stride;
	}

	/* same condition as the allocation above */
	if (width * 3 > 16 * 1024)
		__mlib_free(tmp);
	return (MLIB_SUCCESS);
}
mlib_status __mlib_VideoInterpAveX_U8_U8( mlib_u8 *curr_block, const mlib_u8 *ref_block, mlib_s32 width, mlib_s32 height, mlib_s32 frame_stride, mlib_s32 field_stride) { mlib_d64 s0, s1, s2, s3, s4, s5, s6, s7; mlib_d64 sd0, sd1, sd2, sd3, d0, d1, d2, d3; mlib_d64 *sd, *dd; mlib_d64 dzero = vis_fzero(); const mlib_f32 fm2 = vis_to_float(0x1000200); mlib_f32 fzero = vis_read_hi(dzero); mlib_d64 rounder = vis_fpsub16(dzero, vis_fone()); mlib_s32 y; rounder = vis_fpadd16(vis_fpadd16(rounder, rounder), rounder); vis_write_gsr((5 << 3) + ((mlib_u32)ref_block & 7)); dd = (mlib_d64 *)curr_block; sd = (mlib_d64 *)((mlib_addr)ref_block & ~7); if (width == 8) { y = height >> 2; if (((mlib_s32)(ref_block + 1) & 7)) { do { s0 = sd[0]; s1 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd0 = vis_faligndata(s0, s1); s2 = sd[0]; s3 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd1 = vis_faligndata(s2, s3); s4 = sd[0]; s5 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd2 = vis_faligndata(s4, s5); s6 = sd[0]; s7 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd3 = vis_faligndata(s6, s7); vis_alignaddr((void *)(ref_block + 1), 0); d0 = *dd; d1 = *(mlib_d64 *)((mlib_u8 *)dd + field_stride); d2 = *(mlib_d64 *)((mlib_u8 *)dd + 2 * field_stride); d3 = *(mlib_d64 *)((mlib_u8 *)dd + 3 * field_stride); s0 = vis_faligndata(s0, s1); s1 = vis_faligndata(s2, s3); s2 = vis_faligndata(s4, s5); s3 = vis_faligndata(s6, s7); MLIB_V_VIDEOINTERPAVG(d0, sd0, s0); MLIB_V_VIDEOINTERPAVG(d1, sd1, s1); MLIB_V_VIDEOINTERPAVG(d2, sd2, s2); MLIB_V_VIDEOINTERPAVG(d3, sd3, s3); *dd = d0; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d1; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d2; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d3; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); vis_alignaddr((void *)ref_block, 0); } while (--y); } else { do { s0 = sd[0]; s1 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd0 = 
vis_faligndata(s0, s1); s2 = sd[0]; s3 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd1 = vis_faligndata(s2, s3); s4 = sd[0]; s5 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd2 = vis_faligndata(s4, s5); s6 = sd[0]; s7 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd3 = vis_faligndata(s6, s7); d0 = *dd; d1 = *(mlib_d64 *)((mlib_u8 *)dd + field_stride); d2 = *(mlib_d64 *)((mlib_u8 *)dd + 2 * field_stride); d3 = *(mlib_d64 *)((mlib_u8 *)dd + 3 * field_stride); MLIB_V_VIDEOINTERPAVG0(d0, sd0, s1); MLIB_V_VIDEOINTERPAVG(d1, sd1, s3); MLIB_V_VIDEOINTERPAVG(d2, sd2, s5); MLIB_V_VIDEOINTERPAVG(d3, sd3, s7); *dd = d0; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d1; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d2; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d3; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); } while (--y); } } else {
static mlib_status mlib_v_VideoColorYUV2RGB420_nonalign( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 rgb_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* pointers to src address */ mlib_u8 *sp2, *sp3, *sl2, *sl3; /* pointers to src address */ mlib_u8 *sp11, *sp12, *sl11, *sl12; /* pointers to dst address */ mlib_u8 *dp1, *dl1; /* pointers to dst address */ mlib_u8 *dp2, *dl2; /* all. pointer to y */ mlib_d64 *spy1, *spy2; /* all. pointers to u, v */ mlib_f32 *dfu, *dfv; /* y data */ mlib_d64 dy0, dy1, dy2, dy3, dy4, dy5; /* u, v data */ mlib_f32 fu0, fu1, fv0, fv1; mlib_d64 du, dv, du0, du1, dv0, dv1; /* (1.1644, 1.5966)*8192 */ mlib_f32 k12 = vis_to_float(0x25433317); /* (-.3920, -.8132)*8192 */ mlib_f32 k34 = vis_to_float(0xf375e5fa); /* 2.0184*8192 */ mlib_f32 k5 = vis_to_float(0x1004097); mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0); mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4); mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0); mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi; mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo; mlib_d64 y_11644_hi, y_11644_lo; mlib_d64 z_11644_hi, z_11644_lo; mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo; mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi, temp_b_lo; /* loop variables */ mlib_s32 i, j; mlib_s32 y_stride2 = 2 * y_stride; mlib_s32 rgb_stride2 = 2 * rgb_stride; mlib_s32 off2, off3; mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21; mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22; mlib_d64 *buf1, BUFF1[16 * 1024]; mlib_d64 *buf2, BUFF2[16 * 1024]; mlib_u8 *tmp1, *tmp2; if (width * 3 > 16 * 1024) { tmp1 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); tmp2 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); buf1 = (mlib_d64 *)((mlib_addr)(tmp1 + 7) & ~7); buf2 = (mlib_d64 *)((mlib_addr)(tmp2 + 7) & ~7); } else { buf1 = (mlib_d64 *)BUFF1; buf2 = (mlib_d64 *)BUFF2; } /* * 
initialize GSR scale factor */ vis_write_gsr(2 << 3); sp11 = sl11 = (mlib_u8 *)y; sp12 = sl12 = (mlib_u8 *)y + y_stride; sp2 = sl2 = (mlib_u8 *)u; sp3 = sl3 = (mlib_u8 *)v; dp1 = (mlib_u8 *)buf1; dp2 = (mlib_u8 *)buf2; dl1 = (mlib_u8 *)rgb; dl2 = (mlib_u8 *)(rgb + rgb_stride); ddp1 = (mlib_d64 *)dp1; ddp2 = (mlib_d64 *)dp2; /* * row loop */ for (j = 0; j < height / 2; j++) { spy1 = (mlib_d64 *)vis_alignaddr(sp11, 0); spy2 = (mlib_d64 *)vis_alignaddr(sp12, 0); dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3); off2 = (sp2 - (mlib_u8 *)dfu) * 2; dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3); off3 = (sp3 - (mlib_u8 *)dfv) * 2; vis_alignaddr((void *)off2, 0); fu0 = (*dfu++); fu1 = vis_ld_f32_nf(dfu); dfu++; du0 = vis_fpmerge(fu0, fu0); du1 = vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; vis_alignaddr((void *)off3, 0); fv0 = (*dfv++); fv1 = vis_ld_f32_nf(dfv); dfv++; dv0 = vis_fpmerge(fv0, fv0); dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); /* U*(-0.3920); */ u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); /* V*(-0.8132); */ v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); dy0 = (*spy1++); dy4 = (*spy2++); dy3 = vis_ld_d64_nf(spy1); spy1++; vis_alignaddr(sp11, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; dy5 = vis_ld_d64_nf(spy2); spy2++; vis_alignaddr(sp12, 0); dy2 = vis_faligndata(dy4, dy5); dy4 = dy5; /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); /* 
Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* Z*1.1644 */ z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* Z*1.1644 */ z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, y_11644_hi); green1 = vis_fpack16_to_hi(green1, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue1 = vis_fpack16_to_hi(blue1, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); vis_alignaddr((void *)off2, 0); fu1 = vis_ld_f32_nf(dfu); dfu++; du1 = vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; red1 = vis_fpack16_to_hi(red1, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); vis_alignaddr((void *)off3, 0); fv1 = vis_ld_f32_nf(dfv); dfv++; dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; green1 = vis_fpack16_to_lo(green1, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue1 = vis_fpack16_to_lo(blue1, temp_b_lo); red1 = vis_fpack16_to_lo(red1, temp_r_lo); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); temp_g_hi = vis_fpadd16(g_hi, z_11644_hi); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); temp_b_hi = vis_fpadd16(b_hi, z_11644_hi); green2 = vis_fpack16_to_hi(green2, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, z_11644_hi); blue2 = vis_fpack16_to_hi(blue2, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, z_11644_lo); red2 = vis_fpack16_to_hi(red2, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, z_11644_lo); green2 = vis_fpack16_to_lo(green2, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, z_11644_lo); blue2 = vis_fpack16_to_lo(blue2, temp_b_lo); red2 = vis_fpack16_to_lo(red2, temp_r_lo); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); /* * 16-pixel column loop */ #pragma pipeloop(0) for (i = 0; i <= width - 8; i += 8) { 
vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(red1, green1); dd02 = vis_bshuffle(red2, green2); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(red1, green1); dd12 = vis_bshuffle(red2, green2); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(red1, green1); dd22 = vis_bshuffle(red2, green2); vis_write_bmask(0x01834967, 0); ddp1[0] = vis_bshuffle(dd01, blue1); ddp2[0] = vis_bshuffle(dd02, blue2); vis_write_bmask(0xA12B45C7, 0); ddp1[1] = vis_bshuffle(dd11, blue1); ddp2[1] = vis_bshuffle(dd12, blue2); vis_write_bmask(0x0D23E56F, 0); ddp1[2] = vis_bshuffle(dd21, blue1); ddp2[2] = vis_bshuffle(dd22, blue2); dy3 = vis_ld_d64_nf(spy1); spy1++; vis_alignaddr(sp11, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; dy5 = vis_ld_d64_nf(spy2); spy2++; vis_alignaddr(sp12, 0); dy2 = vis_faligndata(dy4, dy5); dy4 = dy5; /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); /* Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* Z*1.1644 */ z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* Z*1.1644 */ z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, y_11644_hi); green1 = vis_fpack16_to_hi(green1, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue1 = vis_fpack16_to_hi(blue1, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); vis_alignaddr((void *)off2, 0); fu1 = vis_ld_f32_nf(dfu); dfu++; du1 = 
vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; red1 = vis_fpack16_to_hi(red1, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); vis_alignaddr((void *)off3, 0); fv1 = vis_ld_f32_nf(dfv); dfv++; dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; green1 = vis_fpack16_to_lo(green1, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue1 = vis_fpack16_to_lo(blue1, temp_b_lo); red1 = vis_fpack16_to_lo(red1, temp_r_lo); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); temp_g_hi = vis_fpadd16(g_hi, z_11644_hi); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); temp_b_hi = vis_fpadd16(b_hi, z_11644_hi); green2 = vis_fpack16_to_hi(green2, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, z_11644_hi); blue2 = vis_fpack16_to_hi(blue2, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, z_11644_lo); red2 = vis_fpack16_to_hi(red2, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, z_11644_lo); green2 = vis_fpack16_to_lo(green2, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, z_11644_lo); blue2 = vis_fpack16_to_lo(blue2, temp_b_lo); red2 = vis_fpack16_to_lo(red2, temp_r_lo); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); ddp1 += 3; ddp2 += 3; } dp1 = (mlib_u8 *)ddp1; dp2 = (mlib_u8 *)ddp2; vis_alignaddr((void *)(width - i), 0); blue1 = vis_faligndata(blue1, blue1); green1 = vis_faligndata(green1, green1); red1 = vis_faligndata(red1, red1); dp1 += ((width - i - 1) * 3); blue2 = vis_faligndata(blue2, blue2); green2 = vis_faligndata(green2, green2); red2 = vis_faligndata(red2, red2); dp2 += ((width - i - 1) * 3); vis_alignaddr((void *)7, 0); for (; i < width; i++) { STORE_PIXEL1(0, 1, 2); STORE_PIXEL2(0, 1, 2); dp1 -= 3; dp2 -= 3; } sp11 = sl11 = sl11 + y_stride2; sp12 = sl12 = sl12 + y_stride2; sp2 = sl2 = sl2 + uv_stride; sp3 = sl3 = sl3 + uv_stride; __mlib_VectorCopy_U8(dl1, (mlib_u8 *)buf1, width * 3); __mlib_VectorCopy_U8(dl2, (mlib_u8 *)buf2, width * 3); dl1 = dp1 = dl1 + 
rgb_stride2; dl2 = dp2 = dl2 + rgb_stride2; dp1 = (mlib_u8 *)buf1; dp2 = (mlib_u8 *)buf2; ddp1 = (mlib_d64 *)dp1; ddp2 = (mlib_d64 *)dp2; } if (width * 3 > 16 * 1024) { __mlib_free(tmp1); __mlib_free(tmp2); } return (MLIB_SUCCESS); }
/*
 * __mlib_VideoColorJFIFYCC2RGB420_Nearest
 *
 * Converts two rows of JFIF YCbCr 4:2:0 data (y0 and y1 sharing one cb/cr
 * row) to two rows of packed 3-byte pixels (rgb0 and rgb1), n pixels each.
 * Every chroma byte is duplicated with vis_fpmerge(x, x) so that one
 * chroma sample covers two adjacent columns.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The coefficients actually used are the ones loaded into k12, k34 and k5:
 * 1.00000 (Y), 1.40200 (Cr), -0.34414 (Cb), -0.71414 (Cr) and 1.77200 (Cb),
 * all scaled by 8192; the k_* offsets are scaled by 32. The local names
 * u_3920_*, v_8132_*, u_20184_*, v_15966_*, y_11644_* and z_11644_* carry
 * numbers that do NOT match these constants; read them only as
 * "Cb term", "Cr term", "first row luma" and "second row luma".
 *
 * rgb0/rgb1, y0/y1 and cb/cr are cast straight to mlib_d64 / mlib_f32
 * pointers with no alignment fix-up in this function - presumably callers
 * must pass suitably aligned buffers; confirm against the public contract.
 *
 * The code is software-pipelined: the block before the loop converts the
 * first 8 columns and pre-loads the next inputs; each loop pass stores the
 * previously converted 8 columns of both rows (3 mlib_d64 per row) and
 * converts the following 8. The tail emits the remaining n - i pixels via
 * STORE_PIXEL1 / STORE_PIXEL2, macros defined outside this view.
 */
mlib_status __mlib_VideoColorJFIFYCC2RGB420_Nearest( mlib_u8 *rgb0, mlib_u8 *rgb1, const mlib_u8 *y0, const mlib_u8 *y1, const mlib_u8 *cb, const mlib_u8 *cr, mlib_s32 n) { /* pointers to dst address */ mlib_u8 *dp1, *dp2; /* mlib_d64 pointers to the two y rows */ mlib_d64 *spy1, *spy2; /* mlib_f32 pointers to cb, cr */ mlib_f32 *dfu, *dfv; /* cb, cr data */ mlib_f32 fu, fv; /* y data */ mlib_d64 dy1, dy2; mlib_d64 du, dv; /* (1.00000, 1.40200)*8192 */ mlib_f32 k12 = vis_to_float(0x20002cdd); /* (-.34414, -.71414)*8192 */ mlib_f32 k34 = vis_to_float(0xf4fde926); /* 1.77200*8192 */ mlib_f32 k5 = vis_to_float(0x10038b4); /* (179.45600 - 0.5)*32 */ mlib_d64 k_179_456 = vis_to_double(0x165f165f, 0x165f165f); /* (135.45984 + 0.5)*32 */ mlib_d64 k_135_45984 = vis_to_double(0x10ff10ff, 0x10ff10ff); /* (226.81600 - 0.5)*32 */ mlib_d64 k_226_816 = vis_to_double(0x1c4a1c4a, 0x1c4a1c4a); mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi; mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo; mlib_d64 y_11644_hi, y_11644_lo; mlib_d64 z_11644_hi, z_11644_lo; mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo; mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi, temp_b_lo; /* loop variable */ mlib_s32 i; mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21; mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22; if (n <= 0) return (MLIB_FAILURE); /* * initialize GSR: scale field 2, align offset 7 */ vis_write_gsr((2 << 3) + 7); dp1 = (mlib_u8 *)rgb0; dp2 = (mlib_u8 *)rgb1; ddp1 = (mlib_d64 *)dp1; ddp2 = (mlib_d64 *)dp2; spy1 = (mlib_d64 *)y0; spy2 = (mlib_d64 *)y1; dfu = (mlib_f32 *)cb; dfv = (mlib_f32 *)cr; fu = vis_ld_f32_nf(dfu); dfu++; fv = vis_ld_f32_nf(dfv); dfv++; du = vis_fpmerge(fu, fu); dv = vis_fpmerge(fv, fv); /* Cb*(-0.34414) */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); /* Cr*(-0.71414) */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); /* Cb*(-0.34414) */ u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); /* Cr*(-0.71414) */ v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); dy1 = 
vis_ld_d64_nf(spy1); spy1++; dy2 = vis_ld_d64_nf(spy2); spy2++; /* Cb*1.77200 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_45984); /* Cr*1.40200 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_45984); /* Y0*1.00000 (first row) */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_226_816); /* Y0*1.00000 (first row) */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_226_816); /* Y1*1.00000 (second row) */ z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12); r_hi = vis_fpsub16(v_15966_hi, k_179_456); /* Y1*1.00000 (second row) */ z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12); r_lo = vis_fpsub16(v_15966_lo, k_179_456); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, y_11644_hi); green1 = vis_fpack16_to_hi(green1, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue1 = vis_fpack16_to_hi(blue1, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); fu = vis_ld_f32_nf(dfu); dfu++; red1 = vis_fpack16_to_hi(red1, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); fv = vis_ld_f32_nf(dfv); dfv++; green1 = vis_fpack16_to_lo(green1, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue1 = vis_fpack16_to_lo(blue1, temp_b_lo); du = vis_fpmerge(fu, fu); red1 = vis_fpack16_to_lo(red1, temp_r_lo); dv = vis_fpmerge(fv, fv); /* Cb*(-0.34414) */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); temp_g_hi = vis_fpadd16(g_hi, z_11644_hi); /* Cr*(-0.71414) */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); temp_b_hi = vis_fpadd16(b_hi, z_11644_hi); green2 = vis_fpack16_to_hi(green2, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, z_11644_hi); blue2 = vis_fpack16_to_hi(blue2, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, z_11644_lo); red2 = vis_fpack16_to_hi(red2, temp_r_hi); temp_b_lo = 
vis_fpadd16(b_lo, z_11644_lo); green2 = vis_fpack16_to_lo(green2, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, z_11644_lo); blue2 = vis_fpack16_to_lo(blue2, temp_b_lo); red2 = vis_fpack16_to_lo(red2, temp_r_lo); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); dy1 = vis_ld_d64_nf(spy1); spy1++; dy2 = vis_ld_d64_nf(spy2); spy2++; /* * main loop: 8 columns of both rows (16 pixels) per pass */ #pragma pipeloop(0) for (i = 0; i <= n - 8; i += 8) { vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(red1, green1); dd02 = vis_bshuffle(red2, green2); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(red1, green1); dd12 = vis_bshuffle(red2, green2); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(red1, green1); dd22 = vis_bshuffle(red2, green2); vis_write_bmask(0x01834967, 0); ddp1[0] = vis_bshuffle(dd01, blue1); ddp2[0] = vis_bshuffle(dd02, blue2); vis_write_bmask(0xA12B45C7, 0); ddp1[1] = vis_bshuffle(dd11, blue1); ddp2[1] = vis_bshuffle(dd12, blue2); vis_write_bmask(0x0D23E56F, 0); ddp1[2] = vis_bshuffle(dd21, blue1); ddp2[2] = vis_bshuffle(dd22, blue2); /* Cb*1.77200 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_45984); /* Cr*1.40200 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_45984); /* Y0*1.00000 (first row) */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_226_816); /* Y0*1.00000 (first row) */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_226_816); /* Y1*1.00000 (second row) */ z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12); r_hi = vis_fpsub16(v_15966_hi, k_179_456); /* Y1*1.00000 (second row) */ z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12); r_lo = vis_fpsub16(v_15966_lo, k_179_456); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, 
y_11644_hi); green1 = vis_fpack16_to_hi(green1, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue1 = vis_fpack16_to_hi(blue1, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); fu = vis_ld_f32_nf(dfu); dfu++; red1 = vis_fpack16_to_hi(red1, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); fv = vis_ld_f32_nf(dfv); dfv++; green1 = vis_fpack16_to_lo(green1, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue1 = vis_fpack16_to_lo(blue1, temp_b_lo); du = vis_fpmerge(fu, fu); red1 = vis_fpack16_to_lo(red1, temp_r_lo); dv = vis_fpmerge(fv, fv); /* Cb*(-0.34414) */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); temp_g_hi = vis_fpadd16(g_hi, z_11644_hi); /* Cr*(-0.71414) */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); temp_b_hi = vis_fpadd16(b_hi, z_11644_hi); green2 = vis_fpack16_to_hi(green2, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, z_11644_hi); blue2 = vis_fpack16_to_hi(blue2, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, z_11644_lo); red2 = vis_fpack16_to_hi(red2, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, z_11644_lo); green2 = vis_fpack16_to_lo(green2, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, z_11644_lo); blue2 = vis_fpack16_to_lo(blue2, temp_b_lo); red2 = vis_fpack16_to_lo(red2, temp_r_lo); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); dy1 = vis_ld_d64_nf(spy1); spy1++; dy2 = vis_ld_d64_nf(spy2); spy2++; ddp1 += 3; ddp2 += 3; } dp1 = (mlib_u8 *)ddp1; dp2 = (mlib_u8 *)ddp2; vis_alignaddr((void *)(n - i), 0); blue1 = vis_faligndata(blue1, blue1); green1 = vis_faligndata(green1, green1); red1 = vis_faligndata(red1, red1); dp1 += ((n - i - 1) * 3); blue2 = vis_faligndata(blue2, blue2); green2 = vis_faligndata(green2, green2); red2 = vis_faligndata(red2, red2); dp2 += ((n - i - 1) * 3); vis_alignaddr((void *)7, 0); for (; i < n; i++) { STORE_PIXEL1(0, 1, 2); STORE_PIXEL2(0, 1, 2); dp1 -= 3; dp2 -= 3; } return (MLIB_SUCCESS); }
/*
 * __mlib_VideoH263OverlappedMC_S16_U8
 *
 * Builds the 64-entry mlib_s16 block mc_block (written as 16 mlib_d64
 * through dp[0..15]) from five regions of ref_frame, each addressed by its
 * own (horizontal, vertical) offset pair:
 *   sp1  central  ref_frame + mch + mcv * ref_stride
 *   sp2  above    ref_frame + mah + mav * ref_stride
 *   sp3  left     ref_frame + mlh + mlv * ref_stride
 *   sp4  right    ref_frame + mrh + 8 + mrv * ref_stride
 *   sp5  below    ref_frame + mbh + (mbv + 8) * ref_stride
 * Rows inside every region are stepped by ref_stride2 = 2 * ref_stride.
 *
 * d0..d15 are the 16 accumulators; the pair (d[2k], d[2k+1]) belongs to
 * row k. The per-region weights are passed as the reg_H* mlib_f32
 * constants. Their bytes (0x10 .. 0x60) presumably encode the H.263
 * overlapped-MC weights scaled by 16 - confirm against the ACC* macros.
 *
 * ACCSET, ACCADD and ACCPUT are macros defined outside this view; the
 * arithmetic and the final store happen inside them. dmask, denom and
 * frnd are never referenced by a visible statement, so presumably the
 * macros use them by name - do not rename or remove them.
 *
 * Unaligned source data is extracted either with vis_alignaddr() +
 * vis_faligndata() or with vis_write_bmask() + vis_bshuffle(); each
 * section sets the state it needs before loading. The last row of each
 * section reads its trailing word with vis_ld_d64_nf().
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status __mlib_VideoH263OverlappedMC_S16_U8( mlib_s16 mc_block[64], const mlib_u8 *ref_frame, mlib_s32 mch, mlib_s32 mcv, mlib_s32 mah, mlib_s32 mav, mlib_s32 mbh, mlib_s32 mbv, mlib_s32 mlh, mlib_s32 mlv, mlib_s32 mrh, mlib_s32 mrv, mlib_s32 ref_stride) { mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9; mlib_d64 d10, d11, d12, d13, d14, d15; mlib_d64 tmp1, tmp2, tmp3; mlib_d64 dmask = vis_fexpand(vis_fones()); mlib_d64 denom = vis_fandnot(dmask, vis_fpadd16(dmask, dmask)); mlib_f32 reg_H0_00, reg_H0_01, reg_H0_10, reg_H0_20, reg_H0_21; mlib_f32 reg_H1_00, reg_H1_10, reg_H1_11, reg_H1_20, reg_H2_00; mlib_f32 reg_H2_01, reg_H2_10, reg_H2_11; mlib_f32 frnd; mlib_d64 *dp, *sd; const mlib_u8 *sp1, *sp2, *sp3, *sp4, *sp5; mlib_s32 ref_stride2 = ref_stride << 1, off; sp1 = (ref_frame + mch + mcv * ref_stride); sp2 = (ref_frame + mah + mav * ref_stride); sp3 = (ref_frame + mlh + mlv * ref_stride); sp4 = (ref_frame + mrh + 8 + mrv * ref_stride); sp5 = (ref_frame + mbh + (mbv + 8) * ref_stride); dp = (mlib_d64 *)mc_block; reg_H0_00 = vis_to_float(0x40505050); reg_H0_01 = vis_to_float(0x50505040); reg_H0_10 = vis_to_float(0x50505050); reg_H0_20 = vis_to_float(0x50506060); reg_H0_21 = vis_to_float(0x60605050); frnd = vis_to_float(0x20202020); /* * central: 8 rows from sp1 via faligndata, two words per row; ACCSET is applied to d0..d15 with the reg_H0_* weights */ sd = (mlib_d64 *)vis_alignaddr((void *)sp1, 0); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCSET(d0, tmp1, reg_H0_00); ACCSET(d1, tmp2, reg_H0_01); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCSET(d2, tmp1, reg_H0_10); ACCSET(d3, tmp2, reg_H0_10); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCSET(d4, tmp1, reg_H0_20); ACCSET(d5, tmp2, reg_H0_21); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = 
sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCSET(d6, tmp1, reg_H0_20); ACCSET(d7, tmp2, reg_H0_21); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCSET(d8, tmp1, reg_H0_20); ACCSET(d9, tmp2, reg_H0_21); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCSET(d10, tmp1, reg_H0_20); ACCSET(d11, tmp2, reg_H0_21); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCSET(d12, tmp1, reg_H0_10); ACCSET(d13, tmp2, reg_H0_10); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = vis_ld_d64_nf(sd + 2); tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCSET(d14, tmp1, reg_H0_00); ACCSET(d15, tmp2, reg_H0_01); /* * left: 8 rows from sp3 via bshuffle (bmask built from the byte offset); ACCADD is applied to the even accumulators d0, d2, ..., d14 */ reg_H2_00 = vis_to_float(0x20101010); reg_H2_01 = vis_to_float(0x10101020); reg_H2_10 = vis_to_float(0x20201010); reg_H2_11 = vis_to_float(0x10102020); off = (mlib_addr)sp3 & 7; sd = (mlib_d64 *)((mlib_u8 *)sp3 - off); vis_write_bmask(0x11111111 * off + 0x01234567, 0); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_bshuffle(tmp1, tmp2); ACCADD(d0, tmp1, reg_H2_00); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_bshuffle(tmp1, tmp2); ACCADD(d2, tmp1, reg_H2_10); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_bshuffle(tmp1, tmp2); ACCADD(d4, tmp1, reg_H2_10); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_bshuffle(tmp1, tmp2); ACCADD(d6, tmp1, reg_H2_10); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_bshuffle(tmp1, tmp2); ACCADD(d8, tmp1, reg_H2_10); sd = (mlib_d64 *)((mlib_u8 *)sd 
+ ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_bshuffle(tmp1, tmp2); ACCADD(d10, tmp1, reg_H2_10); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_bshuffle(tmp1, tmp2); ACCADD(d12, tmp1, reg_H2_10); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = vis_ld_d64_nf(sd + 1); tmp1 = vis_bshuffle(tmp1, tmp2); ACCADD(d14, tmp1, reg_H2_00); /* * right: 8 rows from sp4 via faligndata; ACCADD is applied to the odd accumulators d1, d3, ..., d15 */ sd = (mlib_d64 *)vis_alignaddr((void *)sp4, 0); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_faligndata(tmp1, tmp2); ACCADD(d1, tmp1, reg_H2_01); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_faligndata(tmp1, tmp2); ACCADD(d3, tmp1, reg_H2_11); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_faligndata(tmp1, tmp2); ACCADD(d5, tmp1, reg_H2_11); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_faligndata(tmp1, tmp2); ACCADD(d7, tmp1, reg_H2_11); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_faligndata(tmp1, tmp2); ACCADD(d9, tmp1, reg_H2_11); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_faligndata(tmp1, tmp2); ACCADD(d11, tmp1, reg_H2_11); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp1 = vis_faligndata(tmp1, tmp2); ACCADD(d13, tmp1, reg_H2_11); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = vis_ld_d64_nf(sd + 1); tmp1 = vis_faligndata(tmp1, tmp2); ACCADD(d15, tmp1, reg_H2_01); /* * above: 4 rows from sp2 via bshuffle; ACCPUT takes d0..d7 and targets dp[0..7] */ reg_H1_10 = vis_to_float(0x10102020); reg_H1_11 = vis_to_float(0x20201010); reg_H1_20 = vis_to_float(0x10101010); off = (mlib_addr)sp2 & 7; sd = (mlib_d64 *)((mlib_u8 *)sp2 - off); vis_write_bmask(0x11111111 * off + 0x01234567, 0); reg_H1_00 = vis_to_float(0x20202020); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_bshuffle(tmp1, tmp2); tmp2 = vis_bshuffle(tmp2, tmp3); ACCPUT(dp[0], d0, tmp1, reg_H1_00); ACCPUT(dp[1], d1, 
tmp2, reg_H1_00); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_bshuffle(tmp1, tmp2); tmp2 = vis_bshuffle(tmp2, tmp3); ACCPUT(dp[2], d2, tmp1, reg_H1_10); ACCPUT(dp[3], d3, tmp2, reg_H1_11); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_bshuffle(tmp1, tmp2); tmp2 = vis_bshuffle(tmp2, tmp3); ACCPUT(dp[4], d4, tmp1, reg_H1_20); ACCPUT(dp[5], d5, tmp2, reg_H1_20); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = vis_ld_d64_nf(sd + 2); tmp1 = vis_bshuffle(tmp1, tmp2); tmp2 = vis_bshuffle(tmp2, tmp3); ACCPUT(dp[6], d6, tmp1, reg_H1_20); ACCPUT(dp[7], d7, tmp2, reg_H1_20); /* * below: 4 rows from sp5 via faligndata; ACCPUT takes d8..d15 and targets dp[8..15] */ sd = (mlib_d64 *)vis_alignaddr((void *)sp5, 0); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCPUT(dp[8], d8, tmp1, reg_H1_20); ACCPUT(dp[9], d9, tmp2, reg_H1_20); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCPUT(dp[10], d10, tmp1, reg_H1_20); ACCPUT(dp[11], d11, tmp2, reg_H1_20); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = sd[2]; tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCPUT(dp[12], d12, tmp1, reg_H1_10); ACCPUT(dp[13], d13, tmp2, reg_H1_11); sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2); tmp1 = sd[0]; tmp2 = sd[1]; tmp3 = vis_ld_d64_nf(sd + 2); tmp1 = vis_faligndata(tmp1, tmp2); tmp2 = vis_faligndata(tmp2, tmp3); ACCPUT(dp[14], d14, tmp1, reg_H1_00); ACCPUT(dp[15], d15, tmp2, reg_H1_00); return (MLIB_SUCCESS); }
/*
 * Convert subsampled Y/U/V samples plus a per-pixel alpha array into packed
 * 32-bit pixels.
 *
 * Every full group consumes 16 Y bytes (two mlib_d64), 4 U bytes and 4 V
 * bytes (one mlib_f32 each) and 16 alpha bytes, and produces 16 four-byte
 * pixels (eight mlib_d64).  The merge sequence lays each pixel out as
 * alpha, "db", "dg", "dr".
 *
 *   abgr     destination; written with 8-byte stores when 8-byte aligned,
 *            otherwise with 4-byte stores
 *   y        luma, read with 8-byte loads
 *   u, v     chroma, read with 4-byte loads
 *   a_array  alpha bytes; may be unaligned (handled through
 *            vis_alignaddr / vis_faligndata)
 *   count    number of full 16-pixel groups
 *   left     number of 32-bit pixels to emit from one extra, partial group
 *   isrgb    non-zero swaps the U and V coefficient sets and the first and
 *            third offsets, so the "db" and "dr" slots exchange roles
 *            (presumably giving ARGB instead of ABGR output)
 *
 * NOTE(review): vis_fpack16_pair depends on the GSR scale factor, which is
 * not set in this function; it is presumably set by the caller - confirm.
 */
static void
mlib_v_VideoYUV2ABGR_aarray_411(
    mlib_u32 *abgr,
    const mlib_d64 *y,
    const mlib_f32 *u,
    const mlib_f32 *v,
    const mlib_d64 *a_array,
    mlib_s32 count,
    mlib_s32 left,
    mlib_s32 isrgb)
{
    /* destination viewed as 8-byte words */
    mlib_d64 *dpp = (mlib_d64 *)abgr;

    /* u, v data for the current group */
    mlib_f32 fu, fv;

    /* y data for the current group (2 x 8 samples) */
    mlib_d64 dy1, dy2;
    /* scaled luma, four 16-bit lanes each */
    mlib_d64 ddy1, ddy2, ddy3, ddy4;
    /* chroma contributions */
    mlib_d64 du0, du1;
    mlib_d64 dv1, dv2;
    mlib_d64 dr, dr1, dr2, dr3, dr4;
    mlib_d64 dg, dg1, dg2, dg3, dg4;
    mlib_d64 db, db1, db2, db3, db4;
    /* alpha: dpa walks the aligned words, da2..da4 form a sliding window */
    mlib_d64 *dpa, da0, da1, da2, da3, da4;
    mlib_d64 dtmp;

    /* 1.1644 * 4096 */
    mlib_f32 f0 = vis_to_float(0x12a1);

    /* 2.0184 * 8192 */
    mlib_f32 f1 = vis_to_float(0x4097);

    /* -0.3920 * 8192 */
    mlib_f32 f4 = vis_to_float(0xf375);

    /* -0.8132 * 8192 */
    mlib_f32 f5 = vis_to_float(0xe5fa);

    /* 1.5966 * 8192 */
    mlib_f32 f8 = vis_to_float(0x3317);

    /* -276.9856 * 32 */
    mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

    /* 135.6352 * 32 */
    mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

    /* -222.9952 * 32 */
    mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
    /*
     * NOTE(review): used with vis_fmul8x16au/al to spread one 16-bit lane
     * of a chroma term over four lanes - confirm against the VIS manual.
     */
    mlib_f32 fscale = vis_to_float(0x80808080);

    /* loop variable */
    mlib_s32 i;

    if (isrgb) {
        /*
         * Same constants as above with the U and V coefficients exchanged
         * and doff0/doff2 exchanged (f0 is unchanged).
         */
        f0 = vis_to_float(0x12a1);
        f1 = vis_to_float(0x3317);
        f4 = vis_to_float(0xe5fa);
        f5 = vis_to_float(0xf375);
        f8 = vis_to_float(0x4097);
        doff0 = vis_to_double_dup(0xe420e420);
        doff1 = vis_to_double_dup(0x10f410f4);
        doff2 = vis_to_double_dup(0xdd60dd60);
    }

    /* align the alpha pointer; later vis_faligndata calls rely on this */
    dpa = vis_alignaddr((void *)a_array, 0);

    /*
     * Prime the pipeline with the inputs of the first group.  After this
     * point y, u, v and dpa already point at the data of the NEXT group,
     * which is why the loops below index them with plain "i".
     */
    dy1 = (*y++);
    dy2 = vis_ld_d64_nf((mlib_d64 *)y);
    y++;
    fu = (*u++);
    fv = (*v++);
    da2 = (*dpa++);
    da3 = vis_ld_d64_nf(dpa);
    dpa++;
    da4 = vis_ld_d64_nf(dpa);
    dpa++;
    du0 = vis_fmul8x16al(fu, f1);
    du1 = vis_fmul8x16al(fu, f4);
    dv1 = vis_fmul8x16al(fv, f5);
    dv2 = vis_fmul8x16al(fv, f8);

    if (!((mlib_addr)abgr & 7)) {
        /* destination is 8-byte aligned: store whole mlib_d64 words */
#pragma pipeloop(0)
        for (i = 0; i < count; i++) {
            /* 16 alpha bytes for this group */
            da0 = vis_faligndata(da2, da3);
            da1 = vis_faligndata(da3, da4);

            /* scale the 16 luma samples, four at a time */
            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
            ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
            ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

            /* per-chroma-sample colour terms (four lanes each) */
            db = vis_fpadd16(du0, doff0);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);
            dr = vis_fpadd16(dv2, doff2);

            /* combine each chroma lane with one group of four luma lanes */
            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);
            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);
            db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
            db3 = vis_fpadd16(ddy3, db3);
            db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
            db4 = vis_fpadd16(ddy4, db4);

            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);
            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);
            dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
            dg3 = vis_fpadd16(ddy3, dg3);
            dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
            dg4 = vis_fpadd16(ddy4, dg4);

            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);
            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);
            dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
            dr3 = vis_fpadd16(ddy3, dr3);
            dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
            dr4 = vis_fpadd16(ddy4, dr4);

            /* pack to bytes: dr/dg/db = pixels 0..7, dr1/dg1/db1 = 8..15 */
            dr = vis_fpack16_pair(dr1, dr2);
            dr1 = vis_fpack16_pair(dr3, dr4);

            dg = vis_fpack16_pair(dg1, dg2);
            dg1 = vis_fpack16_pair(dg3, dg4);

            db = vis_fpack16_pair(db1, db2);
            db1 = vis_fpack16_pair(db3, db4);

            /* interleave: (a,g) and (b,r) pairs, merged again below */
            dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
            dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

            /*
             * Fetch the inputs of the next group early (non-faulting
             * loads, since the last pass reads beyond the used data).
             */
            dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
            dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
            fu = vis_ld_f32_nf((mlib_f32 *)u + i);
            fv = vis_ld_f32_nf((mlib_f32 *)v + i);
            da2 = da4;
            da3 = vis_ld_d64_nf(dpa + 2 * i);
            da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

            /* pixels 0..3 */
            dpp[8 * i] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dpp[8 * i + 1] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

            /* pixels 4..7 */
            dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
            dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
            dpp[8 * i + 2] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dpp[8 * i + 3] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

            /* pixels 8..11 */
            dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
            dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
            dpp[8 * i + 4] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dpp[8 * i + 5] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

            /* pixels 12..15 */
            dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
            dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
            dpp[8 * i + 6] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            dpp[8 * i + 7] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

            /* chroma products for the next group */
            du0 = vis_fmul8x16al(fu, f1);
            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dv2 = vis_fmul8x16al(fv, f8);
        }
    } else {
        /*
         * Destination is not 8-byte aligned: identical arithmetic, but each
         * result word is written as two 4-byte halves.
         */
        mlib_d64 dd;

#pragma pipeloop(0)
        for (i = 0; i < count; i++) {
            da0 = vis_faligndata(da2, da3);
            da1 = vis_faligndata(da3, da4);

            ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
            ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
            ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
            ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

            db = vis_fpadd16(du0, doff0);
            dtmp = vis_fpadd16(du1, dv1);
            dg = vis_fpadd16(dtmp, doff1);
            dr = vis_fpadd16(dv2, doff2);

            db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
            db1 = vis_fpadd16(ddy1, db1);
            db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
            db2 = vis_fpadd16(ddy2, db2);
            db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
            db3 = vis_fpadd16(ddy3, db3);
            db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
            db4 = vis_fpadd16(ddy4, db4);

            dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
            dg1 = vis_fpadd16(ddy1, dg1);
            dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
            dg2 = vis_fpadd16(ddy2, dg2);
            dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
            dg3 = vis_fpadd16(ddy3, dg3);
            dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
            dg4 = vis_fpadd16(ddy4, dg4);

            dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
            dr1 = vis_fpadd16(ddy1, dr1);
            dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
            dr2 = vis_fpadd16(ddy2, dr2);
            dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
            dr3 = vis_fpadd16(ddy3, dr3);
            dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
            dr4 = vis_fpadd16(ddy4, dr4);

            dr = vis_fpack16_pair(dr1, dr2);
            dr1 = vis_fpack16_pair(dr3, dr4);

            dg = vis_fpack16_pair(dg1, dg2);
            dg1 = vis_fpack16_pair(dg3, dg4);

            db = vis_fpack16_pair(db1, db2);
            db1 = vis_fpack16_pair(db3, db4);

            dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
            dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

            /* early loads for the next group */
            dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
            dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
            fu = vis_ld_f32_nf((mlib_f32 *)u + i);
            fv = vis_ld_f32_nf((mlib_f32 *)v + i);
            da2 = da4;
            da3 = vis_ld_d64_nf(dpa + 2 * i);
            da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

            /* pixels 0..3 */
            dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            ((mlib_f32 *)dpp)[16 * i] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 1] = vis_read_lo(dd);
            dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            ((mlib_f32 *)dpp)[16 * i + 2] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 3] = vis_read_lo(dd);

            /* pixels 4..7 */
            dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
            dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
            dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            ((mlib_f32 *)dpp)[16 * i + 4] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 5] = vis_read_lo(dd);
            dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            ((mlib_f32 *)dpp)[16 * i + 6] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 7] = vis_read_lo(dd);

            /* pixels 8..11 */
            dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
            dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
            dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            ((mlib_f32 *)dpp)[16 * i + 8] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 9] = vis_read_lo(dd);
            dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            ((mlib_f32 *)dpp)[16 * i + 10] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 11] = vis_read_lo(dd);

            /* pixels 12..15 */
            dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
            dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
            dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
            ((mlib_f32 *)dpp)[16 * i + 12] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 13] = vis_read_lo(dd);
            dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
            ((mlib_f32 *)dpp)[16 * i + 14] = vis_read_hi(dd);
            ((mlib_f32 *)dpp)[16 * i + 15] = vis_read_lo(dd);

            /* chroma products for the next group */
            du0 = vis_fmul8x16al(fu, f1);
            du1 = vis_fmul8x16al(fu, f4);
            dv1 = vis_fmul8x16al(fv, f5);
            dv2 = vis_fmul8x16al(fv, f8);
        }
    }

    if (left) {
        /*
         * Tail: convert one more full group into a local buffer, then copy
         * only the first "left" pixels so nothing is written past the end
         * of the destination.
         */
        mlib_d64 res_buf[8];

        da0 = vis_faligndata(da2, da3);
        da1 = vis_faligndata(da3, da4);

        ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
        ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
        ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
        ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

        db = vis_fpadd16(du0, doff0);
        dtmp = vis_fpadd16(du1, dv1);
        dg = vis_fpadd16(dtmp, doff1);
        dr = vis_fpadd16(dv2, doff2);

        db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
        db1 = vis_fpadd16(ddy1, db1);
        db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
        db2 = vis_fpadd16(ddy2, db2);
        db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
        db3 = vis_fpadd16(ddy3, db3);
        db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
        db4 = vis_fpadd16(ddy4, db4);

        dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
        dg1 = vis_fpadd16(ddy1, dg1);
        dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
        dg2 = vis_fpadd16(ddy2, dg2);
        dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
        dg3 = vis_fpadd16(ddy3, dg3);
        dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
        dg4 = vis_fpadd16(ddy4, dg4);

        dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
        dr1 = vis_fpadd16(ddy1, dr1);
        dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
        dr2 = vis_fpadd16(ddy2, dr2);
        dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
        dr3 = vis_fpadd16(ddy3, dr3);
        dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
        dr4 = vis_fpadd16(ddy4, dr4);

        dr = vis_fpack16_pair(dr1, dr2);
        dr1 = vis_fpack16_pair(dr3, dr4);

        dg = vis_fpack16_pair(dg1, dg2);
        dg1 = vis_fpack16_pair(dg3, dg4);

        db = vis_fpack16_pair(db1, db2);
        db1 = vis_fpack16_pair(db3, db4);

        dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
        dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));
        res_buf[0] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
        res_buf[1] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

        dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
        dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
        res_buf[2] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
        res_buf[3] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

        dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
        dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
        res_buf[4] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
        res_buf[5] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

        dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
        dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
        res_buf[6] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
        res_buf[7] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

        /* one mlib_f32 == one 4-byte pixel */
        for (i = 0; i < left; i++)
            ((mlib_f32 *)dpp)[16 * count + i] = ((mlib_f32 *)res_buf)[i];
    }
}
/*
 * 8x8 inverse DCT on 16-bit data (coeffs -> block), done as two passes of
 * the IDCT macro with 4x4 transposes in between.
 *
 *   block   output, written as sixteen mlib_d64 stores (so it must be
 *           8-byte aligned)
 *   coeffs  input, accessed through dPtr by the LOAD_DATA_* macros
 *
 * Always returns MLIB_SUCCESS.
 *
 * The helper macros LOAD_DATA_AA1, LOAD_DATA_AA2, TRANSPOSE_VIS2 and IDCT
 * are defined elsewhere and operate on the locals declared here by name:
 *   - dx0..dx8 are read after each IDCT() with no visible assignment, so
 *     IDCT() produces them;
 *   - t0, t1, COS_* and dPtr are never referenced directly in this body and
 *     are presumably consumed inside the macros;
 *   - p00..p31 / p40..p71 are presumably filled by LOAD_DATA_AA1 /
 *     LOAD_DATA_AA2 respectively - confirm against the macro definitions.
 * Do not rename or reorder these locals without checking the macros.
 */
mlib_status
__mlib_VideoIDCT8x8_S16_S16_B12(
    mlib_s16 *block,
    const mlib_s16 *coeffs)
{
    mlib_d64 *dPtr = (mlib_d64 *)coeffs;
    mlib_d64 *outPtr = (mlib_d64 *)block;

    /* results of the IDCT macro */
    mlib_d64 dx0, dx1, dx2, dx3, dx4, dx6, dx7, dx8;
    /* pXY: working copy of the data, one mlib_d64 (four values) each */
    mlib_d64 p00, p10, p20, p30, p40, p50, p60, p70, p01, p11, p21, p31, p41, p51, p61, p71;
    mlib_d64 t0, t1;
    /* inputs of the IDCT macro (outputs of TRANSPOSE_VIS2) */
    mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7;

    mlib_f32 COS_1_16;
    mlib_f32 COS_2_16;
    mlib_f32 COS_6_16;
    mlib_f32 COS_7_16;
    mlib_f32 COS_4_16;
    mlib_f32 C_1_4;

    /* First pass */

    /* byte-shuffle mask; presumably used by TRANSPOSE_VIS2 - confirm */
    vis_write_bmask(0x018923ab, 0x0);

    LOAD_DATA_AA1

    /* constants come from the shared table mlib_cTable */
    COS_1_16 = ((mlib_f32 *)mlib_cTable)[0];
    COS_2_16 = ((mlib_f32 *)mlib_cTable)[1];
    COS_6_16 = ((mlib_f32 *)mlib_cTable)[2];
    COS_7_16 = ((mlib_f32 *)mlib_cTable)[3];
    COS_4_16 = ((mlib_f32 *)mlib_cTable)[4];
    C_1_4 = ((mlib_f32 *)mlib_cTable)[5];

    TRANSPOSE_VIS2(p00, p10, p20, p30, d0, d1, d2, d3)
    TRANSPOSE_VIS2(p01, p11, p21, p31, d4, d5, d6, d7)
    LOAD_DATA_AA2

    IDCT(d0, d1, d2, d3, d4, d5, d6, d7)

    /*
     * The transpose feeding the NEXT IDCT() is issued before the results
     * of the current one are combined.
     */
    TRANSPOSE_VIS2(p40, p50, p60, p70, d0, d1, d2, d3)

    /* final butterfly of the first half: sums, then mirrored differences */
    p00 = vis_fpadd16(dx7, dx1);
    p10 = vis_fpadd16(dx3, dx2);
    p20 = vis_fpadd16(dx0, dx4);
    p30 = vis_fpadd16(dx8, dx6);
    p01 = vis_fpsub16(dx8, dx6);
    p11 = vis_fpsub16(dx0, dx4);
    p21 = vis_fpsub16(dx3, dx2);
    p31 = vis_fpsub16(dx7, dx1);

    TRANSPOSE_VIS2(p41, p51, p61, p71, d4, d5, d6, d7)

    IDCT(d0, d1, d2, d3, d4, d5, d6, d7)

    TRANSPOSE_VIS2(p00, p10, p20, p30, d0, d1, d2, d3)

    /* final butterfly of the second half */
    p40 = vis_fpadd16(dx7, dx1);
    p50 = vis_fpadd16(dx3, dx2);
    p60 = vis_fpadd16(dx0, dx4);
    p70 = vis_fpadd16(dx8, dx6);
    p41 = vis_fpsub16(dx8, dx6);
    p51 = vis_fpsub16(dx0, dx4);
    p61 = vis_fpsub16(dx3, dx2);
    p71 = vis_fpsub16(dx7, dx1);

    /* Second pass */

    TRANSPOSE_VIS2(p40, p50, p60, p70, d4, d5, d6, d7)

    IDCT(d0, d1, d2, d3, d4, d5, d6, d7)

    TRANSPOSE_VIS2(p01, p11, p21, p31, d0, d1, d2, d3)

    /*
     * Scale by C_1_4 and store.  Even outPtr indices are written here, odd
     * indices after the last IDCT(); with two mlib_d64 per 8-value row this
     * corresponds to the first and second half of each output row.
     */
    outPtr[0] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
    outPtr[2] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
    outPtr[4] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
    outPtr[6] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
    outPtr[8] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
    outPtr[10] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
    outPtr[12] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
    outPtr[14] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

    TRANSPOSE_VIS2(p41, p51, p61, p71, d4, d5, d6, d7)

    IDCT(d0, d1, d2, d3, d4, d5, d6, d7)

    outPtr[1] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
    outPtr[3] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
    outPtr[5] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
    outPtr[7] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
    outPtr[9] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
    outPtr[11] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
    outPtr[13] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
    outPtr[15] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

    return (MLIB_SUCCESS);
}
/*
 * Horizontal two-tap interpolation of an 8-bit reference block into a
 * 16-bit block: every output value is built from the reference pixel and
 * its right-hand neighbour (ref_block and ref_block + 1).
 *
 *   mc_block      output, written as mlib_d64 words (2 per row for
 *                 width 8, 4 per row otherwise)
 *   ref_block     reference pixels; any byte alignment
 *   width         8 selects the 8-wide path, anything else the 16-wide one
 *   height        number of rows
 *   frame_stride  not used by this routine
 *   field_stride  byte distance between consecutive reference rows
 *
 * Always returns MLIB_SUCCESS.
 *
 * The work is split into two sweeps because vis_faligndata depends on the
 * alignment set by the preceding vis_alignaddr: the first sweep extracts
 * the rows starting at ref_block into a local cache, the second re-aligns
 * for ref_block + 1 and combines both.
 *
 * NOTE(review): the cache holds MAXH * MAXW words and height is not checked
 * against it here - presumably guaranteed by the caller.
 */
mlib_status
__mlib_VideoInterpX_S16_U8(
    mlib_s16 *mc_block,
    const mlib_u8 *ref_block,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 frame_stride,
    mlib_s32 field_stride)
{
    /* rows aligned to ref_block, produced by the first sweep */
    mlib_d64 left_rows[MAXH * MAXW];
    mlib_d64 *out = (mlib_d64 *)mc_block;
    mlib_d64 *src;
    /* raw aligned words of the current reference row */
    mlib_d64 w0, w1, w2;
    /* current row as seen from ref_block / ref_block + 1 */
    mlib_d64 left, right0, right1;
    /* 16-bit lane sums of the upper / lower four pixels */
    mlib_d64 sum_hi, sum_lo;
    /* scaling constants shared with the other interpolation kernels */
    mlib_f32 strunc = vis_read_hi(*(mlib_d64 *)mlib_IX16const);
    mlib_f32 fexpd = vis_read_lo(*(mlib_d64 *)mlib_IX16const);
    mlib_s32 row;

    src = (mlib_d64 *)vis_alignaddr((void *)ref_block, 0);

    if (width == 8) {
        /* sweep 1: rows starting at ref_block */
#pragma pipeloop(0)
        for (row = 0; row < height; row++) {
            w0 = src[0];
            w1 = src[1];
            left_rows[row] = vis_faligndata(w0, w1);
            src = (mlib_d64 *)((mlib_u8 *)src + field_stride);
        }

        /* sweep 2: rows starting at ref_block + 1, combined with sweep 1 */
        src = (mlib_d64 *)vis_alignaddr((void *)(ref_block + 1), 0);

#pragma pipeloop(0)
        for (row = 0; row < height; row++) {
            w0 = src[0];
            /* the trailing word may lie past the data: non-faulting load */
            w1 = vis_ld_d64_nf(src + 1);
            right0 = vis_faligndata(w0, w1);
            left = left_rows[row];

            sum_hi = vis_fpadd16(vis_fexpand(vis_read_hi(left)),
                vis_fexpand(vis_read_hi(right0)));
            sum_lo = vis_fpadd16(vis_fmul8x16al(vis_read_lo(left), fexpd),
                vis_fmul8x16al(vis_read_lo(right0), fexpd));

            out[0] = vis_fmul8x16(strunc, sum_hi);
            out[1] = vis_fmul8x16(strunc, sum_lo);

            src = (mlib_d64 *)((mlib_u8 *)src + field_stride);
            out += 2;
        }
    } else {
        /* width == 16: two cached words per row */
#pragma pipeloop(0)
        for (row = 0; row < height; row++) {
            w0 = src[0];
            w1 = src[1];
            w2 = src[2];
            left_rows[2 * row] = vis_faligndata(w0, w1);
            left_rows[2 * row + 1] = vis_faligndata(w1, w2);
            src = (mlib_d64 *)((mlib_u8 *)src + field_stride);
        }

        src = (mlib_d64 *)vis_alignaddr((void *)(ref_block + 1), 0);

#pragma pipeloop(0)
        for (row = 0; row < height; row++) {
            w0 = src[0];
            w1 = src[1];
            w2 = vis_ld_d64_nf(src + 2);
            right0 = vis_faligndata(w0, w1);
            right1 = vis_faligndata(w1, w2);

            /* pixels 0..7 of the row */
            left = left_rows[2 * row];
            sum_hi = vis_fpadd16(vis_fexpand(vis_read_hi(left)),
                vis_fexpand(vis_read_hi(right0)));
            sum_lo = vis_fpadd16(vis_fmul8x16al(vis_read_lo(left), fexpd),
                vis_fmul8x16al(vis_read_lo(right0), fexpd));
            out[0] = vis_fmul8x16(strunc, sum_hi);
            out[1] = vis_fmul8x16(strunc, sum_lo);

            /* pixels 8..15 of the row */
            left = left_rows[2 * row + 1];
            sum_hi = vis_fpadd16(vis_fexpand(vis_read_hi(left)),
                vis_fexpand(vis_read_hi(right1)));
            sum_lo = vis_fpadd16(vis_fmul8x16al(vis_read_lo(left), fexpd),
                vis_fmul8x16al(vis_read_lo(right1), fexpd));
            out[2] = vis_fmul8x16(strunc, sum_hi);
            out[3] = vis_fmul8x16(strunc, sum_lo);

            src = (mlib_d64 *)((mlib_u8 *)src + field_stride);
            out += 4;
        }
    }

    return (MLIB_SUCCESS);
}
/*
 * Convert packed 4-byte ARGB pixels to planar Y / Cb / Cr with the chroma
 * planes at half the horizontal resolution (one Cb and one Cr byte per two
 * pixels).
 *
 *   y     luma output, n bytes; written with 8-byte stores
 *   cb    Cb output, n / 2 bytes; written with 4-byte stores
 *   cr    Cr output, n / 2 bytes; written with 4-byte stores
 *   argb  input pixels; read with 8-byte loads
 *   n     number of pixels
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The main loop handles eight pixels per iteration; a remaining partial
 * group is computed in full and written with edge-masked partial stores so
 * that no byte beyond yend / cbend is modified.
 *
 * CHANNELSEPARATE_U8_422 and CHANNELWEIGHT_U8_2p are macros defined
 * elsewhere.  sd04, sd26, sd15 and sd37 are not referenced directly in this
 * body and are presumably scratch registers of CHANNELSEPARATE_U8_422 -
 * confirm before removing them.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
    mlib_u8 *y,
    mlib_u8 *cb,
    mlib_u8 *cr,
    const mlib_u8 *argb,
    mlib_s32 n)
{
    mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
    mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
    /* one past the last valid output byte of the Y and Cb planes */
    mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
    mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
    mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
    mlib_s32 i;

    /* luma weights (K1x, scaled by 8192) */
    mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
    mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
    mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
    /* Cb weights (K2x, scaled by 4096) */
    mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
    mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
    mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
    /* Cr weights (K3x, scaled by 4096) */
    mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
    mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
    mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
    /* additive offsets passed to CHANNELWEIGHT_U8_2p */
    mlib_d64 off128 = vis_to_double_dup(0x10101010);
    mlib_d64 off0 = vis_to_double_dup(0x00100010);

    if (n <= 0)
        return (MLIB_FAILURE);

    /* GSR scale factor 2 for the vis_fpack16 calls below */
    vis_write_gsr(2 << 3);

    /* from here on n counts full groups of eight pixels */
    n = n >> 3;

#pragma pipeloop(0)
    for (i = 0; i < n; i++) {
        /* eight pixels = 32 input bytes */
        sd01 = (*sp++);
        sd23 = (*sp++);
        sd45 = (*sp++);
        sd67 = (*sp++);

        CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1);

        /* Y: two groups of four values, interleaved back into pixel order */
        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
        z1 = vis_fpadd16(z1, off0);
        py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

        /* Cb: the two partial results are summed into four values */
        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
        pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

        /* Cr: same scheme with the K3x weights */
        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
        pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

        py++;
        pcb++;
        pcr++;
    }

    if ((mlib_u8 *)pcb < cbend) {
        /* tail: fewer than eight pixels remain */
        mlib_d64 yd;
        mlib_f32 cbf, crf;
        mlib_s32 ymask, cmask;

        /* only the first word is known to exist: the rest is non-faulting */
        sd01 = (*sp++);
        sd23 = vis_ld_d64_nf(sp);
        sp++;
        sd45 = vis_ld_d64_nf(sp);
        sp++;
        sd67 = vis_ld_d64_nf(sp);

        CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1);

        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
        z1 = vis_fpadd16(z1, off0);
        yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
        cbf = vis_fpack16(vis_fpadd16(z0, z1));

        CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
        crf = vis_fpack16(vis_fpadd16(z0, z1));

        /* write only the Y bytes up to yend - 1 */
        ymask = vis_edge8(py, yend - 1);
        vis_pst_8(yd, py, ymask);

        /*
         * Chroma: the partial store works on 8-byte words while pcb/pcr
         * advance in 4-byte steps.  If the mask covers the upper half, the
         * result goes into the upper half of the stored word; otherwise the
         * pointer is moved back one mlib_f32 and the result goes into the
         * lower half.
         * NOTE(review): the same cmask is reused for pcr, so cb and cr
         * presumably must share the same alignment modulo 8 - confirm.
         */
        cmask = vis_edge8(pcb, cbend - 1);

        if (cmask & 0xf0) {
            vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask);
            vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask);
        } else {
            vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1, cmask);
            vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1, cmask);
        }
    }

    return (MLIB_SUCCESS);
}
mlib_u8 *dst, const mlib_u8 *src, mlib_s32 width, mlib_s32 height, mlib_s32 dst_stride, mlib_s32 src_stride) { mlib_s32 x, y, x4 = width >> 2; mlib_d64 *sl1, *sl2, s1hi, s1lo, s2hi, s2lo, s1, s2; mlib_d64 done = vis_to_double_dup(0x1000100); mlib_d64 dmask; mlib_f32 *dp; mlib_f32 frnd = vis_to_float(0x40404040); mlib_s32 src_stride2 = 2 * src_stride; dmask = vis_fpadd16(done, vis_fone()); vis_write_gsr(7 << 3); sl1 = (mlib_d64 *)src; sl2 = (mlib_d64 *)(src + src_stride); dp = (mlib_f32 *)dst; for (y = 0; y < height; y++) { #pragma pipeloop(0) for (x = 0; x < x4; x++) { s1 = sl1[x]; s2 = sl2[x]; s1lo = vis_fand(s1, dmask); s1hi = vis_fmul8sux16(s1, done); s2lo = vis_fand(s2, dmask); s2hi = vis_fmul8sux16(s2, done); s1lo = vis_fpadd16(s1lo, s2lo);
mlib_status __mlib_VideoColorJFIFYCC2RGB444( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *cb, const mlib_u8 *cr, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd; mlib_f32 fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f4fd); mlib_d64 k02 = vis_to_double_dup(0x2cdde926); mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4); mlib_d64 k12 = vis_to_double_dup(0xe9260000); mlib_d64 k21 = vis_to_double_dup(0x38b40000); mlib_d64 k22 = vis_to_double_dup(0x00002cdd); mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff); mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6); mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1); mlib_d64 k_0 = vis_to_double_dup(0x20002000); if (size <= 0) return (MLIB_FAILURE); vis_write_gsr((2 << 3) + 2); vis_write_bmask(0x0489AB37, 0); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = (n - 1) >> 2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)cb; sf2 = (mlib_f32 *)cr; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) #pragma unroll(4) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_d64 d_0235, d_0145; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpack16_pair(s00, s10); s20 = vis_freg_pair(vis_fpack16(s20), fzero); d_0145 = vis_bshuffle(d_0235, s20); d_0235 = vis_fpack32(d_0235, d_0235); d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235)); pfd[0] = vis_read_hi(d_0145); pfd[1] = 
vis_read_hi(d_0235); pfd[2] = vis_read_lo(d_0145); pfd += 3; } /* * last pixels */ if ((mlib_u8 *)pfd <= dend) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_d64 d_0235, d_xx14, d_0145; mlib_f32 x0, x1, x2; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; x0 = *sf0; x1 = *sf1; x2 = *sf2; s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpack16_pair(s00, s10); d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero); d_0145 = vis_bshuffle(d_0235, d_xx14); d_0235 = vis_fpack32(d_0235, d_0235); d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235)); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_0235); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; cb += n; cr += n; rgb += 3 * n; size -= n; } while (size); return (MLIB_SUCCESS); }
/*
 * Forward 16x16 DCT on 16-bit data (block -> coeffs), computed as two 1-D
 * passes of six butterfly stages each.
 *
 *   coeffs  output, written through MLIB_XTRANSPOSE16_4x4 as mlib_d64 words
 *   block   input, read as mlib_d64 words (four 16-bit values each)
 *
 * Always returns MLIB_SUCCESS.
 *
 * Each pass iterates j = 0..3; index j + 4 * k selects word j of row k
 * (16 values per row = 4 words), so one iteration processes four adjacent
 * columns of all 16 rows in parallel.  The first pass stores its transposed
 * results in val_m, which the second pass reads with the same indexing.
 *
 * FCOS_4, FSIN_8, FCOS_8, SCALE8, MLIB_PTRANSPOSE16_4x4 and
 * MLIB_XTRANSPOSE16_4x4 are macros defined elsewhere.  p0..p3 are assigned
 * right before every MLIB_PTRANSPOSE16_4x4 call and are presumably its
 * implicit inputs; m02, m13, m0213 and fscale are not referenced directly
 * in this body and are presumably used inside the macros - confirm before
 * renaming or removing any of them.
 */
mlib_status
__mlib_VideoDCT16x16_S16_S16(
    mlib_s16 *coeffs,
    const mlib_s16 *block)
{
    mlib_s32 j;
    /* intermediate result of the first pass (16 rows x 4 words) */
    mlib_d64 val_m[16 * 4];
    mlib_d64 b0, b1, b2, b3, b4, b5, b6, b7, b8, b9;
    mlib_d64 b10, b11, b12, b13, b14, b15;
    mlib_d64 t0, t1, t2, t3, t4, t5, t6, t7, t9;
    mlib_d64 t10, t11, t12, t13, t14;
    mlib_d64 m02, m13, m0213, p0, p1, p2, p3;
    mlib_d64 c1, c2, c3, c4;

    /* multipliers taken from the shared table mlib_dct16vtab */
    mlib_f32 COS_4 = ((mlib_f32 *)mlib_dct16vtab)[0];
    mlib_f32 SIN_8 = ((mlib_f32 *)mlib_dct16vtab)[1];
    mlib_f32 COS_8 = ((mlib_f32 *)mlib_dct16vtab)[2];
    mlib_f32 SIN_16 = ((mlib_f32 *)mlib_dct16vtab)[3];
    mlib_f32 COS_16 = ((mlib_f32 *)mlib_dct16vtab)[4];
    mlib_f32 COS_3_16 = ((mlib_f32 *)mlib_dct16vtab)[5];
    mlib_f32 SIN_3_16 = ((mlib_f32 *)mlib_dct16vtab)[6];
    mlib_f32 SIN_32 = ((mlib_f32 *)mlib_dct16vtab)[7];
    mlib_f32 COS_32 = ((mlib_f32 *)mlib_dct16vtab)[8];
    mlib_f32 COS_3_32 = ((mlib_f32 *)mlib_dct16vtab)[9];
    mlib_f32 SIN_3_32 = ((mlib_f32 *)mlib_dct16vtab)[10];
    mlib_f32 COS_5_32 = ((mlib_f32 *)mlib_dct16vtab)[11];
    mlib_f32 SIN_5_32 = ((mlib_f32 *)mlib_dct16vtab)[12];
    mlib_f32 COS_7_32 = ((mlib_f32 *)mlib_dct16vtab)[13];
    mlib_f32 SIN_7_32 = ((mlib_f32 *)mlib_dct16vtab)[14];
    mlib_f32 fscale = ((mlib_f32 *)mlib_dct16vtab)[15];

    mlib_d64 *bptr = (mlib_d64 *)block;
    mlib_d64 *coeffs64 = (mlib_d64 *)coeffs;

    /*
     * first column based 1-D 16x16 DCT
     */

#pragma pipeloop(0)
    for (j = 0; j < 4; j++) {
        /*
         * first butterfly: sum and difference of mirrored rows k / 15 - k
         */
        b0 = vis_fpadd16(bptr[j], bptr[j + 4 * 15]);
        b15 = vis_fpsub16(bptr[j], bptr[j + 4 * 15]);
        b1 = vis_fpadd16(bptr[j + 4 * 1], bptr[j + 4 * 14]);
        b14 = vis_fpsub16(bptr[j + 4 * 1], bptr[j + 4 * 14]);
        b2 = vis_fpadd16(bptr[j + 4 * 2], bptr[j + 4 * 13]);
        b13 = vis_fpsub16(bptr[j + 4 * 2], bptr[j + 4 * 13]);
        b3 = vis_fpadd16(bptr[j + 4 * 3], bptr[j + 4 * 12]);
        b12 = vis_fpsub16(bptr[j + 4 * 3], bptr[j + 4 * 12]);
        b4 = vis_fpadd16(bptr[j + 4 * 4], bptr[j + 4 * 11]);
        b11 = vis_fpsub16(bptr[j + 4 * 4], bptr[j + 4 * 11]);
        b5 = vis_fpadd16(bptr[j + 4 * 5], bptr[j + 4 * 10]);
        b10 = vis_fpsub16(bptr[j + 4 * 5], bptr[j + 4 * 10]);
        b6 = vis_fpadd16(bptr[j + 4 * 6], bptr[j + 4 * 9]);
        b9 = vis_fpsub16(bptr[j + 4 * 6], bptr[j + 4 * 9]);
        b7 = vis_fpadd16(bptr[j + 4 * 7], bptr[j + 4 * 8]);
        b8 = vis_fpsub16(bptr[j + 4 * 7], bptr[j + 4 * 8]);

        /*
         * second butterfly
         */
        t0 = vis_fpadd16(b0, b7);
        t1 = vis_fpadd16(b1, b6);
        t2 = vis_fpadd16(b2, b5);
        t3 = vis_fpadd16(b3, b4);
        t4 = vis_fpsub16(b3, b4);
        t5 = vis_fpsub16(b2, b5);
        t6 = vis_fpsub16(b1, b6);
        t7 = vis_fpsub16(b0, b7);

        c1 = vis_fpsub16(b13, b10);
        c2 = vis_fpsub16(b12, b11);
        c3 = vis_fpadd16(b11, b12);
        c4 = vis_fpadd16(b10, b13);

        t10 = vis_fmul8x16(FCOS_4, c1);
        t11 = vis_fmul8x16(FCOS_4, c2);
        t12 = vis_fmul8x16(FCOS_4, c3);
        t13 = vis_fmul8x16(FCOS_4, c4);

        /*
         * third butterfly
         */
        b0 = vis_fpadd16(t0, t3);
        b1 = vis_fpadd16(t1, t2);
        b2 = vis_fpsub16(t1, t2);
        b3 = vis_fpsub16(t0, t3);

        c1 = vis_fpsub16(t6, t5);
        c2 = vis_fpadd16(t6, t5);
        b5 = vis_fmul8x16(FCOS_4, c1);
        b6 = vis_fmul8x16(FCOS_4, c2);

        b11 = vis_fpsub16(b8, t11);
        b8 = vis_fpadd16(b8, t11);
        b10 = vis_fpsub16(b9, t10);
        b9 = vis_fpadd16(b9, t10);
        b12 = vis_fpsub16(b15, t12);
        b13 = vis_fpsub16(b14, t13);
        b14 = vis_fpadd16(b14, t13);
        b15 = vis_fpadd16(b15, t12);

        /*
         * fourth butterfly
         */
        c1 = vis_fpadd16(b0, b1);
        c2 = vis_fpsub16(b0, b1);
        p0 = vis_fmul8x16(COS_4, c1);
        p2 = vis_fmul8x16(COS_4, c2);

        c1 = vis_fmul8x16(SIN_8, b2);
        c2 = vis_fmul8x16(COS_8, b3);
        c3 = vis_fmul8x16(SIN_8, b3);
        c4 = vis_fmul8x16(COS_8, b2);
        p1 = vis_fpadd16(c1, c2);
        p3 = vis_fpsub16(c3, c4);

        /* first group of four finished values for this j */
        MLIB_PTRANSPOSE16_4x4(val_m, 16 * j);

        t5 = vis_fpsub16(t4, b5);
        t4 = vis_fpadd16(t4, b5);
        t6 = vis_fpsub16(t7, b6);
        t7 = vis_fpadd16(t7, b6);

        c1 = vis_fmul8x16(FSIN_8, b14);
        c2 = vis_fmul8x16(FCOS_8, b9);
        c3 = vis_fmul8x16(FSIN_8, b10);
        c4 = vis_fmul8x16(FCOS_8, b13);
        t9 = vis_fpsub16(c1, c2);
        t10 = vis_fpadd16(c3, c4);

        c1 = vis_fmul8x16(FSIN_8, b13);
        c2 = vis_fmul8x16(FCOS_8, b10);
        c3 = vis_fmul8x16(FSIN_8, b9);
        c4 = vis_fmul8x16(FCOS_8, b14);
        t13 = vis_fpsub16(c1, c2);
        t14 = vis_fpadd16(c3, c4);

        /*
         * fifth butterfly
         */
        c1 = vis_fmul8x16(SIN_16, t4);
        c2 = vis_fmul8x16(COS_16, t7);
        c3 = vis_fmul8x16(COS_3_16, t5);
        c4 = vis_fmul8x16(SIN_3_16, t6);
        p0 = vis_fpadd16(c1, c2);
        p2 = vis_fpadd16(c3, c4);

        c1 = vis_fmul8x16(COS_3_16, t6);
        c2 = vis_fmul8x16(SIN_3_16, t5);
        c3 = vis_fmul8x16(SIN_16, t7);
        c4 = vis_fmul8x16(COS_16, t4);
        p1 = vis_fpsub16(c1, c2);
        p3 = vis_fpsub16(c3, c4);

        MLIB_PTRANSPOSE16_4x4(val_m, 16 * j + 2);

        b9 = vis_fpsub16(b8, t9);
        b8 = vis_fpadd16(b8, t9);
        b10 = vis_fpadd16(b11, t10);
        b11 = vis_fpsub16(b11, t10);
        b13 = vis_fpsub16(b12, t13);
        b12 = vis_fpadd16(b12, t13);
        b14 = vis_fpsub16(b15, t14);
        b15 = vis_fpadd16(b15, t14);

        /*
         * sixth butterfly
         */
        c1 = vis_fmul8x16(SIN_32, b8);
        c2 = vis_fmul8x16(COS_32, b15);
        c3 = vis_fmul8x16(COS_7_32, b9);
        c4 = vis_fmul8x16(SIN_7_32, b14);
        p0 = vis_fpadd16(c1, c2);
        p2 = vis_fpadd16(c3, c4);

        c1 = vis_fmul8x16(SIN_5_32, b10);
        c2 = vis_fmul8x16(COS_5_32, b13);
        c3 = vis_fmul8x16(COS_3_32, b11);
        c4 = vis_fmul8x16(SIN_3_32, b12);
        p1 = vis_fpadd16(c1, c2);
        p3 = vis_fpadd16(c3, c4);

        MLIB_PTRANSPOSE16_4x4(val_m, 16 * j + 1);

        c1 = vis_fmul8x16(COS_3_32, b12);
        c2 = vis_fmul8x16(SIN_3_32, b11);
        c3 = vis_fmul8x16(SIN_5_32, b13);
        c4 = vis_fmul8x16(COS_5_32, b10);
        p0 = vis_fpsub16(c1, c2);
        p2 = vis_fpsub16(c3, c4);

        c1 = vis_fmul8x16(COS_7_32, b14);
        c2 = vis_fmul8x16(SIN_7_32, b9);
        c3 = vis_fmul8x16(SIN_32, b15);
        c4 = vis_fmul8x16(COS_32, b8);
        p1 = vis_fpsub16(c1, c2);
        p3 = vis_fpsub16(c3, c4);

        MLIB_PTRANSPOSE16_4x4(val_m, 16 * j + 3);
    }

    /*
     * then row based 1-D 16x16 DCT
     *
     * Same butterfly network applied to val_m; here every finished value
     * additionally goes through SCALE8 before being written to coeffs64.
     */

#pragma pipeloop(0)
    for (j = 0; j < 4; j++) {
        /*
         * first butterfly
         */
        b0 = vis_fpadd16(val_m[j], val_m[j + 4 * 15]);
        b15 = vis_fpsub16(val_m[j], val_m[j + 4 * 15]);
        b1 = vis_fpadd16(val_m[j + 4 * 1], val_m[j + 4 * 14]);
        b14 = vis_fpsub16(val_m[j + 4 * 1], val_m[j + 4 * 14]);
        b2 = vis_fpadd16(val_m[j + 4 * 2], val_m[j + 4 * 13]);
        b13 = vis_fpsub16(val_m[j + 4 * 2], val_m[j + 4 * 13]);
        b3 = vis_fpadd16(val_m[j + 4 * 3], val_m[j + 4 * 12]);
        b12 = vis_fpsub16(val_m[j + 4 * 3], val_m[j + 4 * 12]);
        b4 = vis_fpadd16(val_m[j + 4 * 4], val_m[j + 4 * 11]);
        b11 = vis_fpsub16(val_m[j + 4 * 4], val_m[j + 4 * 11]);
        b5 = vis_fpadd16(val_m[j + 4 * 5], val_m[j + 4 * 10]);
        b10 = vis_fpsub16(val_m[j + 4 * 5], val_m[j + 4 * 10]);
        b6 = vis_fpadd16(val_m[j + 4 * 6], val_m[j + 4 * 9]);
        b9 = vis_fpsub16(val_m[j + 4 * 6], val_m[j + 4 * 9]);
        b7 = vis_fpadd16(val_m[j + 4 * 7], val_m[j + 4 * 8]);
        b8 = vis_fpsub16(val_m[j + 4 * 7], val_m[j + 4 * 8]);

        /*
         * second butterfly
         */
        t0 = vis_fpadd16(b0, b7);
        t1 = vis_fpadd16(b1, b6);
        t2 = vis_fpadd16(b2, b5);
        t3 = vis_fpadd16(b3, b4);
        t4 = vis_fpsub16(b3, b4);
        t5 = vis_fpsub16(b2, b5);
        t6 = vis_fpsub16(b1, b6);
        t7 = vis_fpsub16(b0, b7);

        c1 = vis_fpsub16(b13, b10);
        c2 = vis_fpsub16(b12, b11);
        c3 = vis_fpadd16(b11, b12);
        c4 = vis_fpadd16(b10, b13);

        t10 = vis_fmul8x16(FCOS_4, c1);
        t11 = vis_fmul8x16(FCOS_4, c2);
        t12 = vis_fmul8x16(FCOS_4, c3);
        t13 = vis_fmul8x16(FCOS_4, c4);

        /*
         * third butterfly
         */
        b0 = vis_fpadd16(t0, t3);
        b1 = vis_fpadd16(t1, t2);
        b2 = vis_fpsub16(t1, t2);
        b3 = vis_fpsub16(t0, t3);

        c1 = vis_fpsub16(t6, t5);
        c2 = vis_fpadd16(t6, t5);
        b5 = vis_fmul8x16(FCOS_4, c1);
        b6 = vis_fmul8x16(FCOS_4, c2);

        b11 = vis_fpsub16(b8, t11);
        b8 = vis_fpadd16(b8, t11);
        b10 = vis_fpsub16(b9, t10);
        b9 = vis_fpadd16(b9, t10);
        b12 = vis_fpsub16(b15, t12);
        b13 = vis_fpsub16(b14, t13);
        b14 = vis_fpadd16(b14, t13);
        b15 = vis_fpadd16(b15, t12);

        /*
         * fourth butterfly
         *
         * Unlike the first pass, COS_4 is applied to b0 and b1 before the
         * sum and difference are formed.
         */
        b0 = vis_fmul8x16(COS_4, b0);
        b1 = vis_fmul8x16(COS_4, b1);
        c1 = vis_fpadd16(b0, b1);
        c2 = vis_fpsub16(b0, b1);
        b0 = SCALE8(c1);
        b1 = SCALE8(c2);

        c1 = vis_fmul8x16(SIN_8, b2);
        c2 = vis_fmul8x16(COS_8, b3);
        c3 = vis_fmul8x16(SIN_8, b3);
        c4 = vis_fmul8x16(COS_8, b2);
        b2 = SCALE8(vis_fpadd16(c1, c2));
        b3 = SCALE8(vis_fpsub16(c3, c4));

        t5 = vis_fpsub16(t4, b5);
        t4 = vis_fpadd16(t4, b5);
        t6 = vis_fpsub16(t7, b6);
        t7 = vis_fpadd16(t7, b6);

        c1 = vis_fmul8x16(FSIN_8, b14);
        c2 = vis_fmul8x16(FCOS_8, b9);
        c3 = vis_fmul8x16(FSIN_8, b10);
        c4 = vis_fmul8x16(FCOS_8, b13);
        t9 = vis_fpsub16(c1, c2);
        t10 = vis_fpadd16(c3, c4);

        c1 = vis_fmul8x16(FSIN_8, b13);
        c2 = vis_fmul8x16(FCOS_8, b10);
        c3 = vis_fmul8x16(FSIN_8, b9);
        c4 = vis_fmul8x16(FCOS_8, b14);
        t13 = vis_fpsub16(c1, c2);
        t14 = vis_fpadd16(c3, c4);

        /*
         * fifth butterfly
         */
        c1 = vis_fmul8x16(COS_3_16, t6);
        c2 = vis_fmul8x16(SIN_3_16, t5);
        c3 = vis_fmul8x16(COS_3_16, t5);
        c4 = vis_fmul8x16(SIN_3_16, t6);
        b6 = SCALE8(vis_fpsub16(c1, c2));
        b5 = SCALE8(vis_fpadd16(c3, c4));

        b9 = vis_fpsub16(b8, t9);
        b8 = vis_fpadd16(b8, t9);
        b10 = vis_fpadd16(b11, t10);
        b11 = vis_fpsub16(b11, t10);
        b13 = vis_fpsub16(b12, t13);
        b12 = vis_fpadd16(b12, t13);
        b14 = vis_fpsub16(b15, t14);
        b15 = vis_fpadd16(b15, t14);

        /*
         * sixth butterfly
         */
        c1 = vis_fmul8x16(COS_7_32, b14);
        c2 = vis_fmul8x16(SIN_7_32, b9);
        c3 = vis_fmul8x16(COS_7_32, b9);
        c4 = vis_fmul8x16(SIN_7_32, b14);
        b14 = SCALE8(vis_fpsub16(c1, c2));
        b9 = SCALE8(vis_fpadd16(c3, c4));

        c1 = vis_fmul8x16(SIN_5_32, b10);
        c2 = vis_fmul8x16(COS_5_32, b13);
        c3 = vis_fmul8x16(SIN_5_32, b13);
        c4 = vis_fmul8x16(COS_5_32, b10);
        b10 = SCALE8(vis_fpadd16(c1, c2));
        b13 = SCALE8(vis_fpsub16(c3, c4));

        /* write the groups that are already complete */
        MLIB_XTRANSPOSE16_4x4(b1, b9, b5, b13, coeffs64, 4 * j + 2);
        MLIB_XTRANSPOSE16_4x4(b2, b10, b6, b14, coeffs64, 4 * j + 1);

        c1 = vis_fmul8x16(SIN_16, t4);
        c2 = vis_fmul8x16(COS_16, t7);
        c3 = vis_fmul8x16(SIN_16, t7);
        c4 = vis_fmul8x16(COS_16, t4);
        b4 = SCALE8(vis_fpadd16(c1, c2));
        b7 = SCALE8(vis_fpsub16(c3, c4));

        c1 = vis_fmul8x16(SIN_32, b8);
        c2 = vis_fmul8x16(COS_32, b15);
        c3 = vis_fmul8x16(SIN_32, b15);
        c4 = vis_fmul8x16(COS_32, b8);
        b8 = SCALE8(vis_fpadd16(c1, c2));
        b15 = SCALE8(vis_fpsub16(c3, c4));

        c1 = vis_fmul8x16(COS_3_32, b12);
        c2 = vis_fmul8x16(SIN_3_32, b11);
        c3 = vis_fmul8x16(COS_3_32, b11);
        c4 = vis_fmul8x16(SIN_3_32, b12);
        b12 = SCALE8(vis_fpsub16(c1, c2));
        b11 = SCALE8(vis_fpadd16(c3, c4));

        MLIB_XTRANSPOSE16_4x4(b0, b8, b4, b12, coeffs64, 4 * j);
        MLIB_XTRANSPOSE16_4x4(b3, b11, b7, b15, coeffs64, 4 * j + 3);
    }

    return (MLIB_SUCCESS);
}
void mlib_v_VideoColorYUV2RGB444_all_align( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4]; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f375); mlib_d64 k02 = vis_to_double_dup(0x3317e5fa); mlib_d64 k11 = vis_to_double_dup(0xf3754097); mlib_d64 k12 = vis_to_double_dup(0xe5fa0000); mlib_d64 k21 = vis_to_double_dup(0x40970000); mlib_d64 k22 = vis_to_double_dup(0x00003317); mlib_d64 c_0 = vis_to_double_dup(0xe42010f4); mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60); mlib_d64 c_2 = vis_to_double_dup(0xdd60e420); mlib_d64 k_0 = vis_to_double_dup(0x25432543); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = n >> 2; buff2 = pbuff_arr2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)u; sf2 = (mlib_f32 *)v; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; mlib_d64 d_0235, d_xx14, d_23xx, d_0145; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); /* * merge buff values to 3-channel array */ d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); pfd[0] = vis_read_hi(d_0145); pfd[1] = vis_read_hi(d_23xx); pfd[2] = vis_read_lo(d_0145); buff2 += 
2; pfd += 3; } if ((mlib_u8 *)pfd <= dend) { mlib_d64 d_0235, d_xx14, d_23xx, d_0145; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_23xx); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; u += n; v += n; rgb += 3 * n; size -= n; } while (size); }
static mlib_status mlib_v_VideoColorYUV2RGB411_nonalign( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 rgb_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* pointers to src address */ mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3; /* pointers to dst address */ mlib_u8 *dp, *dl; /* all. pointer to y */ mlib_d64 *spy; /* all. pointers to u, v */ mlib_d64 *dfu, *dfv; /* u, v data */ mlib_f32 fu, fv; /* y data */ mlib_d64 dy0, dy1, dy2, dy3; mlib_d64 ddy1, ddy2, ddy3, ddy4; mlib_d64 du0, du1, fu0, fu1; mlib_d64 dv1, dv2, fv0, fv1; mlib_d64 dr, dr1, dr2, dr3, dr4; mlib_d64 dg, dg1, dg2, dg3, dg4; mlib_d64 db, db1, db2, db3, db4; mlib_d64 dtmp; /* 1.1644 * 4096 */ mlib_f32 f0 = vis_to_float(0x12a1); /* 2.0184 * 8192 */ mlib_f32 f1 = vis_to_float(0x4097); /* -0.3920 * 8192 */ mlib_f32 f4 = vis_to_float(0xf375); /* -0.8132 * 8192 */ mlib_f32 f5 = vis_to_float(0xe5fa); /* 1.5966 * 8192 */ mlib_f32 f8 = vis_to_float(0x3317); /* -276.9856 * 32 */ mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60); /* 135.6352 * 32 */ mlib_d64 doff1 = vis_to_double_dup(0x10f410f4); /* -222.9952 * 32 */ mlib_d64 doff2 = vis_to_double_dup(0xe420e420); mlib_f32 fscale = vis_to_float(0x80808080); /* loop variable */ mlib_s32 i, j; mlib_d64 *buf, BUFF[16 * 1024]; mlib_d64 *ddp, dd01, dd11, dd21, dd02, dd12, dd22; mlib_u8 *tmp; if (width * 3 > 16 * 1024) { tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7); } else { buf = (mlib_d64 *)BUFF; } /* * initialize GSR scale factor */ vis_write_gsr(3 << 3); sp1 = sl1 = (mlib_u8 *)y; sp2 = sl2 = (mlib_u8 *)u; sp3 = sl3 = (mlib_u8 *)v; dp = (mlib_u8 *)buf; dl = rgb; ddp = (mlib_d64 *)dp; /* * row loop */ for (j = 0; j < height; j++) { spy = (mlib_d64 *)vis_alignaddr(sp1, 0); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = (*dfu++); fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 *)vis_alignaddr(sp3, 
0); fv0 = (*dfv++); fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; dy0 = (*spy++); dy3 = vis_ld_d64_nf(spy); spy++; vis_alignaddr(sp1, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = vis_ld_d64_nf(spy); spy++; dy2 = vis_faligndata(dy3, dy0); du0 = vis_fmul8x16al(fu, f1); db = vis_fpadd16(du0, doff0); du1 = vis_fmul8x16al(fu, f4); dv1 = vis_fmul8x16al(fv, f5); dtmp = vis_fpadd16(du1, dv1); dg = vis_fpadd16(dtmp, doff1); dv2 = vis_fmul8x16al(fv, f8); dr = vis_fpadd16(dv2, doff2); ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0); ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0); ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0); ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0); db1 = vis_fmul8x16au(fscale, vis_read_hi(db)); db1 = vis_fpadd16(ddy1, db1); db2 = vis_fmul8x16al(fscale, vis_read_hi(db)); db2 = vis_fpadd16(ddy2, db2); db3 = vis_fmul8x16au(fscale, vis_read_lo(db)); db3 = vis_fpadd16(ddy3, db3); db4 = vis_fmul8x16al(fscale, vis_read_lo(db)); db4 = vis_fpadd16(ddy4, db4); dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg)); dg1 = vis_fpadd16(ddy1, dg1); dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg)); dg2 = vis_fpadd16(ddy2, dg2); dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg)); dg3 = vis_fpadd16(ddy3, dg3); dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg)); dg4 = vis_fpadd16(ddy4, dg4); dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr)); dr1 = vis_fpadd16(ddy1, dr1); dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr)); dr2 = vis_fpadd16(ddy2, dr2); dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr)); dr3 = vis_fpadd16(ddy3, dr3); dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr)); dr4 = vis_fpadd16(ddy4, dr4); db = vis_fpack16_pair(db1, db2); db1 = vis_fpack16_pair(db3, db4); dr = vis_fpack16_pair(dr1, dr2); dr1 = vis_fpack16_pair(dr3, dr4); dg = vis_fpack16_pair(dg1, dg2); dg1 = vis_fpack16_pair(dg3, dg4); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = vis_ld_d64_nf(dfu); dfu++; fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 
*)vis_alignaddr(sp3, 0); fv0 = vis_ld_d64_nf(dfv); dfv++; fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; /* * 16-pixel column loop */ #pragma pipeloop(0) for (i = 0; i <= width - 16; i += 16) { vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(dr, dg); dd02 = vis_bshuffle(dr1, dg1); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(dr, dg); dd12 = vis_bshuffle(dr1, dg1); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(dr, dg); dd22 = vis_bshuffle(dr1, dg1); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd01, db); ddp[3] = vis_bshuffle(dd02, db1); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd11, db); ddp[4] = vis_bshuffle(dd12, db1); vis_write_bmask(0x0D23E56F, 0); ddp[2] = vis_bshuffle(dd21, db); ddp[5] = vis_bshuffle(dd22, db1); dy3 = vis_ld_d64_nf(spy); spy++; vis_alignaddr(sp1, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = vis_ld_d64_nf(spy); spy++; dy2 = vis_faligndata(dy3, dy0); du0 = vis_fmul8x16al(fu, f1); db = vis_fpadd16(du0, doff0); du1 = vis_fmul8x16al(fu, f4); dv1 = vis_fmul8x16al(fv, f5); dtmp = vis_fpadd16(du1, dv1); dg = vis_fpadd16(dtmp, doff1); dv2 = vis_fmul8x16al(fv, f8); dr = vis_fpadd16(dv2, doff2); ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0); ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0); ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0); ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0); db1 = vis_fmul8x16au(fscale, vis_read_hi(db)); db1 = vis_fpadd16(ddy1, db1); db2 = vis_fmul8x16al(fscale, vis_read_hi(db)); db2 = vis_fpadd16(ddy2, db2); db3 = vis_fmul8x16au(fscale, vis_read_lo(db)); db3 = vis_fpadd16(ddy3, db3); db4 = vis_fmul8x16al(fscale, vis_read_lo(db)); db4 = vis_fpadd16(ddy4, db4); dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg)); dg1 = vis_fpadd16(ddy1, dg1); dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg)); dg2 = vis_fpadd16(ddy2, dg2); dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg)); dg3 = vis_fpadd16(ddy3, dg3); dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg)); dg4 = vis_fpadd16(ddy4, dg4); 
dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr)); dr1 = vis_fpadd16(ddy1, dr1); dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr)); dr2 = vis_fpadd16(ddy2, dr2); dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr)); dr3 = vis_fpadd16(ddy3, dr3); dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr)); dr4 = vis_fpadd16(ddy4, dr4); db = vis_fpack16_pair(db1, db2); db1 = vis_fpack16_pair(db3, db4); dr = vis_fpack16_pair(dr1, dr2); dr1 = vis_fpack16_pair(dr3, dr4); dg = vis_fpack16_pair(dg1, dg2); dg1 = vis_fpack16_pair(dg3, dg4); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = vis_ld_d64_nf(dfu); dfu++; fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 *)vis_alignaddr(sp3, 0); fv0 = vis_ld_d64_nf(dfv); dfv++; fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; ddp += 6; } if (i <= width - 8) { vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(dr, dg); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(dr, dg); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(dr, dg); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd01, db); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd11, db); vis_write_bmask(0x0D23E56F, 0); ddp[2] = vis_bshuffle(dd21, db); db = db1; dr = dr1; dg = dg1; ddp += 3; i += 8; } dp = (mlib_u8 *)ddp; vis_alignaddr((void *)(width - i), 0); db = vis_faligndata(db, db); dg = vis_faligndata(dg, dg); dr = vis_faligndata(dr, dr); dp += ((width - i - 1) * 3); vis_alignaddr((void *)7, 0); for (; i < width; i++) { STORE_PIXEL(0, 1, 2); dp -= 3; } sp1 = sl1 = sl1 + y_stride; sp2 = sl2 = sl2 + uv_stride; sp3 = sl3 = sl3 + uv_stride; __mlib_VectorCopy_U8(dl, (mlib_u8 *)buf, width * 3); dl = dp = dl + rgb_stride; dp = (mlib_u8 *)buf; ddp = (mlib_d64 *)dp; } if (width * 3 > 16 * 1024) __mlib_free(tmp); return (MLIB_SUCCESS); }
/*
 * Inverse DCT of an 8x8 block of 16-bit coefficients into an 8x8 block
 * of 16-bit samples ("Q1" / "Mismatch" variant).
 *
 *   block   output, written as sixteen 8-byte words through outPtr
 *   coeffs  input coefficients; coeffs[63] selects between the two
 *           second-pass paths below
 *
 * Always returns MLIB_SUCCESS.
 *
 * Both pointers are accessed as mlib_d64, so they are presumably required
 * to be 8-byte aligned -- TODO confirm against the public API contract.
 *
 * NOTE(review): the heavy lifting is done by macros defined elsewhere in
 * the file (LOAD_DATA_AA44, TRANSPOSE, TRANSPOSE1, LOAD_MISMATCH, IDCT44,
 * IDCTS, IDCTS1).  They are not visible here; they presumably read and
 * write the locals below by name (dPtr, dx*, p*, t0, t1, the COS_*
 * constants) and LOAD_MISMATCH presumably provides val0 / val1.  Do not
 * rename locals or reorder statements without checking the macros.
 */
mlib_status __mlib_VideoIDCT8x8_S16_S16_Q1_Mismatch( mlib_s16 *block, const mlib_s16 *coeffs) {
    mlib_d64 *dPtr = (mlib_d64 *)coeffs;
    mlib_d64 *outPtr = (mlib_d64 *)block;
    mlib_d64 dx0, dx1, dx2, dx3, dx4, dx6, dx7, dx8;
    mlib_d64 p00, p10, p20, p30, p01, p11, p21, p31, p40, p50, p60, p70;
    mlib_d64 p41, p51, p61, p71;
    mlib_d64 t0, t1;
    mlib_d64 d0, d1, d2, d3, d7, zero = vis_fzero();
    mlib_f32 COS_1_16;
    mlib_f32 COS_2_16;
    mlib_f32 COS_6_16;
    mlib_f32 COS_7_16;
    mlib_f32 COS_4_16;
    mlib_f32 C_1_4;

    /* First pass */
    LOAD_DATA_AA44;

    /* constants are taken from the first six 32-bit entries of mlib_cTable */
    COS_1_16 = ((mlib_f32 *)mlib_cTable)[0];
    COS_2_16 = ((mlib_f32 *)mlib_cTable)[1];
    COS_6_16 = ((mlib_f32 *)mlib_cTable)[2];
    COS_7_16 = ((mlib_f32 *)mlib_cTable)[3];
    COS_4_16 = ((mlib_f32 *)mlib_cTable)[4];
    C_1_4 = ((mlib_f32 *)mlib_cTable)[5];

    TRANSPOSE(p00, p10, p20, p30, d0, d1, d2, d3)
    LOAD_MISMATCH;
    IDCT44(d0, d1, d2, d3);

    /* final butterfly of the first pass: eight sums / differences */
    p00 = vis_fpadd16(dx7, dx1);
    p10 = vis_fpadd16(dx3, dx2);
    p20 = vis_fpadd16(dx0, dx4);
    p30 = vis_fpadd16(dx8, dx6);
    p40 = vis_fpsub16(dx8, dx6);
    p50 = vis_fpsub16(dx0, dx4);
    p60 = vis_fpsub16(dx3, dx2);
    p70 = vis_fpsub16(dx7, dx1);

    /*
     * The second pass depends on coeffs[63].  This first arm runs when
     * coeffs[63] != 1 and derives the extra input d7 via IDCTS and
     * TRANSPOSE1; the else arm is the special case coeffs[63] == 1.
     */
    if (coeffs[63] != 1) {
        IDCTS(zero, d7);
        p01 = dx1;
        p11 = dx2;
        p21 = dx4;
        p31 = dx6;
        /* lower half is the negated mirror of the upper half */
        p41 = vis_fpsub16(zero, dx6);
        p51 = vis_fpsub16(zero, dx4);
        p61 = vis_fpsub16(zero, dx2);
        p71 = vis_fpsub16(zero, dx1);

        TRANSPOSE(p00, p10, p20, p30, d0, d1, d2, d3)
        TRANSPOSE1(p01, p11, p21, p31, d7)

        /* Second pass */
        IDCTS1(d0, d1, d2, d3, d7);
        /* re-fill d0..d3 for the second half before dx* are consumed below */
        TRANSPOSE(p40, p50, p60, p70, d0, d1, d2, d3)

        /*
         * Even-indexed output words; each result is multiplied by C_1_4
         * (presumably a 1/4 scale -- confirm against mlib_cTable).
         */
        outPtr[0] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
        outPtr[2] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
        outPtr[4] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
        outPtr[6] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
        outPtr[8] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
        outPtr[10] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
        outPtr[12] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
        outPtr[14] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

        TRANSPOSE1(p41, p51, p61, p71, d7)
        IDCTS1(d0, d1, d2, d3, d7);

        /* odd-indexed output words */
        outPtr[1] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
        outPtr[3] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
        outPtr[5] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
        outPtr[7] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
        outPtr[9] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
        outPtr[11] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
        outPtr[13] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
        outPtr[15] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

        return (MLIB_SUCCESS);
    } else {
        /*
         * coeffs[63] == 1: d7 is loaded directly from val0 / val1
         * instead of being computed.
         */
        /* Second pass */
        TRANSPOSE(p00, p10, p20, p30, d0, d1, d2, d3)
        d7 = *((mlib_d64 *)&val0);
        IDCTS1(d0, d1, d2, d3, d7);
        TRANSPOSE(p40, p50, p60, p70, d0, d1, d2, d3)

        /* even-indexed output words */
        outPtr[0] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
        outPtr[2] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
        outPtr[4] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
        outPtr[6] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
        outPtr[8] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
        outPtr[10] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
        outPtr[12] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
        outPtr[14] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

        d7 = *((mlib_d64 *)&val1);
        IDCTS1(d0, d1, d2, d3, d7);

        /* odd-indexed output words */
        outPtr[1] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
        outPtr[3] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
        outPtr[5] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
        outPtr[7] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
        outPtr[9] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
        outPtr[11] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
        outPtr[13] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
        outPtr[15] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

        return (MLIB_SUCCESS);
    }
}
/*
 * m x n convolution of an 8-bit multi-channel image, applied only to the
 * channels selected by cmask.
 *
 *   dst, src  destination / source images; geometry is read through the
 *             mlib_ImageGet* accessors, the channel count comes from dst
 *   m, n      kernel width and height, both must be >= 1
 *   dm, dn    column / row offset (in pixels / rows) of the first
 *             destination pixel that is written
 *   kern      m * n kernel words, stored row by row; each word is handed
 *             to vis_fmul8x16au as an mlib_f32
 *   scale     selects the GSR scale field ((31 - scale) << 3) and the
 *             rounding constant mlib_round_8[31 - scale]
 *   cmask     channel mask; bit (1 << chan) is tested for chan = 0 ..
 *             nchan - 1 and passed unchanged to the extract / insert
 *             routines
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when m or n is not positive or a
 * work buffer cannot be allocated.
 *
 * Only (width - m + 1) x (height - n + 1) output pixels are produced.
 *
 * Method, per selected channel:
 *   - the channel is extracted row by row into n + 1 rotating line
 *     buffers (buffs[]; the pointer table is stored twice so that a
 *     window of n + 1 consecutive entries can start at any index),
 *   - for each output row the kernel is applied in groups of up to four
 *     kernel rows, one kernel column at a time, accumulating 16-bit
 *     partial sums in buffd (pre-loaded with the rounding constant),
 *   - the last kernel column of the first row group is kept back and
 *     applied in a "final iteration" that also packs the sums to bytes
 *     into buffe and re-arms buffd for the next row,
 *   - buffe is inserted into the destination channel.
 *
 * NOTE(review): channel counts other than 2 and 3 use the 4-channel
 * extract / insert routines; presumably single-channel images are routed
 * to a different function -- confirm in the dispatcher.
 *
 * Changes relative to the previous revision:
 *   - the look-ahead load of source row (j + n) is skipped on the last
 *     output row, where that row lies one past the end of the image and
 *     its data would never be used;
 *   - the final iteration reads kernel words and line-buffer pointers
 *     only for the kernel rows that exist (previously rows 1..3 were
 *     always read, which is out of bounds for n < 4);
 *   - non-positive m / n are rejected up front.
 */
mlib_status mlib_convMxN_8nw_mask(mlib_image *dst,
                                  const mlib_image *src,
                                  mlib_s32 m,
                                  mlib_s32 n,
                                  mlib_s32 dm,
                                  mlib_s32 dn,
                                  const mlib_s32 *kern,
                                  mlib_s32 scale,
                                  mlib_s32 cmask)
{
  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
  mlib_d64 dd, d0, d1;
  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
  mlib_u8 *sl, *sp, *dl;
  mlib_s32 hgt = mlib_ImageGetHeight(src);
  mlib_s32 wid = mlib_ImageGetWidth(src);
  mlib_s32 sll = mlib_ImageGetStride(src);
  mlib_s32 dll = mlib_ImageGetStride(dst);
  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
  mlib_s32 ssize, xsize, dsize, esize, buff_ind;
  mlib_d64 *pbuff, *dp;
  mlib_f32 *karr = (mlib_f32 *) kern;
  mlib_s32 gsr_scale = (31 - scale) << 3;
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
  mlib_s32 i, j, l, chan, testchan;
  mlib_s32 nchan = mlib_ImageGetChannels(dst);
  void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
  void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);

  /* an empty kernel would make every index computation below invalid */
  if (m < 1 || n < 1)
    return MLIB_FAILURE;

  /* pointer table: 2 * (n + 1) rotating entries + (n + 1) for buff[] */
  if (n > MAX_N) {
    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));

    if (buffs == NULL)
      return MLIB_FAILURE;
  }

  buff = buffs + 2 * (n + 1);

  adr_dst += dn * dll + dm * nchan;

  ssize = wid;
  dsize = (ssize + 7) / 8;
  esize = dsize + 4;
  /* (n + 1) line buffers, 2 * esize for buffd, esize for buffe */
  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));

  if (pbuff == NULL) {
    if (buffs != buffs_local)
      mlib_free(buffs);
    return MLIB_FAILURE;
  }

  for (i = 0; i < (n + 1); i++)
    buffs[i] = pbuff + i * esize;
  /* second copy of the table so that buffs + buff_ind never wraps */
  for (i = 0; i < (n + 1); i++)
    buffs[(n + 1) + i] = buffs[i];
  buffd = buffs[n] + esize;
  buffe = buffd + 2 * esize;

  /* number of output rows / columns */
  hgt -= (n - 1);
  xsize = ssize - (m - 1);

  vis_write_gsr(gsr_scale + 7);

  if (nchan == 2) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1;
  }
  else if (nchan == 3) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1;
  }
  else {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1;
  }

  testchan = 1;
  for (chan = 0; chan < nchan; chan++) {
    buff_ind = 0;
    sl = adr_src;
    dl = adr_dst;

    if ((cmask & testchan) == 0) {
      testchan <<= 1;
      continue;
    }

    /* prime the line buffers with the first n source rows */
    for (l = 0; l < n; l++) {
      buffn = buffs[l];
      sp = sl + l * sll;

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);
    }

    /* init buffer */
#pragma pipeloop(0)
    for (i = 0; i < (xsize + 7) / 8; i++) {
      buffd[2 * i] = drnd;
      buffd[2 * i + 1] = drnd;
    }

    for (j = 0; j < hgt; j++) {
      mlib_d64 **buffc = buffs + buff_ind;
      mlib_f32 *pk = karr, k0, k1, k2, k3;

      sp = sl + n * sll;

      /* buff[0 .. n-1]: the n source rows contributing to this row */
      for (l = 0; l < n; l++) {
        buff[l] = buffc[l];
      }

      buffn = buffc[n];

      /*
       * Look-ahead: fetch source row j + n for the next output row.
       * On the last output row that source row does not exist and
       * nothing would consume it, so the load is skipped.
       */
      if (j < hgt - 1) {
        (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);
      }

      ik_last = (m - 1);

      /* kernel rows are processed in groups of 4, 3, 2 or 1 */
      for (jk = 0; jk < n; jk += jk_size) {
        jk_size = n - jk;

        if (jk_size >= 6)
          jk_size = 4;
        if (jk_size == 5)
          jk_size = 3;

        coff = 0;

        if (jk_size == 1) {

          for (ik = 0; ik < m; ik++, coff++) {
            /* column ik_last of the first group is left for the end */
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];

            doff = coff / 8;
            buff0 = buff[jk] + doff;

            /* byte offset within the 8-byte word goes into GSR.align */
            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s01 = buff0[i + 1];
              s0 = vis_faligndata(s00, s01);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d1 = vis_fpadd16(d01, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += m;
        }
        else if (jk_size == 2) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
            s11 = buff1[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s10 = s11;
              s01 = buff0[i + 1];
              s11 = buff1[i + 1];
              s0 = vis_faligndata(s00, s01);
              s1 = vis_faligndata(s10, s11);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d0 = vis_fpadd16(d10, d0);
              d1 = vis_fpadd16(d01, d1);
              d1 = vis_fpadd16(d11, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += 2 * m;
        }
        else if (jk_size == 3) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {
              /* word aligned: no realignment needed */
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {
              /* half-word offset: use lo of one word, hi of the next */
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 3 * m;
        }
        else {                              /* jk_size == 4 */

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];
            k3 = pk[ik + 3 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;
            buff3 = buff[jk + 3] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];
                s3 = buff3[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
                d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
                d31 = vis_fmul8x16au(vis_read_hi(s31), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);
                s3 = vis_faligndata(s30, s31);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 4 * m;
        }
      }

      /*****************************************
       *****************************************
       **          Final iteration            **
       *****************************************
       *****************************************/

      /*
       * Apply the kernel column that was held back (column ik_last of
       * the first row group), pack the sums to bytes into buffe and
       * reset buffd to the rounding constant for the next output row.
       */
      jk_size = n;

      if (jk_size >= 6)
        jk_size = 4;
      if (jk_size == 5)
        jk_size = 3;

      off = ik_last;
      doff = off / 8;
      off &= 7;

      /* touch only the kernel rows / line buffers that exist */
      k0 = karr[ik_last];
      buff0 = buff[0] + doff;
      if (jk_size >= 2) {
        k1 = karr[ik_last + m];
        buff1 = buff[1] + doff;
      }
      if (jk_size >= 3) {
        k2 = karr[ik_last + 2 * m];
        buff2 = buff[2] + doff;
      }
      if (jk_size >= 4) {
        k3 = karr[ik_last + 3 * m];
        buff3 = buff[3] + doff;
      }

      vis_write_gsr(gsr_scale + off);

      if (jk_size == 1) {
        dp = buffe;

        s01 = buff0[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s01 = buff0[i + 1];
          s0 = vis_faligndata(s00, s01);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d1 = vis_fpadd16(d1, d01);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 2) {
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 3) {
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else {                                /* if (jk_size == 4) */
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
        s31 = buff3[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s30 = s31;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s31 = buff3[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);
          s3 = vis_faligndata(s30, s31);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
          d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
          d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d0 = vis_fpadd16(d0, d30);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);
          d1 = vis_fpadd16(d1, d31);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }

      /* write the finished row into the selected destination channel */
      (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan);

      sl += sll;
      dl += dll;

      /* rotate the line-buffer window by one row */
      buff_ind++;

      if (buff_ind >= (n + 1))
        buff_ind = 0;
    }

    testchan <<= 1;
  }

  mlib_free(pbuff);

  if (buffs != buffs_local)
    mlib_free(buffs);

  return MLIB_SUCCESS;
}