/*
 * Affine table-filter line kernel, "_8nw_3_2_1" variant.
 *
 * Produces `size` filtered samples into `buff`.  Samples are generated two
 * at a time by the macro-driven main loop and stored as mlib_d64 through
 * `buff1`; a possible last odd sample is computed explicitly and stored as
 * a single 16-bit value.
 *
 * NOTE(review): `i`, `size`, `buff1`, `res1`, `sPtr`, `xFilter`, `yFilter`,
 * `row00`, `row10`, `v0`..`v3`, `sum`, `res`, `dstPixelPtr` and `vis_scale`
 * are not declared in this body; presumably they come from DECLAREVAR /
 * DECLAREVAR2, and filterX / filterY / lineAddr / ws are presumably
 * consumed by the CALC_* / LOAD_* macros -- confirm against the macro
 * definitions.  The suffix presumably means 8-bit source, 3 rows x 2
 * columns, 1 channel -- TODO confirm.
 */
void
mlib_v_ImageAffineTableLine_8nw_3_2_1(
	mlib_d64 *buff,
	const mlib_d64 *filterX,
	const mlib_d64 *filterY,
	const mlib_u8 **lineAddr,
	mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;
	mlib_d64 yFilter2;
	mlib_d64 yFilter3;
	mlib_d64 row20, row30;
	mlib_d64 *dpSrc;
	mlib_d64 data0, data1, zero;

	/* GSR: upper 32 bits = 0x0145ABEF, lower 32 bits = 4 */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;
	zero = vis_to_double_dup(0);

	/* main loop: two output samples per iteration, one mlib_d64 store */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_3x2;
		FILTER_MERGE_4x2;
		MAKE_4x2;
		*buff1 = res1;
		buff1++;
	}

	/* continue right after the last mlib_d64 written by the main loop */
	dstPixelPtr = (mlib_s16 *)buff1;

	/* tail: at most one remaining sample */
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, yFilter);
		xFilter = vis_write_hi(xFilter, fx0);
		LOAD_PIXEL_3;

		/* vertical pass: three rows weighted by entries of yFilter */
		v0 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter));
		v1 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter));
		sum = vis_fpadd16(v0, v1);
		v0 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter));
		sum = vis_fpadd16(v0, sum);

		/* horizontal pass: sux16 + ulx16 together form a 16x16 multiply */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);

		/* widen the upper half to 32 bits and add the two partial sums */
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_hi(v3));
		res = vis_write_lo(res,
		    vis_fpadd32s(vis_read_hi(v2), vis_read_lo(v2)));
		vis_st_u16(res, dstPixelPtr++);
	}
}
/*
 * Affine table-filter line kernel, "_8nw_2_2_1" variant.
 *
 * Produces `size` filtered samples into `buff`: pairs of samples in the
 * macro-driven main loop (one mlib_d64 store each), then a possible last
 * odd sample computed explicitly from two source rows and two adjacent
 * source bytes per row.
 *
 * NOTE(review): the loop counter, `size`, `buff1`, `res1`, `sPtr`,
 * `srcStride`, the filter/row/accumulator variables and `vis_scale` are not
 * declared here; presumably DECLAREVAR / DECLAREVAR2 provide them and the
 * CALC_* / LOAD_* macros read filterX / filterY / lineAddr / ws -- confirm
 * against the macro definitions.
 */
void
mlib_v_ImageAffineTableLine_8nw_2_2_1(
	mlib_d64 *buff,
	const mlib_d64 *filterX,
	const mlib_d64 *filterY,
	const mlib_u8 **lineAddr,
	mlib_affine_workspace *ws)
{
	DECLAREVAR;
	DECLAREVAR2;

	/* GSR: upper 32 bits = 0x0145ABEF, lower 32 bits = 4 */
	vis_write_gsr64((((mlib_u64)0x0145ABEF) << 32) + 4);
	dstPixelPtr = (mlib_s16 *)buff;

	/* main loop: two output samples per iteration */
#pragma pipeloop(0)
	for (i = 0; i <= size - 2; i += 2) {
		CALC_2_SRC_PTR;
		LOAD_2x2(row00, row10);
		FILTER_MERGE;
		MAKE_2x2;
		*buff1 = res1;
		buff1++;
	}

	/* continue right after the last mlib_d64 written by the main loop */
	dstPixelPtr = (mlib_s16 *)buff1;

	/* tail: at most one remaining sample */
#pragma pipeloop(0)
	for (; i < size; i++) {
		CALC_SRC_PTR(sPtr);
		LOAD_FILTERS(fx0, fy0);
		xFilter = vis_write_lo(xFilter, fx0);

		/* gather two horizontally adjacent bytes from each of two rows */
		row00 = vis_fpmerge(LD_U8(sPtr, 0), LD_U8(sPtr, 1));
		row10 = vis_fpmerge(LD_U8(sPtr, srcStride),
		    LD_U8(sPtr, srcStride + 1));

		/* vertical pass: upper/lower 16 bits of fy0 weight the rows */
		v0 = vis_fmul8x16au(vis_read_lo(row00), fy0);
		v1 = vis_fmul8x16al(vis_read_lo(row10), fy0);
		sum = vis_fpadd16(v0, v1);

		/* horizontal pass: sux16 + ulx16 together form a 16x16 multiply */
		v0 = vis_fmul8sux16(sum, xFilter);
		v1 = vis_fmul8ulx16(sum, xFilter);
		v3 = vis_fpadd16(v1, v0);

		/* widen the lower half to 32 bits and add the two partial sums */
		v2 = vis_fmuld8ulx16(vis_scale, vis_read_lo(v3));
		res = vis_write_lo(res,
		    vis_fpadd32s(vis_read_hi(v2), vis_read_lo(v2)));
		vis_st_u16(res, dstPixelPtr++);
	}
}
/*
 * Nearest-neighbour 4:2:0 upsampling of one source row.
 *
 * Every source byte is written twice horizontally, and the resulting
 * 2*n-byte row is stored to both dst0 and dst1.
 *
 *   dst0, dst1 - destination rows, 2*n bytes each (accessed as mlib_d64)
 *   src        - source row, n bytes (accessed as mlib_d64)
 *   n          - number of source bytes
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoUpSample420_Nearest(
	mlib_u8 *dst0,
	mlib_u8 *dst1,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out0 = (mlib_d64 *)dst0;
	mlib_d64 *out1 = (mlib_d64 *)dst1;
	mlib_u8 *last0;
	mlib_d64 pix, wide;
	mlib_s32 mask, k;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* last valid byte of the first destination row */
	last0 = dst0 + 2 * n - 1;

	/* 8 source bytes -> 16 destination bytes per row */
#pragma pipeloop(0)
	for (k = 0; k <= (n - 8); k += 8) {
		pix = in[0];

		wide = vis_fpmerge(vis_read_hi(pix), vis_read_hi(pix));
		out1[0] = wide;
		out0[0] = wide;

		wide = vis_fpmerge(vis_read_lo(pix), vis_read_lo(pix));
		out1[1] = wide;
		out0[1] = wide;

		in++;
		out0 += 2;
		out1 += 2;
	}

	if (k >= n)
		return (MLIB_SUCCESS);

	/* 1..7 source bytes left: non-faulting load, edge-masked stores */
	pix = vis_ld_d64_nf(in);

	wide = vis_fpmerge(vis_read_hi(pix), vis_read_hi(pix));
	mask = vis_edge8(out0, last0);
	vis_pst_8(wide, out0, mask);
	vis_pst_8(wide, out1, mask);

	if (k + 4 < n) {
		/* more than 4 left: the low half of the word is needed too */
		out0++;
		out1++;
		wide = vis_fpmerge(vis_read_lo(pix), vis_read_lo(pix));
		mask = vis_edge8(out0, last0);
		vis_pst_8(wide, out0, mask);
		vis_pst_8(wide, out1, mask);
	}

	return (MLIB_SUCCESS);
}
/*
 * Horizontal interpolation of a 16x16 block: mlib_u8 reference block in,
 * mlib_s16 motion-compensation block out.
 *
 *   mc_block     - destination block, accessed as mlib_d64 through `dd`
 *   ref_block    - source block; read starting at ref_block and at
 *                  ref_block + 1
 *   frame_stride,
 *   field_stride - not referenced directly in this body; presumably used
 *                  by the MLIB_V_VIDEOCOPY16 / MLIB_V_VIDEOINTERP16 macros
 *                  -- TODO confirm
 *
 * Always returns MLIB_SUCCESS.
 *
 * NOTE(review): `y`, `ss0`, `s1hi`..`s3`, `strunc` and `fexpd` are not used
 * by any statement visible here; presumably the two macros reference them
 * by name, so they must keep these exact names.
 */
mlib_status
__mlib_VideoInterpX_S16_U8_16x16(
	mlib_s16 *mc_block,
	const mlib_u8 *ref_block,
	mlib_s32 frame_stride,
	mlib_s32 field_stride)
{
	mlib_s32 y;
	mlib_d64 *dd, ss0[16 * 2], *sp1, *sp2, s1hi, s1lo, s2hi, s2lo, s2, s3;
	/* two 32-bit constants taken from the shared table mlib_IX16const */
	mlib_f32 strunc = vis_read_hi(*(mlib_d64 *)mlib_IX16const);
	mlib_f32 fexpd = vis_read_lo(*(mlib_d64 *)mlib_IX16const);

	dd = (mlib_d64 *)mc_block;

	/* first pass: aligned pointer and GSR offset for ref_block */
	sp1 = (mlib_d64 *)vis_alignaddr((void *)ref_block, 0);
#pragma pipeloop(0)
	MLIB_V_VIDEOCOPY16(16);

	/* second pass: same, for the block shifted right by one byte */
	sp2 = (mlib_d64 *)vis_alignaddr((void *)(ref_block + 1), 0);
#pragma pipeloop(0)
	MLIB_V_VIDEOINTERP16(16);

	return (MLIB_SUCCESS);
}
/*
 * 4:2:2 horizontal downsampling: each pair of neighbouring source bytes is
 * averaged into one destination byte.
 *
 *   dst - destination row, n / 2 bytes (vector part stored as mlib_f32)
 *   src - source row, n bytes (vector part read as mlib_d64)
 *   n   - number of source bytes
 *
 * The rounding term added before the halving alternates 0, 1, 0, 1, ...
 * from one output to the next, in both the vector loop and the scalar tail.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_f32 *out = (mlib_f32 *)dst;
	/* 16-bit lanes 0, 1, 0, 1: the alternating rounding term */
	mlib_d64 rnd = vis_to_double_dup(0x1);
	/* upper 16 bits are 0x0100: the multiplier used by fmul8x16au */
	mlib_f32 unit = vis_to_float(0x1000000);
	mlib_d64 raw, mixed, split, even, odd, total;
	mlib_u8 *tail;
	mlib_s32 k, carry;

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(6 << 3);

	/* 8 source bytes -> 4 destination bytes */
#pragma pipeloop(0)
	for (k = 0; k <= n - 8; k += 8) {
		raw = in[0];
		in++;

		/* two merges gather even bytes in the high half, odd in the low */
		mixed = vis_fpmerge(vis_read_hi(raw), vis_read_lo(raw));
		split = vis_fpmerge(vis_read_hi(mixed), vis_read_lo(mixed));

		even = vis_fmul8x16au(vis_read_hi(split), unit);
		odd = vis_fmul8x16au(vis_read_lo(split), unit);

		total = vis_fpadd16(vis_fpadd16(even, odd), rnd);
		out[0] = vis_fpack16(total);
		out++;
	}

	/* scalar tail for the remaining pairs */
	tail = (mlib_u8 *)out;
	carry = 0;
	while (k < n) {
		*tail = (src[k] + src[k + 1] + carry) >> 1;
		tail++;
		carry ^= 1;
		k += 2;
	}

	return (MLIB_SUCCESS);
}
/*
 * Splits an interleaved 3-channel mlib_s16 stream into three planar arrays.
 *
 *   color1, color2, color3 - destination planes, written as mlib_d64
 *                            (4 values per store)
 *   colors                 - interleaved source, read as mlib_d64
 *                            (3 words = 4 pixels per iteration)
 *   n                      - number of pixels
 *
 * NOTE(review): the definition is cut off in this view (no return
 * statement or closing brace is visible after the tail block).
 */
mlib_status
__mlib_VideoColorSplit3_S16(
	mlib_s16 *color1,
	mlib_s16 *color2,
	mlib_s16 *color3,
	const mlib_s16 *colors,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 *dp2 = (mlib_d64 *)color3;
	mlib_d64 sd0, sd1, sd2, dd0, dd1, dd2, dd3;
	mlib_s32 i;

	/* GSR value 4 is used by vis_faligndata below */
	vis_write_gsr(4);
	/* one byte-shuffle mask serves all three vis_bshuffle calls */
	vis_write_bmask(0x02CE13DF, 0);

	/* 4 pixels per iteration: three words in, one word out per plane */
#pragma pipeloop(0)
#pragma unroll(4)
	for (i = 0; i <= (n - 4); i += 4) {
		sd0 = sp[0];
		sd1 = sp[1];
		sd2 = sp[2];
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		(*dp0++) = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		(*dp1++) = vis_bshuffle(dd3, dd3);
		(*dp2++) = vis_bshuffle(dd1, dd2);
		sp += 3;
	}

	/*
	 * remaining 1..3 pixels (n & 3 of them): same computation as above,
	 * but the 2nd and 3rd source words use non-faulting loads and the
	 * results are written with partial stores
	 */
	if (i < n) {
		/* the low 4 bits select the first (n & 3) 16-bit lanes */
		mlib_s32 emask = 0xF0 >> (n & 3);
		mlib_d64 st0, st1, st2;

		sd0 = sp[0];
		sd1 = vis_ld_d64_nf(sp + 1);
		sd2 = vis_ld_d64_nf(sp + 2);
		dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
		dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
		st0 = vis_bshuffle(dd0, dd1);
		dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
		dd3 = vis_faligndata(dd0, dd2);
		st1 = vis_bshuffle(dd3, dd3);
		st2 = vis_bshuffle(dd1, dd2);
		vis_pst_16(st0, dp0, emask);
		vis_pst_16(st1, dp1, emask);
		vis_pst_16(st2, dp2, emask);
	}
/*
 * Adds a block of mlib_s16 values to a block of mlib_u8 pixels in place.
 *
 * Eight rows of eight pixels are processed: each row of curr_block is
 * widened to 16 bits, added to the corresponding eight mc_block values and
 * packed back to bytes with vis_fpack16_pair.
 *
 *   curr_block - pixel block, updated in place (one mlib_d64 per row)
 *   mc_block   - 8x8 mlib_s16 values, stored contiguously
 *   stride     - distance in bytes between rows of curr_block
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoAddBlock_U8_S16(
	mlib_u8 *curr_block,
	const mlib_s16 *mc_block,
	mlib_s32 stride)
{
	mlib_d64 *row = (mlib_d64 *)curr_block;
	mlib_d64 *delta = (mlib_d64 *)mc_block;
	mlib_f32 zero4 = vis_fzeros();
	/* lower 16 bits are 0x0100: the multiplier used by fmul8x16al */
	mlib_f32 one = vis_to_float(0x100);
	mlib_d64 pix, add_hi, add_lo, wide_hi, wide_lo;
	mlib_s32 line;

	vis_write_gsr(7 << 3);

#pragma pipeloop(0)
	for (line = 0; line < 8; line++) {
		pix = row[0];
		add_hi = delta[0];
		add_lo = delta[1];
		delta += 2;

		/* widen the 8 pixels to 16 bits (two equivalent techniques) */
		wide_hi = vis_fpmerge(zero4, vis_read_hi(pix));
		wide_lo = vis_fmul8x16al(vis_read_lo(pix), one);

		add_hi = vis_fpadd16(add_hi, wide_hi);
		add_lo = vis_fpadd16(add_lo, wide_lo);

		row[0] = vis_fpack16_pair(add_hi, add_lo);
		row = (mlib_d64 *)((mlib_u8 *)row + stride);
	}

	return (MLIB_SUCCESS);
}
/*
 * 4:2:2 horizontal downsampling (vis_bshuffle variant): each pair of
 * neighbouring source bytes is averaged into one destination byte.
 *
 *   dst - destination row, n / 2 bytes (written as mlib_d64)
 *   src - source row, n bytes (read as mlib_d64)
 *   n   - number of source bytes
 *
 * 16 source bytes produce 8 destination bytes per iteration.  The final
 * partial group is written with an edge-masked partial store.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *in = (mlib_d64 *)src;
	mlib_d64 *out = (mlib_d64 *)dst;
	/* 16-bit lanes 0, 1, 0, 1: alternating rounding term */
	mlib_d64 rnd = vis_to_double_dup(0x1);
	/* upper 16 bits are 0x0100: the multiplier used by fmul8x16au */
	mlib_f32 unit = vis_to_float(0x1000000);
	mlib_d64 raw, split, even, odd, avg_a, avg_b;
	mlib_s32 k, mask;

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(6 << 3);
	/* shuffle mask: even-indexed bytes first, then odd-indexed bytes */
	vis_write_bmask(0x02461357, 0);

#pragma pipeloop(0)
	for (k = 0; k <= n - 16; k += 16) {
		raw = in[0];
		split = vis_bshuffle(raw, raw);
		even = vis_fmul8x16au(vis_read_hi(split), unit);
		odd = vis_fmul8x16au(vis_read_lo(split), unit);
		avg_a = vis_fpadd16(vis_fpadd16(even, odd), rnd);

		raw = in[1];
		split = vis_bshuffle(raw, raw);
		even = vis_fmul8x16au(vis_read_hi(split), unit);
		odd = vis_fmul8x16au(vis_read_lo(split), unit);
		avg_b = vis_fpadd16(vis_fpadd16(even, odd), rnd);

		in += 2;
		out[0] = vis_fpack16_pair(avg_a, avg_b);
		out++;
	}

	if (k < n) {
		/* first word is a normal load */
		raw = in[0];
		split = vis_bshuffle(raw, raw);
		even = vis_fmul8x16au(vis_read_hi(split), unit);
		odd = vis_fmul8x16au(vis_read_lo(split), unit);
		avg_a = vis_fpadd16(vis_fpadd16(even, odd), rnd);

		/* second word is loaded with the non-faulting form */
		raw = vis_ld_d64_nf(in + 1);
		split = vis_bshuffle(raw, raw);
		even = vis_fmul8x16au(vis_read_hi(split), unit);
		odd = vis_fmul8x16au(vis_read_lo(split), unit);
		avg_b = vis_fpadd16(vis_fpadd16(even, odd), rnd);

		/* store only up to the last valid destination byte */
		mask = vis_edge8(out, (dst + (n / 2) - 1));
		vis_pst_8(vis_fpack16_pair(avg_a, avg_b), out, mask);
	}

	return (MLIB_SUCCESS);
}
void mlib_v_VideoColorYUV2RGB444_all_align( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4]; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f375); mlib_d64 k02 = vis_to_double_dup(0x3317e5fa); mlib_d64 k11 = vis_to_double_dup(0xf3754097); mlib_d64 k12 = vis_to_double_dup(0xe5fa0000); mlib_d64 k21 = vis_to_double_dup(0x40970000); mlib_d64 k22 = vis_to_double_dup(0x00003317); mlib_d64 c_0 = vis_to_double_dup(0xe42010f4); mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60); mlib_d64 c_2 = vis_to_double_dup(0xdd60e420); mlib_d64 k_0 = vis_to_double_dup(0x25432543); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = n >> 2; buff2 = pbuff_arr2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)u; sf2 = (mlib_f32 *)v; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; mlib_d64 d_0235, d_xx14, d_23xx, d_0145; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); /* * merge buff values to 3-channel array */ d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); pfd[0] = vis_read_hi(d_0145); pfd[1] = vis_read_hi(d_23xx); pfd[2] = vis_read_lo(d_0145); buff2 += 
2; pfd += 3; } if ((mlib_u8 *)pfd <= dend) { mlib_d64 d_0235, d_xx14, d_23xx, d_0145; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_23xx); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; u += n; v += n; rgb += 3 * n; size -= n; } while (size); }
/*
 * JFIF YCbCr 4:2:0 -> packed RGB conversion of two rows, with
 * nearest-neighbour chroma upsampling.
 *
 *   rgb0, rgb1 - destination rows (3 bytes per pixel)
 *   y0, y1     - luma rows, read as mlib_d64 (8 pixels per load)
 *   cb, cr     - chroma rows, read as mlib_f32 (4 samples per load); each
 *                sample is duplicated horizontally with vis_fpmerge and
 *                shared by both luma rows
 *   n          - number of pixels per row
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The code is software-pipelined: the prologue computes red/green/blue for
 * the first 8 pixels of both rows, and each loop iteration stores the
 * previously computed 8 pixels while computing the next 8.  All loads in
 * the pipeline use the non-faulting forms.  The last n % 8 pixels are
 * written by the STORE_PIXEL1 / STORE_PIXEL2 macros.
 *
 * NOTE(review): several variable names (u_3920, v_8132, u_20184, v_15966,
 * y_11644) carry coefficients of a different conversion; the values
 * actually applied are the JFIF ones given by k12, k34 and k5 below.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB420_Nearest(
	mlib_u8 *rgb0,
	mlib_u8 *rgb1,
	const mlib_u8 *y0,
	const mlib_u8 *y1,
	const mlib_u8 *cb,
	const mlib_u8 *cr,
	mlib_s32 n)
{
/* pointers to dst address */
	mlib_u8 *dp1, *dp2;

/* aligned pointers to the two luma rows */
	mlib_d64 *spy1, *spy2;

/* aligned pointers to cb, cr */
	mlib_f32 *dfu, *dfv;

/* cb, cr data */
	mlib_f32 fu, fv;

/* luma data */
	mlib_d64 dy1, dy2;

/* cb, cr data with every sample duplicated */
	mlib_d64 du, dv;

/* (1.00000, 1.40200)*8192 */
	mlib_f32 k12 = vis_to_float(0x20002cdd);

/* (-.34414, -.71414)*8192 */
	mlib_f32 k34 = vis_to_float(0xf4fde926);

/* 1.77200*8192 */
	mlib_f32 k5 = vis_to_float(0x10038b4);

/* (179.45600 - 0.5)*32 */
	mlib_d64 k_179_456 = vis_to_double(0x165f165f, 0x165f165f);

/* (135.45984 + 0.5)*32 */
	mlib_d64 k_135_45984 = vis_to_double(0x10ff10ff, 0x10ff10ff);

/* (226.81600 - 0.5)*32 */
	mlib_d64 k_226_816 = vis_to_double(0x1c4a1c4a, 0x1c4a1c4a);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 z_11644_hi, z_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
	    temp_b_lo;

/* loop variable */
	mlib_s32 i;
	mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21;
	mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22;

	if (n <= 0)
		return (MLIB_FAILURE);

/*
 * initialize GSR scale factor
 */
	vis_write_gsr((2 << 3) + 7);

	dp1 = (mlib_u8 *)rgb0;
	dp2 = (mlib_u8 *)rgb1;
	ddp1 = (mlib_d64 *)dp1;
	ddp2 = (mlib_d64 *)dp2;

	spy1 = (mlib_d64 *)y0;
	spy2 = (mlib_d64 *)y1;
	dfu = (mlib_f32 *)cb;
	dfv = (mlib_f32 *)cr;

	/* prologue: first 4 chroma samples, each duplicated horizontally */
	fu = vis_ld_f32_nf(dfu);
	dfu++;
	fv = vis_ld_f32_nf(dfv);
	dfv++;

	du = vis_fpmerge(fu, fu);
	dv = vis_fpmerge(fv, fv);

/* Cb*(-0.34414) */
	u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);

/* Cr*(-0.71414) */
	v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);

/* Cb*(-0.34414) */
	u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);

/* Cr*(-0.71414) */
	v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

	dy1 = vis_ld_d64_nf(spy1);
	spy1++;
	dy2 = vis_ld_d64_nf(spy2);
	spy2++;

/* Cb*1.772 */
	u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
	g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
	u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
	g_hi = vis_fpadd16(g_hi, k_135_45984);

/* Cr*1.402 */
	v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
	g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
	v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
	g_lo = vis_fpadd16(g_lo, k_135_45984);

/* first-row Y*1.0 */
	y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
	b_hi = vis_fpsub16(u_20184_hi, k_226_816);

/* first-row Y*1.0 */
	y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
	b_lo = vis_fpsub16(u_20184_lo, k_226_816);

/* second-row Y*1.0 */
	z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
	r_hi = vis_fpsub16(v_15966_hi, k_179_456);

/* second-row Y*1.0 */
	z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
	r_lo = vis_fpsub16(v_15966_lo, k_179_456);

	/* first row: add luma to the chroma terms and pack to bytes */
	temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
	temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
	green1 = vis_fpack16_to_hi(green1, temp_g_hi);
	temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
	blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
	temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

	/* chroma for the next group is loaded early */
	fu = vis_ld_f32_nf(dfu);
	dfu++;
	red1 = vis_fpack16_to_hi(red1, temp_r_hi);
	temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

	fv = vis_ld_f32_nf(dfv);
	dfv++;
	green1 = vis_fpack16_to_lo(green1, temp_g_lo);
	temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

	blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
	du = vis_fpmerge(fu, fu);

	red1 = vis_fpack16_to_lo(red1, temp_r_lo);
	dv = vis_fpmerge(fv, fv);

/* Cb*(-0.34414) */
	u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
	temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* Cr*(-0.71414) */
	v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
	temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

	/* second row: same chroma terms, its own luma */
	green2 = vis_fpack16_to_hi(green2, temp_g_hi);
	temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

	blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
	temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

	red2 = vis_fpack16_to_hi(red2, temp_r_hi);
	temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

	green2 = vis_fpack16_to_lo(green2, temp_g_lo);
	temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

	blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);

	red2 = vis_fpack16_to_lo(red2, temp_r_lo);

	u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);

	v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

	dy1 = vis_ld_d64_nf(spy1);
	spy1++;
	dy2 = vis_ld_d64_nf(spy2);
	spy2++;

/*
 * 16-pixel column loop (8 pixels in each of the two rows)
 */
#pragma pipeloop(0)
	for (i = 0; i <= n - 8; i += 8) {

		/*
		 * interleave red/green, then blend in blue: three shuffle
		 * masks for each stage give the three 8-byte output words
		 */
		vis_write_bmask(0x0801902A, 0);
		dd01 = vis_bshuffle(red1, green1);
		dd02 = vis_bshuffle(red2, green2);
		vis_write_bmask(0x03B04C05, 0);
		dd11 = vis_bshuffle(red1, green1);
		dd12 = vis_bshuffle(red2, green2);
		vis_write_bmask(0xD06E07F0, 0);
		dd21 = vis_bshuffle(red1, green1);
		dd22 = vis_bshuffle(red2, green2);
		vis_write_bmask(0x01834967, 0);
		ddp1[0] = vis_bshuffle(dd01, blue1);
		ddp2[0] = vis_bshuffle(dd02, blue2);
		vis_write_bmask(0xA12B45C7, 0);
		ddp1[1] = vis_bshuffle(dd11, blue1);
		ddp2[1] = vis_bshuffle(dd12, blue2);
		vis_write_bmask(0x0D23E56F, 0);
		ddp1[2] = vis_bshuffle(dd21, blue1);
		ddp2[2] = vis_bshuffle(dd22, blue2);

		/* compute the next 8 pixels of both rows (same as prologue) */

/* Cb*1.772 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_45984);

/* Cr*1.402 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_45984);

/* first-row Y*1.0 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_226_816);

/* first-row Y*1.0 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_226_816);

/* second-row Y*1.0 */
		z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12);
		r_hi = vis_fpsub16(v_15966_hi, k_179_456);

/* second-row Y*1.0 */
		z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12);
		r_lo = vis_fpsub16(v_15966_lo, k_179_456);

		temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
		temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
		green1 = vis_fpack16_to_hi(green1, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
		blue1 = vis_fpack16_to_hi(blue1, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);

		fu = vis_ld_f32_nf(dfu);
		dfu++;
		red1 = vis_fpack16_to_hi(red1, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

		fv = vis_ld_f32_nf(dfv);
		dfv++;
		green1 = vis_fpack16_to_lo(green1, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);

		blue1 = vis_fpack16_to_lo(blue1, temp_b_lo);
		du = vis_fpmerge(fu, fu);

		red1 = vis_fpack16_to_lo(red1, temp_r_lo);
		dv = vis_fpmerge(fv, fv);

/* Cb*(-0.34414) */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		temp_g_hi = vis_fpadd16(g_hi, z_11644_hi);

/* Cr*(-0.71414) */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		temp_b_hi = vis_fpadd16(b_hi, z_11644_hi);

		green2 = vis_fpack16_to_hi(green2, temp_g_hi);
		temp_r_hi = vis_fpadd16(r_hi, z_11644_hi);

		blue2 = vis_fpack16_to_hi(blue2, temp_b_hi);
		temp_g_lo = vis_fpadd16(g_lo, z_11644_lo);

		red2 = vis_fpack16_to_hi(red2, temp_r_hi);
		temp_b_lo = vis_fpadd16(b_lo, z_11644_lo);

		green2 = vis_fpack16_to_lo(green2, temp_g_lo);
		temp_r_lo = vis_fpadd16(r_lo, z_11644_lo);

		blue2 = vis_fpack16_to_lo(blue2, temp_b_lo);

		red2 = vis_fpack16_to_lo(red2, temp_r_lo);

		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);

		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		dy1 = vis_ld_d64_nf(spy1);
		spy1++;
		dy2 = vis_ld_d64_nf(spy2);
		spy2++;

		ddp1 += 3;
		ddp2 += 3;
	}

	/*
	 * Tail: n - i (0..7) pixels are already computed in red/green/blue.
	 * The colour words are rotated by (n - i) bytes and the destination
	 * pointers are moved to the LAST remaining pixel; the loop below then
	 * walks backwards, one pixel per iteration.
	 * NOTE(review): STORE_PIXEL1 / STORE_PIXEL2 are defined elsewhere;
	 * presumably they store one RGB triple at dp1 / dp2 and rotate the
	 * colour words using the GSR offset 7 set just before the loop.
	 */
	dp1 = (mlib_u8 *)ddp1;
	dp2 = (mlib_u8 *)ddp2;

	vis_alignaddr((void *)(n - i), 0);
	blue1 = vis_faligndata(blue1, blue1);
	green1 = vis_faligndata(green1, green1);
	red1 = vis_faligndata(red1, red1);
	dp1 += ((n - i - 1) * 3);
	blue2 = vis_faligndata(blue2, blue2);
	green2 = vis_faligndata(green2, green2);
	red2 = vis_faligndata(red2, red2);
	dp2 += ((n - i - 1) * 3);

	vis_alignaddr((void *)7, 0);
	for (; i < n; i++) {
		STORE_PIXEL1(0, 1, 2);
		STORE_PIXEL2(0, 1, 2);
		dp1 -= 3;
		dp2 -= 3;
	}

	return (MLIB_SUCCESS);
}
mlib_status __mlib_VideoColorJFIFYCC2RGB444( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *cb, const mlib_u8 *cr, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd; mlib_f32 fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f4fd); mlib_d64 k02 = vis_to_double_dup(0x2cdde926); mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4); mlib_d64 k12 = vis_to_double_dup(0xe9260000); mlib_d64 k21 = vis_to_double_dup(0x38b40000); mlib_d64 k22 = vis_to_double_dup(0x00002cdd); mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff); mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6); mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1); mlib_d64 k_0 = vis_to_double_dup(0x20002000); if (size <= 0) return (MLIB_FAILURE); vis_write_gsr((2 << 3) + 2); vis_write_bmask(0x0489AB37, 0); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = (n - 1) >> 2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)cb; sf2 = (mlib_f32 *)cr; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) #pragma unroll(4) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_d64 d_0235, d_0145; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpack16_pair(s00, s10); s20 = vis_freg_pair(vis_fpack16(s20), fzero); d_0145 = vis_bshuffle(d_0235, s20); d_0235 = vis_fpack32(d_0235, d_0235); d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235)); pfd[0] = vis_read_hi(d_0145); pfd[1] = 
vis_read_hi(d_0235); pfd[2] = vis_read_lo(d_0145); pfd += 3; } /* * last pixels */ if ((mlib_u8 *)pfd <= dend) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_d64 d_0235, d_xx14, d_0145; mlib_f32 x0, x1, x2; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; x0 = *sf0; x1 = *sf1; x2 = *sf2; s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpack16_pair(s00, s10); d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero); d_0145 = vis_bshuffle(d_0235, d_xx14); d_0235 = vis_fpack32(d_0235, d_0235); d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235)); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_0235); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; cb += n; cr += n; rgb += 3 * n; size -= n; } while (size); return (MLIB_SUCCESS); }
/*
 * Converts a vector of mlib_s8 to mlib_s32 (sign extension).
 *
 *   z - destination, n elements
 *   x - source, n elements
 *   n - element count
 *
 * One leading element is converted in scalar code when z is not 8-byte
 * aligned, so that the vector loop can store whole mlib_d64 words.  The
 * vector loop converts 8 elements per iteration; separate loops handle an
 * unaligned and an aligned source.  Remaining elements are converted one
 * by one.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VectorConvert_S32_S8_Mod(
	mlib_s32 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *in = (mlib_s8 *)x;
	mlib_s32 *out = (mlib_s32 *)z;
	/* 16-bit lanes of 1: multiplier for the 16 -> 32 bit widening */
	mlib_f32 unit32 = vis_to_float(0x10001);
	/* 16-bit lanes of 0x0100: multiplier for the 8 -> 16 bit widening */
	mlib_d64 unit16 = vis_to_double_dup(0x1000100);
	mlib_d64 *win, *o64;
	mlib_d64 prev, next, cur, w0, w1, w2, w3;
	mlib_s32 k = 0;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* align the destination to 8 bytes */
	if ((mlib_addr)out & 7) {
		*out = *in;
		out++;
		in++;
		k = 1;
	}

	win = (mlib_d64 *)vis_alignaddr(in, 0);
	cur = vis_ld_d64_nf(win);
	/* shuffle mask that widens two 16-bit lanes to two 32-bit lanes */
	vis_write_bmask(0x00012223, 0);

	if ((mlib_addr)in & 7) {
		/* unaligned source: realign with vis_faligndata */
		next = vis_ld_d64_nf(win + 1);
		cur = vis_faligndata(cur, next);

#pragma pipeloop(1)
#pragma unroll(1)
		for (; k <= (n - 8); k += 8) {
			w1 = vis_fpmerge(vis_read_hi(cur), vis_read_hi(cur));
			w1 = vis_fmul8sux16(w1, unit16);
			w0 = vis_bshuffle(w1, w1);
			w1 = vis_fmuld8ulx16(unit32, vis_read_lo(w1));

			w3 = vis_fpmerge(vis_read_lo(cur), vis_read_lo(cur));
			w3 = vis_fmul8sux16(w3, unit16);
			w2 = vis_fmuld8ulx16(unit32, vis_read_hi(w3));
			w3 = vis_fmuld8ulx16(unit32, vis_read_lo(w3));

			/* fetch the source for the next iteration */
			prev = next;
			next = vis_ld_d64_nf(win + 2);
			cur = vis_faligndata(prev, next);

			o64 = (mlib_d64 *)out;
			o64[0] = w0;
			o64[1] = w1;
			o64[2] = w2;
			o64[3] = w3;

			out += 8;
			in += 8;
			win++;
		}
	} else {
		/* aligned source */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; k <= (n - 8); k += 8) {
			w1 = vis_fpmerge(vis_read_hi(cur), vis_read_hi(cur));
			w1 = vis_fmul8sux16(w1, unit16);
			w0 = vis_bshuffle(w1, w1);
			w1 = vis_fmuld8ulx16(unit32, vis_read_lo(w1));

			w3 = vis_fpmerge(vis_read_lo(cur), vis_read_lo(cur));
			w3 = vis_fmul8sux16(w3, unit16);
			w2 = vis_bshuffle(w3, w3);
			w3 = vis_fmuld8ulx16(unit32, vis_read_lo(w3));

			/* fetch the source for the next iteration */
			cur = vis_ld_d64_nf(win + 1);

			o64 = (mlib_d64 *)out;
			o64[0] = w0;
			o64[1] = w1;
			o64[2] = w2;
			o64[3] = w3;

			out += 8;
			in += 8;
			win++;
		}
	}

	/* scalar tail */
	while (k < n) {
		*out = *in;
		out++;
		in++;
		k++;
	}

	return (MLIB_SUCCESS);
}
mlib_status __mlib_VideoColorMerge3_S16( mlib_s16 *colors, const mlib_s16 *color1, const mlib_s16 *color2, const mlib_s16 *color3, mlib_s32 n) { mlib_d64 *dp = (mlib_d64 *)colors; mlib_d64 *sp0 = (mlib_d64 *)color1; mlib_d64 *sp1 = (mlib_d64 *)color2; mlib_d64 *sp2 = (mlib_d64 *)color3; mlib_d64 sd0, sd1, sd2, sd3, sd4, sd5; mlib_d64 dd0, dd1, dd2, dd3, dd4, dd5; mlib_s32 i; #pragma pipeloop(1) for (i = 0; i <= (n - 8); i += 8) { sd0 = sp0[0]; sd1 = sp1[0]; sd2 = sp2[0]; sd3 = sp0[1]; sd4 = sp1[1]; sd5 = sp2[1]; vis_write_bmask(0x018923ab, 0); dd0 = vis_bshuffle(sd0, sd1); dd3 = vis_bshuffle(sd1, sd2); dd2 = vis_bshuffle(sd3, sd4); dd5 = vis_bshuffle(sd4, sd5); vis_write_bmask(0x45cd67ef, 0); dd1 = vis_bshuffle(sd0, sd1); dd4 = vis_bshuffle(sd3, sd4); vis_write_bmask(0x01238945, 0); dp[0] = vis_bshuffle(dd0, sd2); dp[3] = vis_bshuffle(dd2, sd5); dp[1] = vis_freg_pair(vis_read_lo(dd3), vis_read_hi(dd1)); dp[4] = vis_freg_pair(vis_read_lo(dd5), vis_read_hi(dd4)); vis_write_bmask(0xcd4567ef, 0); dp[2] = vis_bshuffle(dd1, sd2); dp[5] = vis_bshuffle(dd4, sd5); sp0 += 2; sp1 += 2; sp2 += 2; dp += 6; } if (i <= (n - 4)) { sd0 = sp0[0]; sd1 = sp1[0]; sd2 = sp2[0]; vis_write_bmask(0x018923ab, 0); dd0 = vis_bshuffle(sd0, sd1); dd3 = vis_bshuffle(sd1, sd2); vis_write_bmask(0x45cd67ef, 0); dd1 = vis_bshuffle(sd0, sd1); vis_write_bmask(0x01238945, 0); dp[0] = vis_bshuffle(dd0, sd2); dp[1] = vis_freg_pair(vis_read_lo(dd3), vis_read_hi(dd1)); vis_write_bmask(0xcd4567ef, 0); dp[2] = vis_bshuffle(dd1, sd2); sp0++; sp1++; sp2++; dp += 3; } for (; i < n; i++) { colors[3 * i] = ((mlib_u16 *)color1)[i]; colors[3 * i + 1] = ((mlib_u16 *)color2)[i]; colors[3 * i + 2] = ((mlib_u16 *)color3)[i]; } return (MLIB_SUCCESS); }
void ADD_SUFF(UshortGrayToByteGrayConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_u8 *dst_end; mlib_d64 s0, s1, ss; mlib_s32 i, j; if (width <= 8) { for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_u8 *dst = dstBase; for (i = 0; i < width; i++) { dst[i] = src[2*i]; } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } return; } if (srcScan == 2*width && dstScan == width) { width *= height; height = 1; } for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_u8 *dst = dstBase; mlib_d64 *sp; dst_end = dst + width; while (((mlib_s32)dst & 3) && dst < dst_end) { *dst++ = *src; src += 2; } if ((mlib_s32)src & 7) { sp = vis_alignaddr(src, 0); s1 = *sp++; #pragma pipeloop(0) for (; dst <= (dst_end - 4); dst += 4) { s0 = s1; s1 = *sp++; ss = vis_faligndata(s0, s1); ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss)); ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss)); *(mlib_f32*)dst = vis_read_hi(ss); src += 2*4; } } else { #pragma pipeloop(0) for (; dst <= (dst_end - 4); dst += 4) { ss = *(mlib_d64*)src; ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss)); ss = vis_fpmerge(vis_read_hi(ss), vis_read_lo(ss)); *(mlib_f32*)dst = vis_read_hi(ss); src += 2*4; } } while (dst < dst_end) { *dst++ = *src; src += 2; } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } }
/*
 * Converts a vector of mlib_u8 to mlib_s32 (zero extension).
 *
 *   z - destination, n elements
 *   x - source, n elements
 *   n - element count
 *
 * One leading element is converted in scalar code when z is not 8-byte
 * aligned, so that the vector loop can store whole mlib_d64 words.  The
 * vector loop converts 8 elements per iteration; separate loops handle an
 * unaligned and an aligned source.  Remaining elements are converted one
 * by one.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * The priming load of the first source word uses vis_ld_d64_nf, matching
 * every later look-ahead load in this function.  After the scalar
 * alignment step `psrc` may already point one past the end of `x` (n == 1
 * with an unaligned `z`), in which case the aligned word it falls into
 * can lie entirely outside the source buffer.
 */
mlib_status
__mlib_VectorConvert_S32_U8_Mod(
	mlib_s32 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *psrc = (mlib_u8 *)x;
	mlib_s32 *pdst = (mlib_s32 *)z;
	mlib_f32 fzero = vis_fzero(), fone1 = vis_to_float(0x100), fone2 =
	    vis_to_float(0x10001);
	mlib_d64 *dpsrc, dsrc0, dsrc1, dsrc, dst0, dst1, dst2, dst3;
	mlib_s32 i = 0, off;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* align the destination to 8 bytes */
	if ((mlib_addr)pdst & 7) {
		(*pdst++) = (*psrc++);
		i = 1;
	}

	dpsrc = (mlib_d64 *)vis_alignaddr(psrc, 0);
	/* non-faulting: the word may hold no valid source byte at all */
	dsrc = vis_ld_d64_nf(dpsrc);
	off = (mlib_addr)psrc & 7;

	if (off) {
		/*
		 * Unaligned source.  The shuffle mask both realigns the data
		 * by `off` bytes and reorders it as b4 b0 b5 b1 b6 b2 b7 b3,
		 * so that b0..b3 sit in the low byte of each 16-bit lane.
		 * After a rotation by 7 bytes (GSR offset 7) b4..b7 do.
		 */
		dsrc1 = dsrc;
		vis_alignaddr((void *)0, 7);
		vis_write_bmask(0x11111111 * off, 0x40516273);
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i <= (n - 8); i += 8) {
			dsrc0 = dsrc1;
			dsrc1 = vis_ld_d64_nf(dpsrc + 1);
			dsrc = vis_bshuffle(dsrc0, dsrc1);
			dst0 = vis_fmuld8ulx16(vis_read_hi(dsrc), fone2);
			dst1 = vis_fmuld8ulx16(vis_read_lo(dsrc), fone2);
			dsrc = vis_faligndata(dsrc, dsrc);
			dst2 = vis_fmuld8ulx16(vis_read_hi(dsrc), fone2);
			dst3 = vis_fmuld8ulx16(vis_read_lo(dsrc), fone2);
			((mlib_d64 *)pdst)[0] = dst0;
			((mlib_d64 *)pdst)[1] = dst1;
			((mlib_d64 *)pdst)[2] = dst2;
			((mlib_d64 *)pdst)[3] = dst3;
			pdst += 8;
			psrc += 8;
			dpsrc++;
		}
	} else {
		/* aligned source: the two halves use different widenings */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; i <= (n - 8); i += 8) {
			dst1 = vis_fmul8x16al(vis_read_hi(dsrc), fone1);
			dst0 = vis_fpmerge(fzero, vis_read_hi(dst1));
			dst1 = vis_fpmerge(fzero, vis_read_lo(dst1));
			dst3 = vis_fpmerge(vis_read_lo(dsrc),
			    vis_read_lo(dsrc));
			dst2 = vis_fmuld8ulx16(vis_read_hi(dst3), fone2);
			dst3 = vis_fmuld8ulx16(vis_read_lo(dst3), fone2);
			/* fetch the source for the next iteration */
			dsrc = vis_ld_d64_nf(dpsrc + 1);
			((mlib_d64 *)pdst)[0] = dst0;
			((mlib_d64 *)pdst)[1] = dst1;
			((mlib_d64 *)pdst)[2] = dst2;
			((mlib_d64 *)pdst)[3] = dst3;
			pdst += 8;
			psrc += 8;
			dpsrc++;
		}
	}

	/* scalar tail */
	for (; i < n; i++)
		(*pdst++) = (*psrc++);

	return (MLIB_SUCCESS);
}
void ADD_SUFF(ByteGrayToIntArgbConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 d0, d1, d2, d3; mlib_f32 ff, aa = vis_fones(); mlib_s32 i, j, x; if (width < 8) { for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; for (i = 0; i < width; i++) { x = src[i]; dst[i] = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } return; } if (srcScan == width && dstScan == 4*width) { width *= height; height = 1; } for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; mlib_s32 *dst_end; dst_end = dst + width; while (((mlib_s32)src & 3) && dst < dst_end) { x = *src++; *dst++ = Gray2Argb(x); } #pragma pipeloop(0) for (; dst <= (dst_end - 4); dst += 4) { ff = *(mlib_f32*)src; d0 = vis_fpmerge(aa, ff); d1 = vis_fpmerge(ff, ff); d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1)); d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1)); ((mlib_f32*)dst)[0] = vis_read_hi(d2); ((mlib_f32*)dst)[1] = vis_read_lo(d2); ((mlib_f32*)dst)[2] = vis_read_hi(d3); ((mlib_f32*)dst)[3] = vis_read_lo(d3); src += 4; } while (dst < dst_end) { x = *src++; *dst++ = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } }
void ADD_SUFF(ByteGrayToIntArgbScaleConvert)(SCALE_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 d0, d1, d2, d3, dd; mlib_f32 ff, aa = vis_fones(); mlib_s32 i, j, x; if (width < 16) { for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; mlib_s32 tmpsxloc = sxloc; PTR_ADD(src, (syloc >> shift) * srcScan); for (i = 0; i < width; i++) { x = src[tmpsxloc >> shift]; tmpsxloc += sxinc; dst[i] = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); syloc += syinc; } return; } vis_alignaddr(NULL, 7); for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; mlib_s32 *dst_end; mlib_s32 tmpsxloc = sxloc; PTR_ADD(src, (syloc >> shift) * srcScan); dst_end = dst + width; #pragma pipeloop(0) for (; dst <= (dst_end - 4); dst += 4) { LOAD_NEXT_U8(dd, src + ((tmpsxloc + 3*sxinc) >> shift)); LOAD_NEXT_U8(dd, src + ((tmpsxloc + 2*sxinc) >> shift)); LOAD_NEXT_U8(dd, src + ((tmpsxloc + sxinc) >> shift)); LOAD_NEXT_U8(dd, src + ((tmpsxloc ) >> shift)); tmpsxloc += 4*sxinc; ff = vis_read_hi(dd); d0 = vis_fpmerge(aa, ff); d1 = vis_fpmerge(ff, ff); d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1)); d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1)); ((mlib_f32*)dst)[0] = vis_read_hi(d2); ((mlib_f32*)dst)[1] = vis_read_lo(d2); ((mlib_f32*)dst)[2] = vis_read_hi(d3); ((mlib_f32*)dst)[3] = vis_read_lo(d3); } while (dst < dst_end) { x = src[tmpsxloc >> shift]; tmpsxloc += sxinc; *dst++ = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); syloc += syinc; } }
static mlib_status mlib_v_VideoColorYUV2RGB420_nonalign( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 rgb_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* pointers to src address */ mlib_u8 *sp2, *sp3, *sl2, *sl3; /* pointers to src address */ mlib_u8 *sp11, *sp12, *sl11, *sl12; /* pointers to dst address */ mlib_u8 *dp1, *dl1; /* pointers to dst address */ mlib_u8 *dp2, *dl2; /* all. pointer to y */ mlib_d64 *spy1, *spy2; /* all. pointers to u, v */ mlib_f32 *dfu, *dfv; /* y data */ mlib_d64 dy0, dy1, dy2, dy3, dy4, dy5; /* u, v data */ mlib_f32 fu0, fu1, fv0, fv1; mlib_d64 du, dv, du0, du1, dv0, dv1; /* (1.1644, 1.5966)*8192 */ mlib_f32 k12 = vis_to_float(0x25433317); /* (-.3920, -.8132)*8192 */ mlib_f32 k34 = vis_to_float(0xf375e5fa); /* 2.0184*8192 */ mlib_f32 k5 = vis_to_float(0x1004097); mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0); mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4); mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0); mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi; mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo; mlib_d64 y_11644_hi, y_11644_lo; mlib_d64 z_11644_hi, z_11644_lo; mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo; mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi, temp_b_lo; /* loop variables */ mlib_s32 i, j; mlib_s32 y_stride2 = 2 * y_stride; mlib_s32 rgb_stride2 = 2 * rgb_stride; mlib_s32 off2, off3; mlib_d64 red1, green1, blue1, *ddp1, dd01, dd11, dd21; mlib_d64 red2, green2, blue2, *ddp2, dd02, dd12, dd22; mlib_d64 *buf1, BUFF1[16 * 1024]; mlib_d64 *buf2, BUFF2[16 * 1024]; mlib_u8 *tmp1, *tmp2; if (width * 3 > 16 * 1024) { tmp1 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); tmp2 = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); buf1 = (mlib_d64 *)((mlib_addr)(tmp1 + 7) & ~7); buf2 = (mlib_d64 *)((mlib_addr)(tmp2 + 7) & ~7); } else { buf1 = (mlib_d64 *)BUFF1; buf2 = (mlib_d64 *)BUFF2; } /* * 
initialize GSR scale factor */ vis_write_gsr(2 << 3); sp11 = sl11 = (mlib_u8 *)y; sp12 = sl12 = (mlib_u8 *)y + y_stride; sp2 = sl2 = (mlib_u8 *)u; sp3 = sl3 = (mlib_u8 *)v; dp1 = (mlib_u8 *)buf1; dp2 = (mlib_u8 *)buf2; dl1 = (mlib_u8 *)rgb; dl2 = (mlib_u8 *)(rgb + rgb_stride); ddp1 = (mlib_d64 *)dp1; ddp2 = (mlib_d64 *)dp2; /* * row loop */ for (j = 0; j < height / 2; j++) { spy1 = (mlib_d64 *)vis_alignaddr(sp11, 0); spy2 = (mlib_d64 *)vis_alignaddr(sp12, 0); dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3); off2 = (sp2 - (mlib_u8 *)dfu) * 2; dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3); off3 = (sp3 - (mlib_u8 *)dfv) * 2; vis_alignaddr((void *)off2, 0); fu0 = (*dfu++); fu1 = vis_ld_f32_nf(dfu); dfu++; du0 = vis_fpmerge(fu0, fu0); du1 = vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; vis_alignaddr((void *)off3, 0); fv0 = (*dfv++); fv1 = vis_ld_f32_nf(dfv); dfv++; dv0 = vis_fpmerge(fv0, fv0); dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); /* U*(-0.3920); */ u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); /* V*(-0.8132); */ v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); dy0 = (*spy1++); dy4 = (*spy2++); dy3 = vis_ld_d64_nf(spy1); spy1++; vis_alignaddr(sp11, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; dy5 = vis_ld_d64_nf(spy2); spy2++; vis_alignaddr(sp12, 0); dy2 = vis_faligndata(dy4, dy5); dy4 = dy5; /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); /* 
Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* Z*1.1644 */ z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* Z*1.1644 */ z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, y_11644_hi); green1 = vis_fpack16_to_hi(green1, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue1 = vis_fpack16_to_hi(blue1, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); vis_alignaddr((void *)off2, 0); fu1 = vis_ld_f32_nf(dfu); dfu++; du1 = vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; red1 = vis_fpack16_to_hi(red1, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); vis_alignaddr((void *)off3, 0); fv1 = vis_ld_f32_nf(dfv); dfv++; dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; green1 = vis_fpack16_to_lo(green1, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue1 = vis_fpack16_to_lo(blue1, temp_b_lo); red1 = vis_fpack16_to_lo(red1, temp_r_lo); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); temp_g_hi = vis_fpadd16(g_hi, z_11644_hi); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); temp_b_hi = vis_fpadd16(b_hi, z_11644_hi); green2 = vis_fpack16_to_hi(green2, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, z_11644_hi); blue2 = vis_fpack16_to_hi(blue2, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, z_11644_lo); red2 = vis_fpack16_to_hi(red2, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, z_11644_lo); green2 = vis_fpack16_to_lo(green2, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, z_11644_lo); blue2 = vis_fpack16_to_lo(blue2, temp_b_lo); red2 = vis_fpack16_to_lo(red2, temp_r_lo); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); /* * 16-pixel column loop */ #pragma pipeloop(0) for (i = 0; i <= width - 8; i += 8) { 
vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(red1, green1); dd02 = vis_bshuffle(red2, green2); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(red1, green1); dd12 = vis_bshuffle(red2, green2); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(red1, green1); dd22 = vis_bshuffle(red2, green2); vis_write_bmask(0x01834967, 0); ddp1[0] = vis_bshuffle(dd01, blue1); ddp2[0] = vis_bshuffle(dd02, blue2); vis_write_bmask(0xA12B45C7, 0); ddp1[1] = vis_bshuffle(dd11, blue1); ddp2[1] = vis_bshuffle(dd12, blue2); vis_write_bmask(0x0D23E56F, 0); ddp1[2] = vis_bshuffle(dd21, blue1); ddp2[2] = vis_bshuffle(dd22, blue2); dy3 = vis_ld_d64_nf(spy1); spy1++; vis_alignaddr(sp11, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; dy5 = vis_ld_d64_nf(spy2); spy2++; vis_alignaddr(sp12, 0); dy2 = vis_faligndata(dy4, dy5); dy4 = dy5; /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); /* Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* Z*1.1644 */ z_11644_hi = vis_fmul8x16au(vis_read_hi(dy2), k12); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* Z*1.1644 */ z_11644_lo = vis_fmul8x16au(vis_read_lo(dy2), k12); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, y_11644_hi); green1 = vis_fpack16_to_hi(green1, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue1 = vis_fpack16_to_hi(blue1, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); vis_alignaddr((void *)off2, 0); fu1 = vis_ld_f32_nf(dfu); dfu++; du1 = 
vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; red1 = vis_fpack16_to_hi(red1, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); vis_alignaddr((void *)off3, 0); fv1 = vis_ld_f32_nf(dfv); dfv++; dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; green1 = vis_fpack16_to_lo(green1, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue1 = vis_fpack16_to_lo(blue1, temp_b_lo); red1 = vis_fpack16_to_lo(red1, temp_r_lo); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); temp_g_hi = vis_fpadd16(g_hi, z_11644_hi); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); temp_b_hi = vis_fpadd16(b_hi, z_11644_hi); green2 = vis_fpack16_to_hi(green2, temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, z_11644_hi); blue2 = vis_fpack16_to_hi(blue2, temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, z_11644_lo); red2 = vis_fpack16_to_hi(red2, temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, z_11644_lo); green2 = vis_fpack16_to_lo(green2, temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, z_11644_lo); blue2 = vis_fpack16_to_lo(blue2, temp_b_lo); red2 = vis_fpack16_to_lo(red2, temp_r_lo); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); ddp1 += 3; ddp2 += 3; } dp1 = (mlib_u8 *)ddp1; dp2 = (mlib_u8 *)ddp2; vis_alignaddr((void *)(width - i), 0); blue1 = vis_faligndata(blue1, blue1); green1 = vis_faligndata(green1, green1); red1 = vis_faligndata(red1, red1); dp1 += ((width - i - 1) * 3); blue2 = vis_faligndata(blue2, blue2); green2 = vis_faligndata(green2, green2); red2 = vis_faligndata(red2, red2); dp2 += ((width - i - 1) * 3); vis_alignaddr((void *)7, 0); for (; i < width; i++) { STORE_PIXEL1(0, 1, 2); STORE_PIXEL2(0, 1, 2); dp1 -= 3; dp2 -= 3; } sp11 = sl11 = sl11 + y_stride2; sp12 = sl12 = sl12 + y_stride2; sp2 = sl2 = sl2 + uv_stride; sp3 = sl3 = sl3 + uv_stride; __mlib_VectorCopy_U8(dl1, (mlib_u8 *)buf1, width * 3); __mlib_VectorCopy_U8(dl2, (mlib_u8 *)buf2, width * 3); dl1 = dp1 = dl1 + 
rgb_stride2; dl2 = dp2 = dl2 + rgb_stride2; dp1 = (mlib_u8 *)buf1; dp2 = (mlib_u8 *)buf2; ddp1 = (mlib_d64 *)dp1; ddp2 = (mlib_d64 *)dp2; } if (width * 3 > 16 * 1024) { __mlib_free(tmp1); __mlib_free(tmp2); } return (MLIB_SUCCESS); }
/*
 * Convert n 4-byte ARGB pixels to 3-byte RGB pixels.
 *
 * The main loop consumes 16 pixels (64 source bytes, 48 destination
 * bytes) per pass: the first 8 through EXTRACT_U8_43R, the other 8
 * through two SPLIT_S32 invocations.  Leftover pixels are converted one
 * at a time with SPLIT_S32_3_U8.
 *
 * NOTE(review): rgb and argb are accessed through mlib_f32 / mlib_d64
 * pointers without any alignment test - presumably the caller only
 * dispatches suitably aligned buffers here; confirm.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoColorARGB2RGB(
	mlib_u8 *rgb,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	/* destination, viewed as 4-byte words */
	mlib_f32 *dp = (mlib_f32 *)rgb;
	/* source, viewed as 8-byte words */
	mlib_d64 *sp = (mlib_d64 *)argb;
	/* 8-byte source data */
	mlib_d64 sd0, sd1, sd2, sd3;
	/* intermediate variables (used by EXTRACT_U8_43R) */
	mlib_d64 sda, sdb, sdc, sdd;
	mlib_d64 sde, sdf, sdg, sdh;
	/* dst data */
	mlib_d64 dd0, dd1, dd2;
	mlib_s32 i;

	if (n <= 0) {
		return (MLIB_FAILURE);
	}

#pragma pipeloop(1)
	for (i = 0; i <= (n - 16); i += 16) {
		mlib_u32 *dst32 = (mlib_u32 *)dp;
		mlib_u32 *src32 = (mlib_u32 *)sp;

		/* --r0g0b0--r1g1b1 */
		sd0 = sp[0];
		/* --r2g2b2--r3g3b3 */
		sd1 = sp[1];
		/* --r4g4b4--r5g5b5 */
		sd2 = sp[2];
		/* --r6g6b6--r7g7b7 */
		sd3 = sp[3];

		EXTRACT_U8_43R;

		/* pixels 0..7: dd0/dd1/dd2 hold the high and low output words */
		dp[0] = vis_read_hi(dd0);
		dp[1] = vis_read_hi(dd1);
		dp[2] = vis_read_hi(dd2);
		dp[3] = vis_read_lo(dd0);
		dp[4] = vis_read_lo(dd1);
		dp[5] = vis_read_lo(dd2);

		/* pixels 8..11 and 12..15: four source words -> three words */
		SPLIT_S32(dst32[6], dst32[7], dst32[8],
		    src32[8], src32[9], src32[10], src32[11]);
		SPLIT_S32(dst32[9], dst32[10], dst32[11],
		    src32[12], src32[13], src32[14], src32[15]);

		sp += 8;
		dp += 12;
	}

	/* leftover pixels, one at a time */
#pragma unroll(1)
	for (; i < n; i++) {
		SPLIT_S32_3_U8(((mlib_u32 *)argb)[i],
		    rgb[3 * i], rgb[3 * i + 1], rgb[3 * i + 2]);
	}

	return (MLIB_SUCCESS);
}
/*
 * Dot product of two U8C vectors of n elements (2 * n bytes each); the
 * two accumulated sums are returned in z[0] (sum_r) and z[1] (sum_i).
 *
 * The case of even address of vector x.
 *
 * x is walked in 8-byte aligned words; bytes before px in the first word
 * and after pxend in the last word are zeroed through a partial store
 * into edge[].  y is either read directly (same 8-byte phase as x) or
 * realigned with vis_faligndata.  The per-word arithmetic lives in the
 * DPROD_U8C / SUM_U8C / SUM_U8C_TAIL macros, which operate on the locals
 * declared below.  Partial sums are kept in 32-bit lanes and flushed into
 * the mlib_d64 totals at least every MAX_LOOP words.
 */
static void
mlib_VectorDotProd_U8C_al_x(
	mlib_d64 *z,
	const void *x,
	const void *y,
	mlib_s32 n)
{
	mlib_u8 *pxend, *px = (mlib_u8 *)x, *py = (mlib_u8 *)y;
	mlib_d64 sum_r = 0.0, sum_i = 0.0;
	mlib_d64 *dpx, *dpy, *dpxend;
	mlib_d64 dx, dy, dy0, dy1;
	mlib_d64 dx_r, dy_r, dy_i;
	mlib_d64 d_iih, d_iil, d_irh, d_irl, d_rih, d_ril, d_rrh, d_rrl;
	mlib_d64 d_ih, d_il, d_rh, d_rl;
	mlib_d64 ds_r, ds_i, ds1_r, ds1_i;
	mlib_d64 lb_mask = vis_to_double_dup(0x00FF00FF);
	mlib_d64 edge[2];
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask, off;
	mlib_d64 done = vis_to_double_dup(0x1000100);

	edge[0] = 0;
	edge[1] = 0;

	/* aligned word holding the first byte of x, and y shifted to match */
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	off = (mlib_addr)dpx - (mlib_addr)px;
	dpy = vis_alignaddr((void *)py, off);

	/* last byte of x and the aligned word that holds it */
	pxend = px + n + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));

	/* first x word with the bytes in front of px cleared */
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];
	dy = vis_ld_d64_nf(dpy);

	if (((((mlib_addr)px) ^ ((mlib_addr)py)) & 7) != 0) {
		/* x and y differ in 8-byte phase: realign y on the fly */
		dy1 = vis_ld_d64_nf(dpy+1);
		dy = vis_faligndata(dy, dy1);

		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;
			if (d_left > MAX_LOOP) {
				d_left = MAX_LOOP;
			}

			ds_i = ds_r = ds1_i = ds1_r = 0.0;

			while (d_left > 0) {
				DPROD_U8C;
				SUM_U8C;
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy+2);
				dx = vis_ld_d64_nf(dpx+1);
				dy = vis_faligndata(dy0, dy1);
				dpx++;
				dpy++;
				d_left--;
			}

			/* flush the 32-bit partial sums into the totals */
			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	} else {
		/* x and y share the same 8-byte phase: direct loads */
		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;
			if (d_left > MAX_LOOP) {
				d_left = MAX_LOOP;
			}

			ds_i = ds_r = ds1_i = ds1_r = 0.0;

			while (d_left > 0) {
				DPROD_U8C;
				SUM_U8C;
				dx = dpx[1];
				dy = dpy[1];
				dpx++;
				dpy++;
				d_left--;
			}

			/* flush the 32-bit partial sums into the totals */
			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	}

	/* last word: clear the bytes behind pxend before multiplying */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		DPROD_U8C;
		SUM_U8C_TAIL;
		fsum = vis_read_hi(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = sum_r;
	z[1] = sum_i;
#undef MAX_LOOP
}
static mlib_status mlib_v_VideoColorYUV2RGB444_nonalign( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 rgb_stride, mlib_s32 yuv_stride) { /* all. pointer to y, u, v */ mlib_d64 *spy, *dfu, *dfv; /* y data */ mlib_d64 dy0, dy1, dy3; mlib_d64 du, dv, du0, du1, dv0, dv1; /* (1.1644, 1.5966)*8192 */ mlib_f32 k12 = vis_to_float(0x25433317); /* (-.3920, -.8132)*8192 */ mlib_f32 k34 = vis_to_float(0xf375e5fa); /* 2.0184*8192 */ mlib_f32 k5 = vis_to_float(0x1004097); mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0); mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4); mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0); mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi; mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo; mlib_d64 y_11644_hi, y_11644_lo; mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo; mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2; /* loop variable */ mlib_s32 i, j; mlib_d64 *buf, BUFF[16 * 1024]; mlib_u8 *tmp, *dp; if (width * 3 > 16 * 1024) { tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); if (tmp == NULL) return (MLIB_FAILURE); buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7); } else { buf = (mlib_d64 *)BUFF; } dp = (mlib_u8 *)buf; ddp = (mlib_d64 *)dp; for (j = 0; j < height; j++) { dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0); du0 = (*dfu++); du1 = vis_ld_d64_nf(dfu); dfu++; du = vis_faligndata(du0, du1); du0 = du1; dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0); dv0 = (*dfv++); dv1 = vis_ld_d64_nf(dfv); dfv++; dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); /* U*(-0.3920); */ u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); /* V*(-0.8132); */ v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); spy = (mlib_d64 *)vis_alignaddr((void *)y, 0); dy0 = (*spy++); dy3 = vis_ld_d64_nf(spy); spy++; dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; /* U*2.0184 */ 
u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); vis_alignaddr((void *)u, 0); du1 = vis_ld_d64_nf(dfu); dfu++; du = vis_faligndata(du0, du1); du0 = du1; /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); vis_alignaddr((void *)v, 0); dv1 = vis_ld_d64_nf(dfv); dfv++; dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); g_hi = vis_fpadd16(g_hi, y_11644_hi); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); g_lo = vis_fpadd16(g_lo, y_11644_lo); green = vis_fpack16_pair(g_hi, g_lo); b_hi = vis_fpadd16(b_hi, y_11644_hi); b_lo = vis_fpadd16(b_lo, y_11644_lo); blue = vis_fpack16_pair(b_hi, b_lo); r_hi = vis_fpadd16(r_hi, y_11644_hi); r_lo = vis_fpadd16(r_lo, y_11644_lo); red = vis_fpack16_pair(r_hi, r_lo); vis_alignaddr((void *)y, 0); dy3 = vis_ld_d64_nf(spy); spy++; dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; #pragma pipeloop(0) for (i = 0; i <= width - 8; i += 8) { vis_write_bmask(0x0801902A, 0); dd0 = vis_bshuffle(red, green); vis_write_bmask(0x03B04C05, 0); dd1 = vis_bshuffle(red, green); vis_write_bmask(0xD06E07F0, 0); dd2 = vis_bshuffle(red, green); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd0, blue); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd1, blue); vis_write_bmask(0x0D23E56F, 0); ddp[2] = 
vis_bshuffle(dd2, blue); /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); vis_alignaddr((void *)u, 0); du1 = vis_ld_d64_nf(dfu); dfu++; du = vis_faligndata(du0, du1); du0 = du1; /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); vis_alignaddr((void *)v, 0); dv1 = vis_ld_d64_nf(dfv); dfv++; dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); g_hi = vis_fpadd16(g_hi, y_11644_hi); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); g_lo = vis_fpadd16(g_lo, y_11644_lo); green = vis_fpack16_pair(g_hi, g_lo); b_hi = vis_fpadd16(b_hi, y_11644_hi); b_lo = vis_fpadd16(b_lo, y_11644_lo); blue = vis_fpack16_pair(b_hi, b_lo); r_hi = vis_fpadd16(r_hi, y_11644_hi); r_lo = vis_fpadd16(r_lo, y_11644_lo); red = vis_fpack16_pair(r_hi, r_lo); vis_alignaddr((void *)y, 0); dy3 = vis_ld_d64_nf(spy); spy++; dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; ddp += 3; } dp = (mlib_u8 *)ddp; vis_alignaddr((void *)(width - i), 0); blue = vis_faligndata(blue, blue); green = vis_faligndata(green, green); red = vis_faligndata(red, red); dp += ((width - i - 1) * 3); vis_alignaddr((void *)spy, 7); for (; i < width; i++) { STORE_PIXEL(0, 1, 2); dp -= 3; } __mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3); rgb += rgb_stride; dp = (mlib_u8 
*)buf; ddp = (mlib_d64 *)dp; y += yuv_stride; u += yuv_stride; v += yuv_stride; } if (width * 3 > 16 * 1024) __mlib_free(tmp); return (MLIB_SUCCESS); }
/*
 * Euclidean distance between two S8 vectors of n elements, stored as a
 * double in z[0].
 *
 * x and y are read as 8-byte words, so this variant is for 8-byte
 * aligned inputs.  Full 8-element groups are accumulated by
 * MLIB_V_VECTORDISTANCE_S8_A8 into accd, whose two 32-bit halves are
 * then added together; the remaining n & 7 elements are squared and
 * summed in scalar code.
 */
static void
mlib_v_VectorDistance_S8_Sat_A8(
	mlib_d64 *z,
	const mlib_s8 *x,
	const mlib_s8 *y,
	mlib_s32 n)
{
	mlib_d64 *pdx, dx;
	mlib_d64 *pdy, dy;
	mlib_s8 *px, *py;
	mlib_d64 accd, accum;
	mlib_s32 *pacc;
	mlib_d64 item;
	mlib_d64 ones = vis_to_double_dup(0x01000100);

	/* temporaries used in macro */
	mlib_d64 dsrc1h, dsrc1l;
	mlib_d64 dsrc2h, dsrc2l;
	mlib_d64 ddiffh, ddiffl;
	mlib_f32 fdifhh, fdifhl;
	mlib_f32 fdiflh, fdifll;
	mlib_d64 dsqrhh, dsqrhl;
	mlib_d64 dsqrlh, dsqrll;
	mlib_d64 dsqrh, dsqrl;
	mlib_d64 dsqr;

	mlib_s32 i, nd8, nm8;

	/* number of whole 8-byte groups and of leftover elements */
	nd8 = n >> 3;
	nm8 = n & 0x7;

	pdx = (mlib_d64 *)x;
	pdy = (mlib_d64 *)y;
	accd = 0.0;

	vis_alignaddr(pdx, 1);

/* #pragma pipeloop(0) */
	for (i = 0; i < nd8; i++) {
		dx = (*pdx++);
		dy = (*pdy++);
		MLIB_V_VECTORDISTANCE_S8_A8(dx, dy, accd);
	}

	/* fold the two 32-bit partial sums and convert to double */
	fdifhh = vis_fpadd32s(vis_read_hi(accd), vis_read_lo(accd));
	pacc = (mlib_s32 *)&fdifhh;
	accum = (mlib_d64)pacc[0];

	/* leftover elements (none when nm8 == 0) */
	px = (mlib_s8 *)pdx;
	py = (mlib_s8 *)pdy;

	for (i = 0; i < nm8; i++, px++, py++) {
		item = (mlib_d64)(*px - *py);
		accum += item * item;
	}

	z[0] = mlib_sqrt(accum);
}
/*
 * Convert n unsigned 8-bit elements of x into signed 16-bit elements
 * of z.
 *
 * Vectors shorter than 16 elements are handed to the EXPAND macro.
 * Otherwise elements are copied one by one until z is 8-byte aligned,
 * then 8 source bytes are widened into two 8-byte stores per step
 * (a peeled step when the count is odd, then two steps per iteration),
 * and the last (length & 7) elements are copied in scalar code.
 *
 * Returns MLIB_SUCCESS on the path visible here.
 */
mlib_status
__mlib_VectorConvert_S16_U8_Mod(
	mlib_s16 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_u8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	mlib_s32 len_64, even_length, rest_64, length = n;
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 dd1, dd2, dd3, dd4;
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		EXPAND(mlib_u8, mlib_s16);
	}

	/* scalar head: bring the destination to an 8-byte boundary */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	dd2 = ddsrc[0];

	if ((mlib_addr)src & 7) {
		/*
		 * Source vector is not 64-bit aligned. Use vis_faligndata.
		 * Peeling the 1 iteration. Then loop with step==2.
		 */
		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1);
			i++;
			dd3 = vis_faligndata(dd1, dd2);
			ddst[0] = vis_fpmerge(fzero, vis_read_hi(dd3));
			ddst[1] = vis_fpmerge(fzero, vis_read_lo(dd3));
			ddst += 2;
		}

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i <= len_64; i += 2) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i);
			dd3 = vis_faligndata(dd1, dd2);
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_faligndata(dd1, dd2);
			ddst[0] = vis_fmul8x16al(vis_read_hi(dd3), fm);
			ddst[1] = vis_fpmerge(fzero, vis_read_lo(dd3));
			ddst[2] = vis_fmul8x16al(vis_read_hi(dd4), fm);
			ddst[3] = vis_fpmerge(fzero, vis_read_lo(dd4));
			ddst += 4;
		}
	} else {
		/*
		 * Both vectors are 64-bit aligned. We can process without
		 * vis_faligndata
		 * Peeling the 1 iteration. Then loop with step==2.
		 */
		i = len_64 & 1;

		if (i != 0) {
			dd1 = ddsrc[0];
			ddsrc += 1;
			ddst[0] = vis_fpmerge(fzero, vis_read_hi(dd1));
			ddst[1] = vis_fpmerge(fzero, vis_read_lo(dd1));
			ddst += 2;
		}

#pragma pipeloop(1)
#pragma unroll(1)
		for (; i < len_64; i += 2) {
			dd1 = ddsrc[0];
			dd2 = ddsrc[1];
			ddsrc += 2;
			ddst[0] = vis_fmul8x16al(vis_read_hi(dd1), fm);
			ddst[1] = vis_fpmerge(fzero, vis_read_lo(dd1));
			ddst[2] = vis_fmul8x16al(vis_read_hi(dd2), fm);
			ddst[3] = vis_fpmerge(fzero, vis_read_lo(dd2));
			ddst += 4;
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++) {
		dst[even_length + i] = src[even_length + i];
	}

	return (MLIB_SUCCESS);
}
mlib_status __mlib_VideoColorCMYK2JFIFYCCK444( mlib_u8 *y, mlib_u8 *cb, mlib_u8 *cr, mlib_u8 *k, const mlib_u8 *cmyk, mlib_s32 n) { mlib_d64 buff_arr[(SIZE / 2) + 2]; mlib_f32 *py, *pcb, *pcr, *pk; mlib_d64 *buff; mlib_d64 sdh, sdl, dr, dg, db, dd; mlib_s32 i, m, size, num; mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192)); mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192)); mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192)); mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 8192)); mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 8192)); mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 8192)); mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 8192)); mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 8192)); mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 8192)); mlib_d64 off128 = vis_to_double_dup(0x10101010); mlib_d64 off255 = vis_to_double_dup(0x1ff01ff0); vis_write_gsr(2 << 3); /* * 4-pixel loop */ for (size = 0; size < n; size += num) { num = n - size; if (num > SIZE) num = SIZE; m = (num + 3) / 4; mlib_channel_separate((mlib_d64 *)cmyk + size / 2, buff_arr, m); m = (num / 4) & ~1; py = (mlib_f32 *)y + size / 4; pcb = (mlib_f32 *)cb + size / 4; pcr = (mlib_f32 *)cr + size / 4; pk = (mlib_f32 *)k + size / 4; buff = buff_arr; #pragma pipeloop(0) for (i = 0; i < m; i++) { sdh = buff[0]; sdl = buff[1]; CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k11, k12, k13, off255, py[0]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k21, k22, k23, off128, pcb[0]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k31, k32, k33, off128, pcr[0]); py++; pcb++; pcr++; (*pk++) = vis_read_lo(sdl); buff += 2; } } if (n & 7) { mlib_s32 emask = (0xFF00 >> (n & 7)) & 0xFF; mlib_d64 rbuff[4]; mlib_f32 *prbuff = (mlib_f32 *)rbuff; sdh = (*buff++); sdl = (*buff++); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k11, k12, k13, off255, prbuff[0]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), 
k21, k22, k23, off128, prbuff[2]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k31, k32, k33, off128, prbuff[4]); prbuff[6] = vis_read_lo(sdl); sdh = (*buff++); sdl = (*buff++); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k11, k12, k13, off255, prbuff[1]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k21, k22, k23, off128, prbuff[3]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k31, k32, k33, off128, prbuff[5]); prbuff[7] = vis_read_lo(sdl); vis_pst_8(rbuff[0], py, emask); vis_pst_8(rbuff[1], pcb, emask); vis_pst_8(rbuff[2], pcr, emask); vis_pst_8(rbuff[3], pk, emask); }
/*
 * Convert n signed 8-bit elements of x into signed 16-bit elements of z.
 *
 * Vectors shorter than 16 elements are handed to the EXPAND macro.
 * Otherwise elements are copied one by one until z is 8-byte aligned,
 * then 8 source bytes are widened into two 8-byte stores per step
 * (a peeled step when the count is odd, then two steps per iteration),
 * and the last (length & 7) elements are copied in scalar code.
 *
 * Widening places each source byte in the upper byte of a 16-bit lane
 * and multiplies with vis_fmul8sux16 by 0x0100.
 *
 * Returns MLIB_SUCCESS on the path visible here.
 */
mlib_status
__mlib_VectorConvert_S16_S8_Mod(
	mlib_s16 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_s8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 fzero = vis_fzeros();
	mlib_s32 len_64, even_length, rest_64, length = n, off;
	mlib_d64 dd0, dd1, dd2, dd4, dd5, dd6, dd7;

	if (length < 16) {
		EXPAND(mlib_s8, mlib_s16);
	}

	/* scalar head: bring the destination to an 8-byte boundary */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	dd2 = ddsrc[0];
	off = (mlib_addr)src & 7;

	if (off) {
		/*
		 * Source vector is not 64-bit aligned.
		 * Peeling of 1 iteration. Then loop with step==2.
		 *
		 * The byte-shuffle mask both realigns the source by "off"
		 * bytes and interleaves bytes 0..3 with bytes 4..7; a
		 * one-byte vis_faligndata rotation then exposes the second
		 * half in the upper bytes of the 16-bit lanes.
		 */
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1);
			i++;
			dd4 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			ddst[0] = vis_fmul8sux16(dd4, four_16_ones);
			ddst[1] = vis_fmul8sux16(dd5, four_16_ones);
			ddst += 2;
		}

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i <= len_64; i += 2) {
			dd0 = dd2;
			dd1 = vis_ld_d64_nf(ddsrc + i);
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_bshuffle(dd0, dd1);
			dd6 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			dd7 = vis_faligndata(dd6, dd6);
			ddst[0] = vis_fmul8sux16(dd4, four_16_ones);
			ddst[1] = vis_fmul8sux16(dd5, four_16_ones);
			ddst[2] = vis_fmul8sux16(dd6, four_16_ones);
			ddst[3] = vis_fmul8sux16(dd7, four_16_ones);
			ddst += 4;
		}
	} else {
		/*
		 * Both vectors are 64-bit aligned.
		 * Peeling of 1 iteration.
		 */
		i = len_64 & 1;

		if (i != 0) {
			dd1 = ddsrc[0];
			ddsrc += 1;
			ddst[0] = vis_fmul8sux16(
			    vis_fpmerge(vis_read_hi(dd1), fzero),
			    four_16_ones);
			ddst[1] = vis_fmul8sux16(
			    vis_fpmerge(vis_read_lo(dd1), fzero),
			    four_16_ones);
			ddst += 2;
		}

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			dd1 = ddsrc[0];
			dd2 = ddsrc[1];
			ddsrc += 2;
			ddst[0] = vis_fmul8sux16(
			    vis_fpmerge(vis_read_hi(dd1), fzero),
			    four_16_ones);
			ddst[1] = vis_fmul8sux16(
			    vis_fpmerge(vis_read_lo(dd1), fzero),
			    four_16_ones);
			ddst[2] = vis_fmul8sux16(
			    vis_fpmerge(vis_read_hi(dd2), fzero),
			    four_16_ones);
			ddst[3] = vis_fmul8sux16(
			    vis_fpmerge(vis_read_lo(dd2), fzero),
			    four_16_ones);
			ddst += 4;
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++) {
		dst[even_length + i] = src[even_length + i];
	}

	return (MLIB_SUCCESS);
}
/*
 * MxN kernel pass over a single-channel MLIB_BYTE source image, with the
 * result written to dst through the per-row routine chosen from
 * func_dm_arr[] (selected by dm and by the colormap's search method).
 *
 * Parameters
 *   dst, src   destination / source images
 *   kernel     m * n integer coefficients, consumed from the last one
 *              backwards; only the first m * dn + dm (after dm/dn are
 *              mirrored) are used
 *   m, n       kernel width / height
 *   dm, dn     kernel anchor; mirrored internally (m - 1 - dm, n - 1 - dn)
 *   scale      coefficients are multiplied by 2^(-scale)
 *   colormap   passed to the LUT helpers and to the row routines
 *
 * Returns
 *   MLIB_SUCCESS     on completion
 *   MLIB_FAILURE     if src is not a 1-channel MLIB_BYTE image or an
 *                    allocation fails
 *   MLIB_OUTOFRANGE  if the scaled coefficients are too large
 *                    (mlib_ilogb(sum of |coeff|) > 13)
 *
 * Every exit path releases the heap blocks acquired so far.
 */
mlib_status
FUNC(
    MxN) (
	mlib_image *dst,
	const mlib_image *src,
	const mlib_s32 *kernel,
	mlib_s32 m,
	mlib_s32 n,
	mlib_s32 dm,
	mlib_s32 dn,
	mlib_s32 scale,
	const void *colormap)
{
	mlib_type stype, dtype;
	mlib_u8 *sl, *dl;
	mlib_u8 *lut_table;
	mlib_s32 offset, off, kw, dn1;
	mlib_s32 schan, dchan, sll, dll, sw, sh, dw, dh;
	mlib_s32 row, i, j, bsize, buff_ind = 0, func_ind, method;
	mlib_u16 *pbuff, *buff_lcl[2 * MAX_N], **buff_arr = buff_lcl, **buff;
	mlib_d64 *buffd;
	mlib_d64 kern_lcl[MAX_N * MAX_M], *kern = kern_lcl, *pkern;
	mlib_d64 dscale;
	func_dm_type func_dm;
	mlib_s32 vis_scale, kern_i;
	mlib_s32 kern_size, isum;
	mlib_d64 sum, norm;
	mlib_f32 fscale;
	mlib_s32 bit_offset;
	mlib_u8 *buff_dst;

	MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl);
	MLIB_IMAGE_GET_ALL_PARAMS(src, stype, schan, sw, sh, sll, sl);
	bit_offset = mlib_ImageGetBitOffset(dst);

	if (!(stype == MLIB_BYTE && schan == 1)) {
		return (MLIB_FAILURE);
	}

#if 0
	for (i = 0; i <= m * dn + dm; i++) {
		if (kernel[i])
			return (MLIB_FAILURE);
	}
#endif /* 0 */

	/* mirror the anchor: the kernel is consumed back to front below */
	dn = n - 1 - dn;
	dm = m - 1 - dm;
	kern_size = m * dn + dm;

	/*
	 * Kernels larger than the on-stack tables get one heap block that
	 * holds both the coefficients and the row-pointer table.
	 */
	if (n > MAX_N || m > MAX_M) {
		kern =
		    __mlib_malloc(n * m * sizeof (mlib_d64) +
		    2 * n * sizeof (mlib_u16 *));

		if (kern == NULL)
			return (MLIB_FAILURE);
		buff_arr = (mlib_u16 **)(kern + n * m);
	}

	/* dscale = 2^(-scale), built in steps that fit a 32-bit shift */
	dscale = 1.0;
	while (scale > 30) {
		dscale *= 1.0 / (1 << 30);
		scale -= 30;
	}

	dscale /= (1 << scale);

	/* load kernel */
	kernel += m * n - 1;
	sum = 0;
	for (i = 0; i < kern_size; i++) {
		kern[i] = dscale * kernel[-i];
		sum += mlib_fabs(kern[i]);
	}

	vis_scale = mlib_ilogb(sum);

	if (vis_scale > 13) {
		/* release the heap kernel block, as the other exits do */
		if (kern != kern_lcl)
			__mlib_free(kern);
		return (MLIB_OUTOFRANGE);
	}

	vis_scale = 14 - vis_scale;

	if (vis_scale > 15)
		vis_scale = 15;
	norm = 32768 >> (15 - vis_scale);
	isum = 0;

	/* round to 16-bit fixed point, replicated into all four lanes */
	for (i = 0; i < kern_size; i++) {
		if (kern[i] > 0.0) {
			kern_i = (mlib_s32)(kern[i] * norm + 0.5);
		} else {
			kern_i = (mlib_s32)(kern[i] * norm - 0.5);
		}

		isum += abs(kern_i);
		kern[i] = vis_to_double_dup((kern_i << 16) | (kern_i & 0xffff));
	}

	/* recalc without rounding */
	if (isum > 32767) {
		dscale *= norm;
		for (i = 0; i < kern_size; i++) {
			kern_i = (mlib_s32)(dscale * kernel[-i]);
			kern[i] =
			    vis_to_double_dup((kern_i << 16) | (kern_i &
			    0xffff));
		}
	}

	fscale = vis_to_float(1 << (vis_scale - 1));
	vis_write_gsr(((16 - vis_scale) << 3) + 2);

	offset = mlib_ImageGetLutOffset(colormap);
	lut_table = (mlib_u8 *)mlib_ImageGetLutInversTable(colormap);

	/* row buffer size in mlib_u16 units, rounded up to a multiple of 8 */
	bsize = (sw + m) * NCHAN;
	bsize = (bsize + 7) & ~7;
	dn1 = (dn) ? dn : 1;
	pbuff =
	    __mlib_malloc((dn1 + 1) * bsize * sizeof (mlib_u16) + EXTRA_BUFF);

	if (pbuff == NULL) {
		if (kern != kern_lcl)
			__mlib_free(kern);
		return (MLIB_FAILURE);
	}

	/*
	 * The row-pointer table is stored twice back to back, so that
	 * buff_arr + buff_ind always sees dn1 consecutive rows.
	 */
	for (j = 0; j < dn1; j++) {
		buff_arr[dn1 + j] = buff_arr[j] = pbuff + j * bsize;
	}

	buff_ind = 0;
	buffd = (mlib_d64 *)(pbuff + dn1 * bsize);
	buff_dst = (mlib_u8 *)((mlib_u16 *)buffd + bsize);

	/* clear buffer */
	for (i = 0; i < dn * (bsize / 4); i++) {
		((mlib_d64 *)pbuff)[i] = 0;
	}

	/* pick the row routine from dm and the colormap search method */
	func_ind = dm;

	if (func_ind > KH_MAX)
		func_ind = KH_MAX;
	method = mlib_ImageGetMethod(colormap);

	if (method == LUT_COLOR_CUBE_SEARCH)
		func_ind += KH_MAX + 1;
	else if (method == LUT_COLOR_DIMENSIONS)
		func_ind += 2 * (KH_MAX + 1);
	func_dm = func_dm_arr[func_ind];

	for (row = 0; row < sh; row++) {
		mlib_u8 *sp = sl;

		buff = buff_arr + buff_ind;

		/* convert source line */
		for (i = 0; i < sw; i++) {
			mlib_d64 ss;

			ss = LD_U8(sp, i);
			ss = vis_fmul8x16al(vis_read_lo(ss), fscale);
			ST_U16(buffd, i, ss);
		}

		/* full kernel rows, split into pieces of at most KW_MAX taps */
		pkern = kern;
		for (j = 0; j < dn; j++) {
			for (off = 0; off < m; off += kw) {
				kw = m - off;

				if (kw > KW_MAX) {
					if (kw > 2 * KW_MAX)
						kw = KW_MAX;
					else
						kw = kw / 2;
				}

				func_m_arr[kw] (buffd, buff[j] + off * NCHAN,
				    pkern + off, sw);
			}

			pkern += m;
		}

		/* final (partial) kernel row and output of this line */
#ifdef USE_COLOR2INDEXLINE
		func_dm(buff_dst, (void *)buffd, buff[dn] + dm * NCHAN, pkern,
		    colormap, lut_table, sw, dm, 0);
/*
 * mlib_ImageColorTrue2IndexLine_U8_BIT_1
 * (buff_dst, dl, bit_offset, sw, colormap);
 */
#else /* USE_COLOR2INDEXLINE */
		func_dm(dl, (void *)buffd, buff[dn] + dm * NCHAN, pkern,
		    colormap, lut_table, sw, dm, bit_offset);
#endif /* USE_COLOR2INDEXLINE */

		/* rotate the ring of row buffers */
		buff_ind++;

		if (buff_ind >= dn1)
			buff_ind -= dn1;

		sl += sll;
		dl += dll;
	}

	__mlib_free(pbuff);

	if (kern != kern_lcl)
		__mlib_free(kern);

	return (MLIB_SUCCESS);
}
/*
 * Widen the n signed 16-bit elements of x into signed 32-bit elements of z.
 *
 * Vectors shorter than 16 elements are handed to the EXPAND macro
 * (defined elsewhere; presumably a scalar loop that returns -- confirm).
 * Longer vectors are processed four elements (one 64-bit source word,
 * two 64-bit destination words) at a time, with separate paths for an
 * 8-byte aligned and a misaligned source.
 *
 * Returns MLIB_SUCCESS once the vector path completes.
 */
mlib_status
__mlib_VectorConvert_S32_S16_Mod(
    mlib_s32 *z,
    const mlib_s16 *x,
    mlib_s32 n)
{
    /* these four keep their names: EXPAND may refer to them */
    mlib_s32 i;
    const mlib_s16 *src = x;
    mlib_s32 *dst = z;
    mlib_s32 length = n;

    mlib_d64 *src64, *dst64;
    mlib_s32 quads, vec_count, tail;
    mlib_d64 prev, next, q0, q1;
    /* 16-bit lanes of 1: multiplying by it widens s16 -> s32 */
    mlib_f32 ones16 = vis_to_float(0x10001);

    if (length < 16) {
        EXPAND(mlib_s16, mlib_s32);
    }

    /* scalar steps until the destination is 8-byte aligned */
    while (((mlib_addr)dst & 7) != 0) {
        *dst = *src;
        dst++;
        src++;
        length--;
    }

    src64 = (mlib_d64 *)vis_alignaddr((void *)src, 0);
    dst64 = (mlib_d64 *)dst;

    tail = length & 3;
    quads = length >> 2;
    vec_count = quads << 2;

    next = src64[0];

    if (((mlib_addr)(src) & 7) == 0) {
        /*
         * Aligned source: plain 64-bit loads, no vis_faligndata.
         * One group is peeled when the group count is odd so the
         * main loop can always consume two.
         */
        i = quads & 1;
        if (i != 0) {
            q0 = src64[0];
            src64++;

            dst64[0] = vis_fmuld8ulx16(ones16, vis_read_hi(q0));
            dst64[1] = vis_fmuld8ulx16(ones16, vis_read_lo(q0));
            dst64 += 2;
        }

#pragma pipeloop(1)
#pragma unroll(1)
        for (; i < quads; i += 2) {
            q0 = src64[0];
            q1 = src64[1];
            src64 += 2;

            dst64[0] = vis_fmuld8ulx16(ones16, vis_read_hi(q0));
            dst64[1] = vis_fmuld8ulx16(ones16, vis_read_lo(q0));
            dst64[2] = vis_fmuld8ulx16(ones16, vis_read_hi(q1));
            dst64[3] = vis_fmuld8ulx16(ones16, vis_read_lo(q1));
            dst64 += 4;
        }
    } else {
        /*
         * Misaligned source: each group is assembled from two
         * neighbouring 64-bit words with vis_faligndata.  The look-ahead
         * loads use the non-faulting form because they may touch the
         * word just past the data.
         */
        i = 1;
        if ((quads & 1) != 0) {
            i++;
            prev = next;
            next = vis_ld_d64_nf(src64 + 1);
            q0 = vis_faligndata(prev, next);

            dst64[0] = vis_fmuld8ulx16(ones16, vis_read_hi(q0));
            dst64[1] = vis_fmuld8ulx16(ones16, vis_read_lo(q0));
            dst64 += 2;
        }

#pragma pipeloop(1)
#pragma unroll(1)
        for (; i <= quads; i += 2) {
            prev = next;
            next = vis_ld_d64_nf(src64 + i);
            q0 = vis_faligndata(prev, next);

            prev = next;
            next = vis_ld_d64_nf(src64 + i + 1);
            q1 = vis_faligndata(prev, next);

            dst64[0] = vis_fmuld8ulx16(ones16, vis_read_hi(q0));
            dst64[1] = vis_fmuld8ulx16(ones16, vis_read_lo(q0));
            dst64[2] = vis_fmuld8ulx16(ones16, vis_read_hi(q1));
            dst64[3] = vis_fmuld8ulx16(ones16, vis_read_lo(q1));
            dst64 += 4;
        }
    }

    /* up to three trailing elements that did not fill a group */
    dst += vec_count;
    src += vec_count;
    for (i = 0; i < tail; i++) {
        dst[i] = src[i];
    }

    return (MLIB_SUCCESS);
}
mlib_status __mlib_VideoColorARGB2JFIFYCC422( mlib_u8 *y, mlib_u8 *cb, mlib_u8 *cr, const mlib_u8 *argb, mlib_s32 n) { mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y; mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr; mlib_u8 *yend = y + n, *cbend = cb + (n >> 1); mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37; mlib_d64 dh0, dh1, dl0, dl1, z0, z1; mlib_s32 i; mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192)); mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192)); mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192)); mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096)); mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096)); mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096)); mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096)); mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096)); mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096)); mlib_d64 off128 = vis_to_double_dup(0x10101010); mlib_d64 off0 = vis_to_double_dup(0x00100010); if (n <= 0) return (MLIB_FAILURE); vis_write_gsr(2 << 3); n = n >> 3; #pragma pipeloop(0) for (i = 0; i < n; i++) { sd01 = (*sp++); sd23 = (*sp++); sd45 = (*sp++); sd67 = (*sp++); CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k11, k12, k13, off0, z0, z1); z1 = vis_fpadd16(z1, off0); py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1)); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k21, k22, k23, off128, z0, z1); pcb[0] = vis_fpack16(vis_fpadd16(z0, z1)); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k31, k32, k33, off128, z0, z1); pcr[0] = vis_fpack16(vis_fpadd16(z0, z1)); py++; pcb++; pcr++; } if ((mlib_u8 *)pcb < cbend) { mlib_d64 yd; mlib_f32 cbf, crf; mlib_s32 ymask, cmask; sd01 = (*sp++); sd23 = vis_ld_d64_nf(sp); 
sp++; sd45 = vis_ld_d64_nf(sp); sp++; sd67 = vis_ld_d64_nf(sp); CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k11, k12, k13, off0, z0, z1); z1 = vis_fpadd16(z1, off0); yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1)); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k21, k22, k23, off128, z0, z1); cbf = vis_fpack16(vis_fpadd16(z0, z1)); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k31, k32, k33, off128, z0, z1); crf = vis_fpack16(vis_fpadd16(z0, z1)); ymask = vis_edge8(py, yend - 1); vis_pst_8(yd, py, ymask); cmask = vis_edge8(pcb, cbend - 1); if (cmask & 0xf0) { vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask); vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask); } else { vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1, cmask); vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1, cmask); } } return (MLIB_SUCCESS); }
static mlib_status mlib_v_VideoColorYUV2RGB411_nonalign( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 rgb_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* pointers to src address */ mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3; /* pointers to dst address */ mlib_u8 *dp, *dl; /* all. pointer to y */ mlib_d64 *spy; /* all. pointers to u, v */ mlib_d64 *dfu, *dfv; /* u, v data */ mlib_f32 fu, fv; /* y data */ mlib_d64 dy0, dy1, dy2, dy3; mlib_d64 ddy1, ddy2, ddy3, ddy4; mlib_d64 du0, du1, fu0, fu1; mlib_d64 dv1, dv2, fv0, fv1; mlib_d64 dr, dr1, dr2, dr3, dr4; mlib_d64 dg, dg1, dg2, dg3, dg4; mlib_d64 db, db1, db2, db3, db4; mlib_d64 dtmp; /* 1.1644 * 4096 */ mlib_f32 f0 = vis_to_float(0x12a1); /* 2.0184 * 8192 */ mlib_f32 f1 = vis_to_float(0x4097); /* -0.3920 * 8192 */ mlib_f32 f4 = vis_to_float(0xf375); /* -0.8132 * 8192 */ mlib_f32 f5 = vis_to_float(0xe5fa); /* 1.5966 * 8192 */ mlib_f32 f8 = vis_to_float(0x3317); /* -276.9856 * 32 */ mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60); /* 135.6352 * 32 */ mlib_d64 doff1 = vis_to_double_dup(0x10f410f4); /* -222.9952 * 32 */ mlib_d64 doff2 = vis_to_double_dup(0xe420e420); mlib_f32 fscale = vis_to_float(0x80808080); /* loop variable */ mlib_s32 i, j; mlib_d64 *buf, BUFF[16 * 1024]; mlib_d64 *ddp, dd01, dd11, dd21, dd02, dd12, dd22; mlib_u8 *tmp; if (width * 3 > 16 * 1024) { tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7); } else { buf = (mlib_d64 *)BUFF; } /* * initialize GSR scale factor */ vis_write_gsr(3 << 3); sp1 = sl1 = (mlib_u8 *)y; sp2 = sl2 = (mlib_u8 *)u; sp3 = sl3 = (mlib_u8 *)v; dp = (mlib_u8 *)buf; dl = rgb; ddp = (mlib_d64 *)dp; /* * row loop */ for (j = 0; j < height; j++) { spy = (mlib_d64 *)vis_alignaddr(sp1, 0); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = (*dfu++); fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 *)vis_alignaddr(sp3, 
0); fv0 = (*dfv++); fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; dy0 = (*spy++); dy3 = vis_ld_d64_nf(spy); spy++; vis_alignaddr(sp1, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = vis_ld_d64_nf(spy); spy++; dy2 = vis_faligndata(dy3, dy0); du0 = vis_fmul8x16al(fu, f1); db = vis_fpadd16(du0, doff0); du1 = vis_fmul8x16al(fu, f4); dv1 = vis_fmul8x16al(fv, f5); dtmp = vis_fpadd16(du1, dv1); dg = vis_fpadd16(dtmp, doff1); dv2 = vis_fmul8x16al(fv, f8); dr = vis_fpadd16(dv2, doff2); ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0); ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0); ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0); ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0); db1 = vis_fmul8x16au(fscale, vis_read_hi(db)); db1 = vis_fpadd16(ddy1, db1); db2 = vis_fmul8x16al(fscale, vis_read_hi(db)); db2 = vis_fpadd16(ddy2, db2); db3 = vis_fmul8x16au(fscale, vis_read_lo(db)); db3 = vis_fpadd16(ddy3, db3); db4 = vis_fmul8x16al(fscale, vis_read_lo(db)); db4 = vis_fpadd16(ddy4, db4); dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg)); dg1 = vis_fpadd16(ddy1, dg1); dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg)); dg2 = vis_fpadd16(ddy2, dg2); dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg)); dg3 = vis_fpadd16(ddy3, dg3); dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg)); dg4 = vis_fpadd16(ddy4, dg4); dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr)); dr1 = vis_fpadd16(ddy1, dr1); dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr)); dr2 = vis_fpadd16(ddy2, dr2); dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr)); dr3 = vis_fpadd16(ddy3, dr3); dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr)); dr4 = vis_fpadd16(ddy4, dr4); db = vis_fpack16_pair(db1, db2); db1 = vis_fpack16_pair(db3, db4); dr = vis_fpack16_pair(dr1, dr2); dr1 = vis_fpack16_pair(dr3, dr4); dg = vis_fpack16_pair(dg1, dg2); dg1 = vis_fpack16_pair(dg3, dg4); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = vis_ld_d64_nf(dfu); dfu++; fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 
*)vis_alignaddr(sp3, 0); fv0 = vis_ld_d64_nf(dfv); dfv++; fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; /* * 16-pixel column loop */ #pragma pipeloop(0) for (i = 0; i <= width - 16; i += 16) { vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(dr, dg); dd02 = vis_bshuffle(dr1, dg1); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(dr, dg); dd12 = vis_bshuffle(dr1, dg1); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(dr, dg); dd22 = vis_bshuffle(dr1, dg1); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd01, db); ddp[3] = vis_bshuffle(dd02, db1); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd11, db); ddp[4] = vis_bshuffle(dd12, db1); vis_write_bmask(0x0D23E56F, 0); ddp[2] = vis_bshuffle(dd21, db); ddp[5] = vis_bshuffle(dd22, db1); dy3 = vis_ld_d64_nf(spy); spy++; vis_alignaddr(sp1, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = vis_ld_d64_nf(spy); spy++; dy2 = vis_faligndata(dy3, dy0); du0 = vis_fmul8x16al(fu, f1); db = vis_fpadd16(du0, doff0); du1 = vis_fmul8x16al(fu, f4); dv1 = vis_fmul8x16al(fv, f5); dtmp = vis_fpadd16(du1, dv1); dg = vis_fpadd16(dtmp, doff1); dv2 = vis_fmul8x16al(fv, f8); dr = vis_fpadd16(dv2, doff2); ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0); ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0); ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0); ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0); db1 = vis_fmul8x16au(fscale, vis_read_hi(db)); db1 = vis_fpadd16(ddy1, db1); db2 = vis_fmul8x16al(fscale, vis_read_hi(db)); db2 = vis_fpadd16(ddy2, db2); db3 = vis_fmul8x16au(fscale, vis_read_lo(db)); db3 = vis_fpadd16(ddy3, db3); db4 = vis_fmul8x16al(fscale, vis_read_lo(db)); db4 = vis_fpadd16(ddy4, db4); dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg)); dg1 = vis_fpadd16(ddy1, dg1); dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg)); dg2 = vis_fpadd16(ddy2, dg2); dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg)); dg3 = vis_fpadd16(ddy3, dg3); dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg)); dg4 = vis_fpadd16(ddy4, dg4); 
dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr)); dr1 = vis_fpadd16(ddy1, dr1); dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr)); dr2 = vis_fpadd16(ddy2, dr2); dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr)); dr3 = vis_fpadd16(ddy3, dr3); dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr)); dr4 = vis_fpadd16(ddy4, dr4); db = vis_fpack16_pair(db1, db2); db1 = vis_fpack16_pair(db3, db4); dr = vis_fpack16_pair(dr1, dr2); dr1 = vis_fpack16_pair(dr3, dr4); dg = vis_fpack16_pair(dg1, dg2); dg1 = vis_fpack16_pair(dg3, dg4); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = vis_ld_d64_nf(dfu); dfu++; fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 *)vis_alignaddr(sp3, 0); fv0 = vis_ld_d64_nf(dfv); dfv++; fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; ddp += 6; } if (i <= width - 8) { vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(dr, dg); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(dr, dg); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(dr, dg); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd01, db); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd11, db); vis_write_bmask(0x0D23E56F, 0); ddp[2] = vis_bshuffle(dd21, db); db = db1; dr = dr1; dg = dg1; ddp += 3; i += 8; } dp = (mlib_u8 *)ddp; vis_alignaddr((void *)(width - i), 0); db = vis_faligndata(db, db); dg = vis_faligndata(dg, dg); dr = vis_faligndata(dr, dr); dp += ((width - i - 1) * 3); vis_alignaddr((void *)7, 0); for (; i < width; i++) { STORE_PIXEL(0, 1, 2); dp -= 3; } sp1 = sl1 = sl1 + y_stride; sp2 = sl2 = sl2 + uv_stride; sp3 = sl3 = sl3 + uv_stride; __mlib_VectorCopy_U8(dl, (mlib_u8 *)buf, width * 3); dl = dp = dl + rgb_stride; dp = (mlib_u8 *)buf; ddp = (mlib_d64 *)dp; } if (width * 3 > 16 * 1024) __mlib_free(tmp); return (MLIB_SUCCESS); }
/*
 * Euclidean distance between two signed 8-bit vectors with no alignment
 * guarantee: z[0] = sqrt(sum((x[k] - y[k])^2)) for k in [0, n).
 *
 *   z   receives the result in z[0]
 *   x   first vector (may be misaligned; read through vis_faligndata)
 *   y   second vector (scalar steps bring it to 8-byte alignment first)
 *   n   element count; values <= 0 give z[0] = 0
 *
 * Work is split into a scalar prologue of up to 7 elements, a vector loop
 * of 8 elements per pass, and a scalar tail of up to 7 elements.
 *
 * NOTE(review): the vector partial sums are read back as one mlib_s32, so
 * callers presumably keep n small enough to avoid overflow -- confirm.
 */
static void
mlib_v_VectorDistance_S8_Sat_NA(
    mlib_d64 *z,
    const mlib_s8 *x,
    const mlib_s8 *y,
    mlib_s32 n)
{
    mlib_d64 *pdx, dx, dx0, dx1;
    mlib_d64 *pdy, dy;
    mlib_s8 *px, *py;
    mlib_d64 accd, accum;
    mlib_s32 *pacc;
    mlib_d64 item;
    mlib_d64 ones = vis_to_double_dup(0x01000100);

    /* temporaries used in macro */
    mlib_d64 dsrc1h, dsrc1l;

    /* temporaries used in macro */
    mlib_d64 dsrc2h, dsrc2l;

    /* temporaries used in macro */
    mlib_d64 ddiffh, ddiffl;

    /* temporaries used in macro */
    mlib_f32 fdifhh, fdifhl;

    /* temporaries used in macro */
    mlib_f32 fdiflh, fdifll;

    /* temporaries used in macro */
    mlib_d64 dsqrhh, dsqrhl;

    /* temporaries used in macro */
    mlib_d64 dsqrlh, dsqrll;

    /* temporaries used in macro */
    mlib_d64 dsqrh, dsqrl;

    /* temporaries used in macro */
    mlib_d64 dsqr;
    mlib_s32 off;
    mlib_s32 i, nd8, nm8;

    /* a negative count is treated as an empty vector */
    if (n < 0)
        n = 0;

    accum = 0.0;
    px = (mlib_s8 *)x;
    py = (mlib_s8 *)y;

    /* off = -(py & 7): distance back to the previous 8-byte boundary */
    pdy = (mlib_d64 *)((mlib_addr)py & (~7));
    off = (mlib_addr)pdy - (mlib_addr)py;

    if (off != 0) {
        /* elements needed to reach the next 8-byte boundary of y */
        off += 8;

        /*
         * Never step past the end of a short vector.  Without this clamp
         * the prologue read beyond x and y, and the negative remainder
         * below produced a spurious non-empty tail.
         */
        if (off > n)
            off = n;

        for (i = 0; i < off; i++) {
            item = (mlib_d64)(*px - *py);
            accum += item * item;
            px++;
            py++;
        }
    }

    nd8 = (n - off) >> 3;
    nm8 = (n - off) & 0x7;

    pdx = (mlib_d64 *)vis_alignaddr(px, 0);
    pdy = (mlib_d64 *)py;
    accd = 0.0;

    /* #pragma pipeloop(0) */
    for (i = 0; i < nd8; i++) {
        dx0 = pdx[0];
        /* non-faulting: this word may lie just past the end of x */
        dx1 = vis_ld_d64_nf(pdx + 1);
        pdx++;
        dx = vis_faligndata(dx0, dx1);
        dy = *pdy;
        pdy++;
        MLIB_V_VECTORDISTANCE_S8_NA(dx, dy, accd);
    }

    /* fold the two 32-bit partial sums and add them to the scalar total */
    fdifhh = vis_fpadd32s(vis_read_hi(accd), vis_read_lo(accd));
    pacc = (mlib_s32 *)&fdifhh;
    accum += (mlib_d64)pacc[0];

    if (nm8 != 0) {
        px += nd8 * 8;
        py = (mlib_s8 *)pdy;

        for (i = 0; i < nm8; i++) {
            item = (mlib_d64)(*px - *py);
            accum += item * item;
            px++;
            py++;
        }
    }

    z[0] = mlib_sqrt(accum);
}