/*
 * __mlib_VideoInterpAveX_U8_U8_8x16 - horizontal half-pel interpolation
 * of an 8x16 reference block averaged with the current block (VIS).
 *
 *   curr_block   - destination block, updated in place (8-byte aligned)
 *   ref_block    - reference block (arbitrary alignment)
 *   frame_stride - stride of the frame buffer, bytes
 *   field_stride - stride of the field buffer, bytes
 *
 * Returns MLIB_SUCCESS.
 *
 * NOTE(review): the two loops are generated by the project macros
 * MLIB_V_VIDEOCOPY8 / MLIB_V_VIDEOINTERPAVG8 (defined elsewhere); they
 * consume the locals declared below (y, dd, ss0, sp1, sp2, ...) and the
 * strides - confirm their contract against the macro definitions.
 */
mlib_status
__mlib_VideoInterpAveX_U8_U8_8x16(
	mlib_u8 *curr_block,
	const mlib_u8 *ref_block,
	mlib_s32 frame_stride,
	mlib_s32 field_stride)
{
	mlib_s32 y;
	mlib_d64 *dd, ss0[16], *sp1, *sp2, s1hi, s1lo, s2hi, s2lo, s2;
	/* vis_fone() = 0x0001 per 16-bit lane; tripled below to give 3 */
	mlib_d64 mthree = vis_fone();
	mlib_f32 fzero = vis_fzeros();
	mlib_f32 fexpd2 = vis_to_float(0x1000200);

	mthree = vis_fpadd16(mthree, vis_fpadd16(mthree, mthree));
	dd = (mlib_d64 *)curr_block;
	/* alignaddr also latches the source alignment into the GSR */
	sp1 = (mlib_d64 *)vis_alignaddr((void *)ref_block, 0);
#pragma pipeloop(0)
	MLIB_V_VIDEOCOPY8(16);
	/* GSR: pack scale factor 5, align offset of (ref_block + 1) */
	vis_write_gsr((5 << 3) + ((mlib_s32)(ref_block + 1) & 7));
	sp2 = (mlib_d64 *)((mlib_addr)(ref_block + 1) & ~7);
#pragma pipeloop(0)
	MLIB_V_VIDEOINTERPAVG8(16);
	return (MLIB_SUCCESS);
}
/*
 * vis_init_consts - preload the color-conversion constants into the
 * registers/macro slots used by the surrounding conversion kernels.
 *
 * NOTE(review): vis_ld64/vis_ld32/vis_fzeros are used here in their
 * two-argument/statement macro forms (destination named by the second
 * argument); the const_* tables and CONST_* destinations are declared
 * elsewhere in this file - confirm against those definitions.
 */
static inline void
vis_init_consts(void)
{
	/* GSR pack scale factor 7 for subsequent vis_fpack16 use */
	vis_set_gsr(7 << VIS_GSR_SCALEFACT_SHIFT);
	vis_ld64(const_2048[0], CONST_2048);
	vis_ld64(const_1024[0], CONST_1024);
	vis_ld64(const_Ugreen[0], CONST_UGREEN);
	vis_ld64(const_Vgreen[0], CONST_VGREEN);
	vis_fzeros(ZEROS);
	vis_ld64(const_Ublue_Vred[0], CONST_UBLUE);
	vis_ld32(const_Ycoeff[0], CONST_YCOEFF);
	vis_ld64(const_128[0], CONST_128);
}
/*
 * __mlib_VideoAddBlock_U8_S16 - add an 8x8 block of mlib_s16 values
 * (e.g. an IDCT/motion-compensation residual) to an 8x8 block of
 * mlib_u8 pixels in place, with saturation via vis_fpack16.
 *
 *   curr_block - 8x8 u8 block, updated in place (8-byte aligned rows)
 *   mc_block   - 8x8 contiguous s16 block to add
 *   stride     - row stride of curr_block, bytes
 *
 * Returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoAddBlock_U8_S16(
	mlib_u8 *curr_block,
	const mlib_s16 *mc_block,
	mlib_s32 stride)
{
	mlib_d64 *dst64, *src64;
	mlib_d64 cur, add_hi, add_lo, cur_hi, cur_lo;
	mlib_f32 fzero = vis_fzeros();
	/* 0x0100 multiplier: expands u8 lanes to s16 scaled by 256 */
	mlib_f32 scale256 = vis_to_float(0x100);
	mlib_s32 row;

	/* pack scale 7 undoes the *256 expansion in vis_fpack16_pair */
	vis_write_gsr(7 << 3);
	dst64 = (mlib_d64 *)curr_block;
	src64 = (mlib_d64 *)mc_block;
#pragma pipeloop(0)
	for (row = 0; row < 8; row++) {
		cur = *dst64;
		add_hi = (*src64++);
		add_lo = (*src64++);
		/* two equivalent u8->s16*256 expansions of the pixels */
		cur_hi = vis_fpmerge(fzero, vis_read_hi(cur));
		cur_lo = vis_fmul8x16al(vis_read_lo(cur), scale256);
		cur_hi = vis_fpadd16(add_hi, cur_hi);
		cur_lo = vis_fpadd16(add_lo, cur_lo);
		*dst64 = vis_fpack16_pair(cur_hi, cur_lo);
		dst64 = (mlib_d64 *)((mlib_u8 *)dst64 + stride);
	}
	return (MLIB_SUCCESS);
}
DEF_FUNC(mlib_ImageBlendColor_U8, mlib_u8, mlib_s32) { mlib_f32 fzeros = vis_fzeros(); mlib_f32 fmax = vis_to_float(0xFFFFFFFF); mlib_d64 dmask = vis_to_double_dup(0x00FF00FF); mlib_d64 done = vis_to_double_dup(0x01000100); mlib_d64 *buffs, *buffd; mlib_d64 *sp, *dp; mlib_f32 *alp_tbl; mlib_d64 ss, s1, rr, tt, d0, d1; mlib_d64 cc, c0, c1, c2; mlib_d64 amask0, amask1, amask2; mlib_s32 ww, dflag, i, j; vis_write_gsr(7 << 3); width *= channel; ww = (width + 7) / 8; if (channel == 3) { ww = 3 * ((ww + 2) / 3); } buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww); if (buffs == NULL) { return (MLIB_FAILURE); } buffd = buffs + ww; if (channel == 4) { cc = DOUBLE_4U16(color[0], color[1], color[2], color[3]); cc = vis_fand(vis_for(cc, ((mlib_d64 *)mlib_dmask_arr)[8 >> alpha]), dmask); alp_tbl = (mlib_f32 *)mlib_alp_tbl + alpha * 256; } else if (channel == 3) {
/*
 * __mlib_VectorConvert_S16_S8_Mod - widen an s8 vector to s16 (Mod,
 * i.e. exact value copy; no saturation is needed when widening).
 *
 *   z - destination s16 vector
 *   x - source s8 vector
 *   n - element count
 *
 * Strategy: short vectors go through the scalar EXPAND macro; otherwise
 * the head is copied scalar until dst is 8-byte aligned, the middle is
 * processed 8 elements per iteration with VIS, and the tail is copied
 * scalar.  Sign extension is done as (b << 8) >> 8 in each 16-bit lane:
 * vis_fpmerge places the byte in the high half, vis_fmul8sux16 with
 * 0x0100 lanes performs the arithmetic shift right by 8.
 */
mlib_status
__mlib_VectorConvert_S16_S8_Mod(
	mlib_s16 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_s8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 fzero = vis_fzeros();
	mlib_s32 len_64, even_length, rest_64, length = n, off;
	mlib_d64 dd0, dd1, dd2, dd4, dd5, dd6, dd7;

	if (length < 16) {
		/* scalar fallback for short vectors (macro returns) */
		EXPAND(mlib_s8, mlib_s16);
	}

	/* align destination to 8 bytes */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	dd2 = ddsrc[0];
	off = (mlib_addr)src & 7;

	if (!off) {

/*
 * Both vectors are 64-bit aligned.
 */

/*
 * Peeling of 1 iteration.
 */
		if (i = (len_64 & 1)) {
			dd1 = (*ddsrc++);
			(*ddst++) =
			    vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
			    fzero), four_16_ones);
			(*ddst++) =
			    vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
			    fzero), four_16_ones);
		}
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			dd1 = (*ddsrc++);
			dd2 = (*ddsrc++);
			(*ddst++) =
			    vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd1),
			    fzero), four_16_ones);
			(*ddst++) =
			    vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd1),
			    fzero), four_16_ones);
			(*ddst++) =
			    vis_fmul8sux16(vis_fpmerge(vis_read_hi(dd2),
			    fzero), four_16_ones);
			(*ddst++) =
			    vis_fmul8sux16(vis_fpmerge(vis_read_lo(dd2),
			    fzero), four_16_ones);
		}
	} else {

/*
 * Source vector is not 64-bit aligned.
 * Peeling of 1 iteration.  Then loop with step == 2.
 * bshuffle (bmask built from off) realigns and expands the high
 * half; faligndata with offset 1 derives the low half from it.
 */
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1);
			i++;
			dd4 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
		}
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i <= len_64; i += 2) {
			dd0 = dd2;
			dd1 = vis_ld_d64_nf(ddsrc + i);
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_bshuffle(dd0, dd1);
			dd6 = vis_bshuffle(dd1, dd2);
			dd5 = vis_faligndata(dd4, dd4);
			dd7 = vis_faligndata(dd6, dd6);
			(*ddst++) = vis_fmul8sux16(dd4, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd5, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd6, four_16_ones);
			(*ddst++) = vis_fmul8sux16(dd7, four_16_ones);
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = src[even_length + i];
	return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorConvert_S16_U8_Mod - widen a u8 vector to s16 (exact
 * zero-extended copy).
 *
 *   z - destination s16 vector
 *   x - source u8 vector
 *   n - element count
 *
 * Same structure as the S8 variant: scalar EXPAND for short vectors,
 * scalar head to 8-byte-align dst, VIS middle (8 elements/iteration),
 * scalar tail.  Zero extension per lane is done either with
 * vis_fpmerge(0, b) or vis_fmul8x16al(b, 0x100) - both yield
 * 0x00bb in each 16-bit lane; the mix helps instruction scheduling.
 */
mlib_status
__mlib_VectorConvert_S16_U8_Mod(
	mlib_s16 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_s32 i;
	const mlib_u8 *src = x;
	mlib_s16 *dst = z;
	mlib_d64 *ddsrc, *ddst;
	mlib_s32 len_64, even_length, rest_64, length = n;
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 dd1, dd2, dd3, dd4;
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		/* scalar fallback for short vectors (macro returns) */
		EXPAND(mlib_u8, mlib_s16);
	}

	/* align destination to 8 bytes */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (*src++);
		length--;
	}

	ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
	ddst = (mlib_d64 *)dst;
	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	dd2 = ddsrc[0];

	if (!((mlib_addr)src & 7)) {

/*
 * Both vectors are 64-bit aligned.  We can process without
 * vis_faligndata.
 * Peeling the 1 iteration.  Then loop with step == 2.
 */
		if (i = (len_64 & 1)) {
			dd1 = (*ddsrc++);
			(*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd1));
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
		}
/*
 * NOTE(review): pipeloop(1)/unroll(1) differ from the pipeloop(0)/
 * unroll(4) used by the sibling converters - confirm this was a
 * deliberate scheduling choice rather than a typo.
 */
#pragma pipeloop(1)
#pragma unroll(1)
		for (; i < len_64; i += 2) {
			dd1 = (*ddsrc++);
			dd2 = (*ddsrc++);
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd1), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd2), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd2));
		}
	} else {

/*
 * Source vector is not 64-bit aligned.  Use vis_faligndata.
 * Peeling the 1 iteration.  Then loop with step == 2.
 */
		i = 1;

		if (len_64 & 1) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + 1);
			i++;
			dd3 = vis_faligndata(dd1, dd2);
			(*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd3));
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
		}
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i <= len_64; i += 2) {
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i);
			dd3 = vis_faligndata(dd1, dd2);
			dd1 = dd2;
			dd2 = vis_ld_d64_nf(ddsrc + i + 1);
			dd4 = vis_faligndata(dd1, dd2);
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd3), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
			(*ddst++) = vis_fmul8x16al(vis_read_hi(dd4), fm);
			(*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd4));
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = src[even_length + i];
	return (MLIB_SUCCESS);
}
void mlib_v_VideoColorYUV2RGB444_all_align( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4]; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f375); mlib_d64 k02 = vis_to_double_dup(0x3317e5fa); mlib_d64 k11 = vis_to_double_dup(0xf3754097); mlib_d64 k12 = vis_to_double_dup(0xe5fa0000); mlib_d64 k21 = vis_to_double_dup(0x40970000); mlib_d64 k22 = vis_to_double_dup(0x00003317); mlib_d64 c_0 = vis_to_double_dup(0xe42010f4); mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60); mlib_d64 c_2 = vis_to_double_dup(0xdd60e420); mlib_d64 k_0 = vis_to_double_dup(0x25432543); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = n >> 2; buff2 = pbuff_arr2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)u; sf2 = (mlib_f32 *)v; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; mlib_d64 d_0235, d_xx14, d_23xx, d_0145; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); /* * merge buff values to 3-channel array */ d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); pfd[0] = vis_read_hi(d_0145); pfd[1] = vis_read_hi(d_23xx); pfd[2] = vis_read_lo(d_0145); buff2 += 
2; pfd += 3; } if ((mlib_u8 *)pfd <= dend) { mlib_d64 d_0235, d_xx14, d_23xx, d_0145; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_23xx); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; u += n; v += n; rgb += 3 * n; size -= n; } while (size); }
/*
 * __mlib_VideoColorJFIFYCC2RGB444 - convert JFIF-range 4:4:4 YCbCr to
 * packed 3-byte RGB pixels (VIS version).
 *
 *   rgb      - packed RGB output, 3 bytes per pixel
 *   y/cb/cr  - input planes, 4 pixels per iteration
 *   size     - pixel count; processed in chunks of at most 2*BUFF_SIZE
 *
 * Returns MLIB_FAILURE for size <= 0, MLIB_SUCCESS otherwise.
 *
 * k*/c_* hold fixed-point matrix rows and offsets; GSR is set to pack
 * scale 2 / align offset 2, and the bmask interleaves packed lanes into
 * 3-channel byte order via vis_bshuffle.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *cb,
	const mlib_u8 *cr,
	mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd;
	mlib_f32 fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;
	mlib_d64 tmp_arr64[2];
	mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
	mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
	mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
	mlib_d64 k12 = vis_to_double_dup(0xe9260000);
	mlib_d64 k21 = vis_to_double_dup(0x38b40000);
	mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
	mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
	mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
	mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
	mlib_d64 k_0 = vis_to_double_dup(0x20002000);

	if (size <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr((2 << 3) + 2);
	vis_write_bmask(0x0489AB37, 0);

	do {
/* loop on buffer size */
		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}
		/* m full groups of 4; the last group goes to the tail */
		m = (n - 1) >> 2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)cb;
		sf2 = (mlib_f32 *)cr;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21,
			    s22, s_0;
			mlib_d64 d_0235, d_0145;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			/* matrix rows accumulated in 16-bit lanes */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);
			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);
			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);
			d_0235 = vis_fpack16_pair(s00, s10);
			s20 = vis_freg_pair(vis_fpack16(s20), fzero);
			/* shuffle packed lanes into 3-channel order */
			d_0145 = vis_bshuffle(d_0235, s20);
			d_0235 = vis_fpack32(d_0235, d_0235);
			d_0235 = vis_fpmerge(vis_read_hi(d_0235),
			    vis_read_lo(d_0235));
			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_0235);
			pfd[2] = vis_read_lo(d_0145);
			pfd += 3;
		}

/*
 * last pixels
 */
		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21,
			    s22, s_0;
			mlib_d64 d_0235, d_xx14, d_0145;
			mlib_f32 x0, x1, x2;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			x0 = *sf0;
			x1 = *sf1;
			x2 = *sf2;

			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);
			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);
			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);
			d_0235 = vis_fpack16_pair(s00, s10);
			d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero);
			d_0145 = vis_bshuffle(d_0235, d_xx14);
			d_0235 = vis_fpack32(d_0235, d_0235);
			d_0235 = vis_fpmerge(vis_read_hi(d_0235),
			    vis_read_lo(d_0235));

			emask = vis_edge8(pfd, dend);

			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_0235);
			tmp_arr32[2] = vis_read_lo(d_0145);

			/* partial stores honoring the edge mask */
			vis_pst_8(tmp_arr64[0], pfd, emask);

			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

		y += n;
		cb += n;
		cr += n;
		rgb += 3 * n;
		size -= n;
	} while (size);

	return (MLIB_SUCCESS);
}
/*
 * __mlib_VideoColorARGB2JFIFYCC422 - convert packed ARGB pixels to
 * JFIF YCbCr with 4:2:2 chroma subsampling (VIS version).
 *
 *   y, cb, cr - output planes (cb/cr hold n/2 samples each)
 *   argb      - packed 4-byte ARGB input
 *   n         - pixel count
 *
 * Returns MLIB_FAILURE for n <= 0, MLIB_SUCCESS otherwise.
 *
 * K11..K33 are the JFIF matrix coefficients (scaled by 8192 for luma,
 * 4096 for chroma); CHANNELSEPARATE_U8_422 / CHANNELWEIGHT_U8_2p are
 * project macros that deinterleave channels and apply the weights.
 * Chroma is produced at half rate by summing the two per-phase partial
 * results (z0 + z1).
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
	mlib_u8 *y,
	mlib_u8 *cb,
	mlib_u8 *cr,
	const mlib_u8 *argb,
	mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
	/* chroma bias (+128) and luma rounding offsets */
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(2 << 3);
	n = n >> 3;

	/* 8 pixels (4 doubles of ARGB) per iteration */
#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1,
		    dl0, dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));

		py++;
		pcb++;
		pcr++;
	}

	/* tail: recompute last group with non-faulting loads and store
	 * through edge masks so no byte past the ends is written */
	if ((mlib_u8 *)pcb < cbend) {
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

		sd01 = (*sp++);
		sd23 = vis_ld_d64_nf(sp);
		sp++;
		sd45 = vis_ld_d64_nf(sp);
		sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1,
		    dl0, dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));

		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);
		cmask = vis_edge8(pcb, cbend - 1);

		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb,
			    cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr,
			    cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf),
			    pcb - 1, cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf),
			    pcr - 1, cmask);
		}
	}

	return (MLIB_SUCCESS);
}
mlib_status mlib_ImageMulAlpha_U8( mlib_u8 *sl, mlib_u8 *dl, mlib_s32 sstride, mlib_s32 dstride, mlib_s32 width, mlib_s32 height, mlib_s32 channel, mlib_s32 alpha) { mlib_f32 fzeros = vis_fzeros(); mlib_d64 dmask = vis_to_double_dup(0x00FF00FF); mlib_d64 done = vis_to_double_dup(0x01000100); mlib_d64 *buffs, *buffd; mlib_d64 *sp, *dp; mlib_d64 ss, s1, rr, d0, d1; mlib_d64 amask0, amask1, amask2; mlib_s32 ww, dflag, cmask, i, j; vis_write_gsr(7 << 3); width *= channel; ww = (width + 7) / 8; if (channel == 3) { ww = 3 * ((ww + 2) / 3); } buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww); if (buffs == NULL) { return (MLIB_FAILURE); } buffd = buffs + ww; if (channel == 4) { cmask = 1 << (3 - alpha); cmask |= (cmask << 4); } else if (channel == 3) { amask0 = ((mlib_d64 *)mlib_amask3_arr)[alpha]; amask1 = ((mlib_d64 *)mlib_amask3_arr)[alpha + 1]; amask2 = ((mlib_d64 *)mlib_amask3_arr)[alpha + 2]; } for (j = 0; j < height; j++) { if (((int)sl & 7)) { MEM_COPY(sl, buffs, width); sp = buffs; } else { sp = (mlib_d64 *)sl; } dflag = 0; if (((int)dl | width) & 7) { dp = buffd; dflag = 1; } else { dp = (mlib_d64 *)dl; } if (channel == 4) { mlib_d64 a0, a1; if (alpha == 0) { #pragma pipeloop(0) for (i = 0; i < ww; i++) { MUL_ALPHA_4CH(hi, au); } } else if (alpha == 1) { #pragma pipeloop(0) for (i = 0; i < ww; i++) { MUL_ALPHA_4CH(hi, al); } } else if (alpha == 2) { #pragma pipeloop(0) for (i = 0; i < ww; i++) { MUL_ALPHA_4CH(lo, au); } } else { /* if (alpha == 3) */ #pragma pipeloop(0) for (i = 0; i < ww; i++) { MUL_ALPHA_4CH(lo, al); } } } else if (channel == 3) { mlib_d64 s0, s1, s2; mlib_d64 a0, a1, a2; mlib_s32 cmask0, cmask1, cmask2; cmask0 = 0x492 >> alpha; cmask1 = 0x492 >> (alpha + 1); cmask2 = 0x492 >> (alpha + 2); if (alpha == 0) { vis_alignaddr((void *)0, 7); #pragma pipeloop(0) for (i = 0; i < ww - 3; i += 3) { LOAD_3CH_0(); MUL_ALPHA_3CH(); } if (i < ww) { LOAD_3CH_0_NF(); MUL_ALPHA_3CH(); } } else if (alpha == 1) { mlib_d64 b0, b1, b2; #pragma pipeloop(0) for (i = 
0; i < ww - 3; i += 3) { LOAD_3CH_1(); MUL_ALPHA_3CH(); } if (i < ww) { LOAD_3CH_1_NF(); MUL_ALPHA_3CH(); } } else { /* if (alpha == 2) */ vis_alignaddr((void *)0, 1); #pragma pipeloop(0) for (i = 0; i < ww - 3; i += 3) { LOAD_3CH_2(); MUL_ALPHA_3CH(); } if (i < ww) { LOAD_3CH_2_NF(); MUL_ALPHA_3CH(); } } } else { /* if (channel == 2) */ if (alpha == 0) {
/*
 * __mlib_MatrixMul_S16_S8_Mod - multiply an m x l s8 matrix x by an
 * l x n s8 matrix y, producing an m x n s16 result z (Mod semantics).
 *
 * Strategy: y is transposed into a scratch buffer with each byte
 * XOR'ed with 0x80 (biasing s8 to u8 so unsigned VIS multiplies can be
 * used); the bias is compensated per output row by subtracting
 * x_sum = 128 * sum(x row).  x rows are expanded to 16-bit lanes and
 * the dot products run 8 elements per step via the LOAD/MUL/SUM macros
 * (defined elsewhere), two y columns at a time.
 *
 * NOTE(review): on allocation failure the fallback call passes
 * (type_U8, type_U8, mode_Sat) although this entry point is S8 input /
 * Mod semantics - verify these argument values against
 * mlib_MatrixMul_type's contract; they look like a copy-paste from the
 * U8 Sat variant.
 */
mlib_status
__mlib_MatrixMul_S16_S8_Mod(
	mlib_s16 *z,
	const STYPE * x,
	const STYPE * y,
	mlib_s32 m,
	mlib_s32 l,
	mlib_s32 n)
{
	mlib_d64 *px, *buff_x, *buff_y, *pbuff_x, *pbuff_y;
	mlib_d64 array[MAX_SIZE];
	mlib_d64 xx, x0, x1, y0, y1, ds0, ds1, dr0, dr1, dr2, dr3;
	mlib_s32 size, i, j, k, l8;

	if (!((m > 0) && (l > 0) && (n > 0))) {
		return (MLIB_FAILURE);
	}

	/* l8 = 64-bit words per (padded) row of length l */
	l8 = (l + 7) / 8;
	size = l8 * n + 2 * l8 + 4;

	if (size <= MAX_SIZE) {
		buff_y = array;
	} else {
		buff_y = (mlib_d64 *)__mlib_malloc(size * sizeof (mlib_d64));

		if (buff_y == NULL) {
			return mlib_MatrixMul_type(type_U8, type_U8,
			    mode_Sat, x, y, m, l, n, n, z);
		}
	}

	buff_x = buff_y + l8 * n;
	pbuff_y = buff_y;

/* transpose y matrix */
	for (i = 0; i < n; i++) {
		mlib_u8 *py = (mlib_u8 *)y + i;
		mlib_u8 *pp = (mlib_u8 *)pbuff_y;

		/* two column bytes per s16 store, biased by 0x80 */
		for (j = 0; j <= (l - 4); j += 4) {
			((mlib_s16 *)pp)[0] =
			    ((py[0] << 8) | py[n]) ^ 0x8080;
			((mlib_s16 *)pp)[1] =
			    ((py[2 * n] << 8) | py[3 * n]) ^ 0x8080;
			py += 4 * n;
			pp += 4;
		}

		for (; j < l; j++) {
			(*pp++) = *py ^ 0x80;
			py += n;
		}

		/* zero-pad the row to a multiple of 8 bytes */
		for (; j < 8 * l8; j++) {
			(*pp++) = 0;
		}

		pbuff_y += l8;
	}

	for (j = 0; j < m; j++) {
		mlib_s32 x_sum = 0;

		/* bias compensation: 128 * sum of the x row */
		for (i = 0; i < l; i++) {
			x_sum += x[i];
		}

		x_sum <<= 7;

		pbuff_x = buff_x;
		pbuff_y = buff_y;

/* copy x line */
		px = vis_alignaddr((void *)x, 0);
		x1 = vis_ld_d64_nf(px);
		px++;
		xx = 0;

		for (i = 0; i < l8; i++) {
			x0 = x1;
			x1 = vis_ld_d64_nf(px);
			px++;
			xx = vis_faligndata(x0, x1);
			pbuff_x[2 * i] =
			    vis_fpmerge(vis_read_hi(xx), vis_fzeros());
			pbuff_x[2 * i + 1] =
			    vis_fpmerge(vis_read_lo(xx), vis_fzeros());
		}

/* loop on y lines */
		for (i = 0; i < n; i += 2) {
			mlib_d64 *px = pbuff_x;
			mlib_d64 *py0 = pbuff_y;
			/* odd n: last iteration reuses the same column */
			mlib_d64 *py1 = (i + 1 < n) ? (py0 + l8) : py0;

			ds0 = ds1 = vis_fzero();
			LOAD;
			MUL;
			LOAD;
#pragma pipeloop(0)
			for (k = 0; k < l8; k++) {
				SUM;
				MUL;
				LOAD;
			}

			/* horizontal reduction of the lane accumulators */
			ds0 = vis_freg_pair(vis_fpadd16s(vis_read_hi(ds0),
			    vis_read_lo(ds0)),
			    vis_fpadd16s(vis_read_hi(ds1),
			    vis_read_lo(ds1)));

			z[i] = ((mlib_s16 *)&ds0)[0] +
			    ((mlib_s16 *)&ds0)[1] - x_sum;

			if (i + 1 < n) {
				z[i + 1] = ((mlib_s16 *)&ds0)[2] +
				    ((mlib_s16 *)&ds0)[3] - x_sum;
			}

			pbuff_y += 2 * l8;
		}

		z += n;
		x += l;
	}

	if (size > MAX_SIZE) {
		__mlib_free(buff_y);
	}

	return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorNorm_U8_Sat - Euclidean norm of a u8 vector:
 * z[0] = sqrt(sum(x[i]^2)).
 *
 *   z - receives the norm (one mlib_d64)
 *   x - source vector
 *   n - element count; MLIB_FAILURE if n <= 0
 *
 * Edge bytes outside [x, x+n) are masked to zero via vis_pst_8 into
 * the local edge[] buffer, so whole 8-byte words can be processed.
 * NORM_U8 / SUM_U8 are project macros producing/accumulating the
 * squared lane sums into ds/ds1 (32-bit partials, flushed to the
 * double accumulator every MAX_LOOP words to avoid overflow).
 */
mlib_status
__mlib_VectorNorm_U8_Sat(
	mlib_d64 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *pxend, *px = (mlib_u8 *)x;
	mlib_d64 *dpx, *dpxend;
	mlib_d64 sum = 0.0;
	mlib_d64 dx, dr1, dr2, dr3, dr4, dr5, dr6;
	mlib_d64 ds, ds1;
	mlib_d64 edge[2];
	mlib_f32 fone = vis_to_float(0x100);
	mlib_f32 fzero = vis_fzeros();
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask;

	if (n <= 0)
		return (MLIB_FAILURE);

	edge[0] = edge[1] = 0;
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	pxend = px + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
	/* first word: keep only the bytes inside the vector */
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];

	while ((mlib_addr)dpx < (mlib_addr)dpxend) {
		d_left = dpxend - dpx;

		/* cap the inner run so 32-bit partials cannot overflow */
		if (d_left > MAX_LOOP)
			d_left = MAX_LOOP;
		ds = ds1 = 0.0;
		for (; d_left > 0; d_left--) {
			NORM_U8;
			SUM_U8;
			dpx++;
			dx = dpx[0];
		}

		ds = vis_fpadd32(ds, ds1);
		fsum = vis_fpadd32s(vis_read_hi(ds), vis_read_lo(ds));
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	/* last (possibly partial) word */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		NORM_U8;
		ds = vis_fpadd32(dr3, dr4);
		ds1 = vis_fpadd32(dr5, dr6);
		ds = vis_fpadd32(ds, ds1);
		fsum = vis_fpadd32s(vis_read_hi(ds), vis_read_lo(ds));
		sum += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = mlib_sqrt(sum);
	return (MLIB_SUCCESS);

#undef MAX_LOOP
}
/*
 * __mlib_VectorConvert_U8_S8_Sat - convert an s8 vector to u8 with
 * saturation (negative values clamp to 0).
 *
 *   z - destination u8 vector
 *   x - source s8 vector
 *   n - element count
 *
 * Short vectors use the scalar PACK_S_U macro; otherwise the head is
 * converted scalar until dst is 8-byte aligned, the middle runs 8
 * elements per step in VIS (sign-extend to s16 lanes, then
 * vis_fpack16_pair clamps at 0), and the tail is scalar.
 *
 * Fix: the second unaligned loop used the no-effect expression
 * statement `for (i; ...)`; changed to the conventional empty clause
 * `for (; ...)` as in every sibling loop (no behavior change).
 */
mlib_status
__mlib_VectorConvert_U8_S8_Sat(
	mlib_u8 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i, off;
	mlib_s8 c;
	/* 0x0100 lanes: fmul8sux16 == arithmetic >> 8 per 16-bit lane */
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 zero = vis_fzeros();

	if (length < 16) {
		/* scalar fallback for short vectors (macro returns) */
		PACK_S_U(mlib_s8, mlib_u8);
	}

/*
 * First, try to align destination address for 8 bytes.
 */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) < 0 ? 0 : c;
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */
	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */
		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1),
			    zero), four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1),
			    zero), four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}

/*
 * Then loop with step == 2.  Unroll for 2 iterations.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1),
			    zero), four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1),
			    zero), four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1),
			    zero), four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1),
			    zero), four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}
	} else {

/*
 * Source address has arbitrary alignment.  Use vis_alignaddr() and
 * vis_faligndata() functions.
 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		off = (mlib_addr)src & 7;
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */
		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}

/*
 * Then loop with step == 2.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d6 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8sux16(d6, four_16_ones);
			d6 = vis_faligndata(d6, d6);
			d5 = vis_fmul8sux16(d6, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] =
		    (c = src[even_length + i]) < 0 ? 0 : c;
	return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorConvert_S8_U8_Sat - convert a u8 vector to s8 with
 * saturation (values > 127 clamp to MLIB_S8_MAX).
 *
 *   z - destination s8 vector
 *   x - source u8 vector
 *   n - element count
 *
 * Short vectors use the scalar PACK_U_S macro; otherwise scalar head
 * to 8-byte-align dst, VIS middle (8 elements/step), scalar tail.
 * The VIS clamp works in biased space: expand u8 to 16-bit lanes, add
 * 0x80 per lane (dsp), pack with saturation, then XOR 0x80 per byte
 * (rst) to undo the bias.
 *
 * Fix: fzero was declared mlib_d64 although vis_fzeros() returns
 * mlib_f32 and fzero is passed to vis_fpmerge, whose operands are
 * mlib_f32 (cf. the sibling converters which declare it mlib_f32).
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *src = (void *)x;
	mlib_s8 *dst = z;
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_u8 c;
	mlib_d64 dsp = vis_to_double_dup(0x800080);
	mlib_d64 rst = vis_to_double_dup(0x80808080);
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		/* scalar fallback for short vectors (macro returns) */
		PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
	}

/*
 * First, try to align destination address for 8 bytes.
 */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */
	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */
		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step == 2.  Unroll for 2 iterations.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d4 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			d2 = vis_fpmerge(fzero, vis_read_hi(d4));
			d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
			(*ddst++) = vis_fxor(d4, rst);
		}
	} else {

/*
 * Source address has arbitrary alignment.  Use vis_alignaddr() and
 * vis_faligndata() functions.
 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */
		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d1 = vis_faligndata(d1, d2);
			d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
			d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpadd16(dsp, d4);
			d1 = vis_fpack16_pair(d3, d4);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step == 2.
 */
#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d6 = vis_faligndata(d1, d2);
			d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d3 = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d6 = vis_fpack16_pair(d4, d5);
			(*ddst++) = vis_fxor(d3, rst);
			(*ddst++) = vis_fxor(d6, rst);
		}
	}

	/* scalar tail */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] =
		    (c = src[even_length + i]) > MLIB_S8_MAX ?
		    MLIB_S8_MAX : c;
	return (MLIB_SUCCESS);
}
/*
 * FUNC(MxN) - ordered (MxN matrix) dithering of a true-color image to
 * a 1-bit indexed image through an inverse colormap.
 *
 *   dst      - 1-bit indexed destination image
 *   src      - true-color source image
 *   dmask    - n rows of m dither-matrix coefficients (rows 0..2)
 *   m, n     - dither matrix dimensions
 *   scale    - fixed-point scale of the dither coefficients
 *   colormap - mediaLib colormap with inverse lookup table
 *
 * A per-row kernel of bias values (half_step0 - dmask*step0*2^-scale)
 * is precomputed for every matrix row, replicated across the image
 * width.  Each source row is biased and clamped via VIS and then
 * quantized by mlib_ImageColorTrue2IndexLine_U8_BIT_1.
 *
 * NOTE(review): dmask1/dmask2, method, p_dim and several other locals
 * are unused here - presumably this body is instantiated by FUNC()
 * for multiple NCHAN/variant combinations; confirm before removing.
 */
mlib_status FUNC(
    MxN) (
	mlib_image *dst,
	const mlib_image *src,
	const mlib_s32 **dmask,
	mlib_s32 m,
	mlib_s32 n,
	mlib_s32 scale,
	const void *colormap)
{
	mlib_type stype, dtype;
	const mlib_s32 *dmask0 = dmask[0], *dmask1 = dmask[1], *dmask2 =
	    dmask[2];
	mlib_s32 method = mlib_ImageGetMethod(colormap);
	mlib_u8 *sl, *dl;
	mlib_s32 schan, dchan, sll, dll, sw, sh, dw, dh, num_blk;
	mlib_s32 off, off1, kw, mstep, line_size, kern_size, xsize8, i, j, k;
	mlib_d64 *pbuff;
	mlib_u8 *p_dim;
	mlib_s16 *kern, *pkern;
	mlib_d64 *dkern;
	mlib_d64 dscale, dscale0, dscale1, dscale2;
	mlib_d64 ss, d0, d1;
	mlib_f32 fzeros = vis_fzeros();
	mlib_s32 step0, half_step0, v0;
	mlib_s32 bit_offset = mlib_ImageGetBitOffset(dst);
	mlib_u8 *p_lut;

	MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl);
	MLIB_IMAGE_GET_ALL_PARAMS(src, stype, schan, sw, sh, sll, sl);
	p_lut = (mlib_u8 *)mlib_ImageGetLutInversTable(colormap);
	/* quantization step between the first two LUT levels */
	step0 = abs(p_lut[1] - p_lut[0]);

	num_blk = (sw + (m - 1)) / m;
	mstep = m * NCHAN;
	line_size = (mstep * num_blk + 7) & ~7;
	xsize8 = (NCHAN * sw + 7) / 8;

	/* fold scale down in chunks of 30 to avoid 1<<scale overflow */
	dscale = 1.0;
	while (scale > 30) {
		dscale *= 1.0 / (1 << 30);
		scale -= 30;
	}

	dscale /= (1 << scale);

	dscale0 = dscale * step0;
	half_step0 = (step0 - 1) >> 1;

	kern_size = n * line_size;
	kern = __mlib_malloc(kern_size * sizeof (mlib_s16));

	if (kern == NULL)
		return (MLIB_FAILURE);

	/* bias kernel: one value per (row, column) of the matrix,
	 * replicated for every m-wide block across the line */
	for (j = 0; j < n; j++) {
		for (i = 0; i < m; i++) {
			pkern = kern + j * line_size + i;
			v0 = half_step0 -
			    (mlib_s32)(dmask0[j * m + i] * dscale0);

			for (k = 0; k < num_blk; k++) {
				pkern[k * mstep] = v0;
			}
		}
	}

	pbuff = __mlib_malloc(xsize8 * sizeof (mlib_d64) + 16);

	if (pbuff == NULL) {
		__mlib_free(kern);
		return (MLIB_FAILURE);
	}

	pkern = kern;

	vis_write_gsr(7 << 3);

	for (j = 0; j < sh; j++) {
		dkern = (mlib_d64 *)pkern;

		if ((mlib_s32)sl & 7) {
			/* unaligned source row: non-faulting loads */
			mlib_u8 *sp = sl;

#pragma pipeloop(0)
			for (i = 0; i < xsize8; i++) {
				LOAD_NA_NF(ss, sp);
				d0 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_hi(ss)), dkern[2 * i]);
				d1 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_lo(ss)), dkern[2 * i + 1]);
				pbuff[i] = vis_fpack16_pair(d0, d1);
				sp += 8;
			}
		} else {
			mlib_d64 *sp = (mlib_d64 *)sl;

#pragma pipeloop(0)
			for (i = 0; i < xsize8; i++) {
				ss = sp[i];
				d0 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_hi(ss)), dkern[2 * i]);
				d1 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_lo(ss)), dkern[2 * i + 1]);
				pbuff[i] = vis_fpack16_pair(d0, d1);
			}
		}

		/* advance to next matrix row, wrapping every n rows */
		pkern += line_size;

		if (pkern >= kern + kern_size)
			pkern = kern;

		mlib_ImageColorTrue2IndexLine_U8_BIT_1((mlib_u8 *)pbuff,
		    dl, bit_offset, sw, colormap);

		sl += sll;
		dl += dll;
	}

	__mlib_free(pbuff);
	__mlib_free(kern);

	return (MLIB_SUCCESS);
}