void ADD_SUFF(IntArgbBmToIntArgbConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 dd, dmask, dFF; mlib_s32 i, i0, j, x, mask; if (dstScan == 4*width && srcScan == 4*width) { width *= height; height = 1; } dmask = vis_to_double_dup(0xFFFFFF); dFF = vis_to_double_dup(0xFFFFFFFF); for (j = 0; j < height; j++) { mlib_s32 *src = srcBase; mlib_s32 *dst = dstBase; i = i0 = 0; if ((mlib_s32)dst & 7) { x = src[i]; dst[i] = (x << 7) >> 7; i0 = 1; } #pragma pipeloop(0) for (i = i0; i <= (mlib_s32)width - 2; i += 2) { mlib_u8 *pp0 = (mlib_u8*)(src + i); mlib_u8 *pp1 = (mlib_u8*)(src + i + 1); dd = vis_freg_pair(*(mlib_f32*)pp0, *(mlib_f32*)pp1); dd = vis_fand(dd, dmask); #if 1 mask = ((*pp0 & 1) << 7) | ((*pp1 & 1) << 3); *(mlib_d64*)(dst + i) = dd; vis_pst_8(dFF, dst + i, mask); #else mask = ((*pp0 & 1) << 1) | (*pp1 & 1); dd = vis_for(dd, ((mlib_d64*)vis_amask_arr)[mask]); *(mlib_d64*)(dst + i) = dd; #endif } if (i < width) { x = src[i]; dst[i] = (x << 7) >> 7; }
void ADD_SUFF(IntRgbxToIntArgbConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 dd, mask; mlib_s32 i, i0, j; if (dstScan == 4*width && srcScan == 4*width) { width *= height; height = 1; } mask = vis_to_double_dup(0xFF000000); vis_alignaddr(NULL, 7); for (j = 0; j < height; j++) { mlib_u32 *src = srcBase; mlib_u32 *dst = dstBase; i = i0 = 0; if ((mlib_s32)dst & 7) { dst[i] = 0xff000000 | (src[i] >> 8); i0 = 1; } #pragma pipeloop(0) for (i = i0; i <= (mlib_s32)width - 2; i += 2) { dd = vis_freg_pair(((mlib_f32*)src)[i], ((mlib_f32*)src)[i + 1]); dd = vis_faligndata(dd, dd); *(mlib_d64*)(dst + i) = vis_for(dd, mask); } if (i < width) { dst[i] = 0xff000000 | (src[i] >> 8); }
/*
 * Merge three planar 16-bit channel arrays into one interleaved 3-channel
 * array: colors[3*i] = color1[i], colors[3*i+1] = color2[i],
 * colors[3*i+2] = color3[i] for i in [0, n)  (contract shown by the scalar
 * tail loop below).
 *
 * Main loop handles 8 samples per channel per iteration using VIS byte
 * shuffles; a second block handles a leftover group of 4; the tail loop
 * finishes element-wise.
 *
 * NOTE(review): assumes colors/color1/color2/color3 are 8-byte aligned —
 * the pointers are cast directly to mlib_d64* — confirm against callers.
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoColorMerge3_S16(
    mlib_s16 *colors,
    const mlib_s16 *color1,
    const mlib_s16 *color2,
    const mlib_s16 *color3,
    mlib_s32 n)
{
	/* 8-byte views of the destination and the three source planes */
	mlib_d64 *dp = (mlib_d64 *)colors;
	mlib_d64 *sp0 = (mlib_d64 *)color1;
	mlib_d64 *sp1 = (mlib_d64 *)color2;
	mlib_d64 *sp2 = (mlib_d64 *)color3;
	mlib_d64 sd0, sd1, sd2, sd3, sd4, sd5;
	mlib_d64 dd0, dd1, dd2, dd3, dd4, dd5;
	mlib_s32 i;

	/* 8 samples per channel (2 doubles each) -> 6 output doubles */
#pragma pipeloop(1)
	for (i = 0; i <= (n - 8); i += 8) {
		sd0 = sp0[0];
		sd1 = sp1[0];
		sd2 = sp2[0];
		sd3 = sp0[1];
		sd4 = sp1[1];
		sd5 = sp2[1];
		/*
		 * vis_write_bmask sets the global byte-shuffle pattern used
		 * by the following vis_bshuffle calls; the masks below pick
		 * bytes so 16-bit samples from the two operands interleave
		 * into channel-triples.  Statement order is load-bearing:
		 * each bshuffle uses the most recently written bmask.
		 */
		vis_write_bmask(0x018923ab, 0);
		dd0 = vis_bshuffle(sd0, sd1);
		dd3 = vis_bshuffle(sd1, sd2);
		dd2 = vis_bshuffle(sd3, sd4);
		dd5 = vis_bshuffle(sd4, sd5);
		vis_write_bmask(0x45cd67ef, 0);
		dd1 = vis_bshuffle(sd0, sd1);
		dd4 = vis_bshuffle(sd3, sd4);
		vis_write_bmask(0x01238945, 0);
		dp[0] = vis_bshuffle(dd0, sd2);
		dp[3] = vis_bshuffle(dd2, sd5);
		dp[1] = vis_freg_pair(vis_read_lo(dd3), vis_read_hi(dd1));
		dp[4] = vis_freg_pair(vis_read_lo(dd5), vis_read_hi(dd4));
		vis_write_bmask(0xcd4567ef, 0);
		dp[2] = vis_bshuffle(dd1, sd2);
		dp[5] = vis_bshuffle(dd4, sd5);
		sp0 += 2;
		sp1 += 2;
		sp2 += 2;
		dp += 6;
	}

	/* leftover group of 4 samples per channel (half an iteration) */
	if (i <= (n - 4)) {
		sd0 = sp0[0];
		sd1 = sp1[0];
		sd2 = sp2[0];
		vis_write_bmask(0x018923ab, 0);
		dd0 = vis_bshuffle(sd0, sd1);
		dd3 = vis_bshuffle(sd1, sd2);
		vis_write_bmask(0x45cd67ef, 0);
		dd1 = vis_bshuffle(sd0, sd1);
		vis_write_bmask(0x01238945, 0);
		dp[0] = vis_bshuffle(dd0, sd2);
		dp[1] = vis_freg_pair(vis_read_lo(dd3), vis_read_hi(dd1));
		vis_write_bmask(0xcd4567ef, 0);
		dp[2] = vis_bshuffle(dd1, sd2);
		sp0++;
		sp1++;
		sp2++;
		dp += 3;
		/* NOTE(review): i is intentionally not advanced here; the
		 * tail loop below re-reads these 4 samples only if i < n,
		 * which cannot happen after this branch when n - i == 4...
		 * actually i retains its pre-branch value, so the scalar
		 * loop would rewrite the same samples — behavior matches
		 * since the values written are identical.  TODO confirm
		 * against the original mediaLib source. */
	}

	/* scalar tail: plain interleave, sign bits preserved via u16 copy */
	for (; i < n; i++) {
		colors[3 * i] = ((mlib_u16 *)color1)[i];
		colors[3 * i + 1] = ((mlib_u16 *)color2)[i];
		colors[3 * i + 2] = ((mlib_u16 *)color3)[i];
	}

	return (MLIB_SUCCESS);
}
void mlib_v_VideoColorYUV2RGB444_all_align( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4]; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f375); mlib_d64 k02 = vis_to_double_dup(0x3317e5fa); mlib_d64 k11 = vis_to_double_dup(0xf3754097); mlib_d64 k12 = vis_to_double_dup(0xe5fa0000); mlib_d64 k21 = vis_to_double_dup(0x40970000); mlib_d64 k22 = vis_to_double_dup(0x00003317); mlib_d64 c_0 = vis_to_double_dup(0xe42010f4); mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60); mlib_d64 c_2 = vis_to_double_dup(0xdd60e420); mlib_d64 k_0 = vis_to_double_dup(0x25432543); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = n >> 2; buff2 = pbuff_arr2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)u; sf2 = (mlib_f32 *)v; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; mlib_d64 d_0235, d_xx14, d_23xx, d_0145; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); /* * merge buff values to 3-channel array */ d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); pfd[0] = vis_read_hi(d_0145); pfd[1] = vis_read_hi(d_23xx); pfd[2] = vis_read_lo(d_0145); buff2 += 
2; pfd += 3; } if ((mlib_u8 *)pfd <= dend) { mlib_d64 d_0235, d_xx14, d_23xx, d_0145; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_23xx); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; u += n; v += n; rgb += 3 * n; size -= n; } while (size); }
/*
 * Convert planar JFIF YCbCr 4:4:4 to interleaved RGB (3 bytes/pixel).
 * Processes the input in chunks of at most 2*BUFF_SIZE samples; each
 * main-loop iteration consumes 4 samples per plane and writes 12 bytes.
 *
 * Per-pixel math is a fixed-point 3x3 matrix multiply: fmul8x16 applies
 * the 16-bit coefficients (k*), fpadd16 adds the bias constants (c_*),
 * fpack16 clips/packs to 8 bits under the GSR scale factor set below.
 *
 * Returns MLIB_FAILURE for size <= 0, MLIB_SUCCESS otherwise.
 * NOTE(review): the plane pointers are cast directly to mlib_f32*, so
 * word alignment of y/cb/cr/rgb is presumably a precondition — confirm
 * against callers.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd;
	mlib_f32 fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;
	mlib_d64 tmp_arr64[2];
	/* conversion coefficients / biases, duplicated in both words */
	mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
	mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
	mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
	mlib_d64 k12 = vis_to_double_dup(0xe9260000);
	mlib_d64 k21 = vis_to_double_dup(0x38b40000);
	mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
	mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
	mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
	mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
	mlib_d64 k_0 = vis_to_double_dup(0x20002000);

	if (size <= 0)
		return (MLIB_FAILURE);

	/* GSR: pack scale factor 2, align offset 2 (used by bshuffle path) */
	vis_write_gsr((2 << 3) + 2);
	/* fixed byte-shuffle pattern used by every vis_bshuffle below */
	vis_write_bmask(0x0489AB37, 0);

	do {
/* loop on buffer size */
		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

		/* (n-1)>>2: the final (possibly partial) group of 4 is
		 * always handled by the edge-masked tail block below */
		m = (n - 1) >> 2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)cb;
		sf2 = (mlib_f32 *)cr;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
			    s_0;
			mlib_d64 d_0235, d_0145;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);
			/* matrix multiply + bias, one row per channel */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);
			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);
			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);
			/* pack the three channel vectors and shuffle them
			 * into interleaved 3-byte pixel order */
			d_0235 = vis_fpack16_pair(s00, s10);
			s20 = vis_freg_pair(vis_fpack16(s20), fzero);
			d_0145 = vis_bshuffle(d_0235, s20);
			d_0235 = vis_fpack32(d_0235, d_0235);
			d_0235 = vis_fpmerge(vis_read_hi(d_0235),
			    vis_read_lo(d_0235));
			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_0235);
			pfd[2] = vis_read_lo(d_0145);
			pfd += 3;
		}

/*
 * last pixels
 */
		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22,
			    s_0;
			mlib_d64 d_0235, d_xx14, d_0145;
			mlib_f32 x0, x1, x2;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			x0 = *sf0;
			x1 = *sf1;
			x2 = *sf2;
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);
			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);
			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);
			d_0235 = vis_fpack16_pair(s00, s10);
			d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero);
			d_0145 = vis_bshuffle(d_0235, d_xx14);
			d_0235 = vis_fpack32(d_0235, d_0235);
			d_0235 = vis_fpmerge(vis_read_hi(d_0235),
			    vis_read_lo(d_0235));

			/* stage through tmp_arr64 and store only the bytes
			 * inside the chunk via edge-masked vis_pst_8 */
			emask = vis_edge8(pfd, dend);

			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_0235);
			tmp_arr32[2] = vis_read_lo(d_0145);

			vis_pst_8(tmp_arr64[0], pfd, emask);

			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

		y += n;
		cb += n;
		cr += n;
		rgb += 3 * n;
		size -= n;
	} while (size);

	return (MLIB_SUCCESS);
}
/*
 * Convert interleaved ARGB to planar JFIF YCbCr with 4:2:2 chroma
 * subsampling: n luma samples are produced in y[], and n/2 chroma pairs
 * in cb[]/cr[] (cbend = cb + (n >> 1)).
 *
 * Eight pixels (four mlib_d64 loads of ARGB data) are processed per
 * iteration.  Channel separation and the weighted sums are done by the
 * project macros CHANNELSEPARATE_U8_422 / CHANNELWEIGHT_U8_2p; the K11..
 * K33 coefficient macros are scaled to fixed point here (8192 for Y,
 * 4096 for Cb/Cr).
 *
 * NOTE(review): sd04/sd26/sd15/sd37 look unused but are presumably
 * referenced inside the CHANNELSEPARATE_U8_422 macro expansion — do not
 * remove without checking the macro definition.
 * NOTE(review): pointers are cast directly to mlib_d64*/mlib_f32*, so
 * 8-byte alignment of argb/y and 4-byte alignment of cb/cr appear to be
 * preconditions — confirm against callers.
 */
mlib_status
__mlib_VideoColorARGB2JFIFYCC422(
    mlib_u8 *y,
    mlib_u8 *cb,
    mlib_u8 *cr,
    const mlib_u8 *argb,
    mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y;
	mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr;
	mlib_u8 *yend = y + n, *cbend = cb + (n >> 1);
	mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37;
	mlib_d64 dh0, dh1, dl0, dl1, z0, z1;
	mlib_s32 i;
	/* fixed-point conversion coefficients */
	mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192));
	mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192));
	mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192));
	mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096));
	mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096));
	mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096));
	mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096));
	mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096));
	mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096));
	/* chroma bias (128) and luma rounding offsets */
	mlib_d64 off128 = vis_to_double_dup(0x10101010);
	mlib_d64 off0 = vis_to_double_dup(0x00100010);

	if (n <= 0)
		return (MLIB_FAILURE);

	/* GSR pack scale factor = 2 for the vis_fpack16 calls below */
	vis_write_gsr(2 << 3);
	n = n >> 3;	/* full groups of 8 pixels */

#pragma pipeloop(0)
	for (i = 0; i < n; i++) {
		sd01 = (*sp++);
		sd23 = (*sp++);
		sd45 = (*sp++);
		sd67 = (*sp++);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1,
		    dl0, dl1);
		/* Y plane: 8 luma samples */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));
		/* Cb plane: adjacent pairs averaged via fpadd16 (4:2:2) */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		pcb[0] = vis_fpack16(vis_fpadd16(z0, z1));
		/* Cr plane, same subsampling */
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		pcr[0] = vis_fpack16(vis_fpadd16(z0, z1));
		py++;
		pcb++;
		pcr++;
	}

	/* tail: fewer than 8 pixels left — use non-faulting loads and
	 * edge-masked stores so nothing beyond the arrays is touched */
	if ((mlib_u8 *)pcb < cbend) {
		mlib_d64 yd;
		mlib_f32 cbf, crf;
		mlib_s32 ymask, cmask;

		sd01 = (*sp++);
		sd23 = vis_ld_d64_nf(sp);
		sp++;
		sd45 = vis_ld_d64_nf(sp);
		sp++;
		sd67 = vis_ld_d64_nf(sp);
		CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1,
		    dl0, dl1);
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k11, k12, k13, off0, z0, z1);
		z1 = vis_fpadd16(z1, off0);
		yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1));
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k21, k22, k23, off128, z0, z1);
		cbf = vis_fpack16(vis_fpadd16(z0, z1));
		CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1),
		    vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1),
		    vis_read_lo(dl1), k31, k32, k33, off128, z0, z1);
		crf = vis_fpack16(vis_fpadd16(z0, z1));

		ymask = vis_edge8(py, yend - 1);
		vis_pst_8(yd, py, ymask);
		cmask = vis_edge8(pcb, cbend - 1);

		/* chroma is only 4 bytes wide; pick which half of the
		 * 8-byte store window the valid bytes land in */
		if (cmask & 0xf0) {
			vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb,
			    cmask);
			vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr,
			    cmask);
		} else {
			vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1,
			    cmask);
			vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1,
			    cmask);
		}
	}

	return (MLIB_SUCCESS);
}
/*
 * Merge a real vector rr[] and an imaginary vector ii[] into an
 * interleaved complex vector z[] = { rr[0], ii[0], rr[1], ii[1], ... }.
 *
 * Strategy: align the destination to 8 bytes first (possibly emitting a
 * lone real sample, tracked by `odd`); then, if both sources share the
 * same 8-byte phase, use 64-bit loads plus the MERGE32 shuffle macro,
 * otherwise fall back to pairing 32-bit loads with vis_freg_pair.
 * When `odd` is set the roles of the source streams swap (imaginary
 * first) so the interleave order stays correct.
 *
 * Short vectors (n < 4) are delegated to the scalar MERGE macro, which
 * presumably returns from this function — TODO confirm the macro body.
 * Returns MLIB_SUCCESS for n > 0, MLIB_FAILURE otherwise.
 */
mlib_status
__mlib_VectorMerge_S32C_S32(
    mlib_s32 *z,
    const mlib_s32 *rr,
    const mlib_s32 *ii,
    mlib_s32 n)
{
/* pointer to real source vector */
	mlib_u32 *src_r = (mlib_u32 *)rr;
/* pointer to imaginary source vector */
	mlib_u32 *src_i = (mlib_u32 *)ii;
/* pointer to resultant complex vector */
	mlib_u32 *dst = (mlib_u32 *)z;
	mlib_s32 len = n + n, odd = 0, i;
	mlib_d64 d1, d2, d3, d4;

	/* shuffle pattern consumed by the MERGE32 macro below */
	vis_write_bmask(0x012389ab, 0);

	if (n < 4) {
		MERGE(mlib_s32, rr, ii, n, z);
	}

	/* align destination to 8 bytes; a lone real sample goes out first */
	if (((mlib_addr)dst) & 7) {
		(*dst++) = (*src_r++);
		len--;
		odd = 1;
	}

	/* both sources in the same 8-byte phase -> 64-bit load path */
	if (!(((mlib_addr)src_r ^ (mlib_addr)src_i) & 7)) {
		if (odd) {
			/* imaginary stream leads after the odd prologue */
			if (((mlib_addr)src_i & 7) && (len >= 2)) {
				(*dst++) = (*src_i++);
				(*dst++) = (*src_r++);
				len -= 2;
			}
#pragma pipeloop(0)
			for (i = 0; i <= (len - 4); i += 4) {
				d1 = *((mlib_d64 *)src_i);
				src_i += 2;
				d2 = *((mlib_d64 *)src_r);
				src_r += 2;
				MERGE32;
				((mlib_d64 *)dst)[0] = d3;
				((mlib_d64 *)dst)[1] = d4;
				dst += 4;
			}

			if (i <= len - 2) {
				(*dst++) = (*src_i++);
				(*dst++) = (*src_r++);
			}

			/* final imaginary sample pairs with the prologue */
			(*dst++) = (*src_i++);
		} else {
			if ((mlib_addr)src_i & 7) {
				(*dst++) = (*src_r++);
				(*dst++) = (*src_i++);
				len -= 2;
			}
#pragma pipeloop(0)
			for (i = 0; i <= (len - 4); i += 4) {
				d1 = *((mlib_d64 *)src_r);
				src_r += 2;
				d2 = *((mlib_d64 *)src_i);
				src_i += 2;
				MERGE32;
				((mlib_d64 *)dst)[0] = d3;
				((mlib_d64 *)dst)[1] = d4;
				dst += 4;
			}

			if (i <= len - 2) {
				(*dst++) = (*src_r++);
				(*dst++) = (*src_i++);
			}
		}
	} else {
		/* mixed source alignment: build each output double from two
		 * 32-bit loads with vis_freg_pair */
		mlib_f32 fsrc_r, fsrc_i;

		if (odd) {
#pragma pipeloop(0)
			for (i = 0; i <= (len - 2); i += 2) {
				fsrc_r = *((mlib_f32 *)src_r);
				src_r++;
				fsrc_i = *((mlib_f32 *)src_i);
				src_i++;
				d1 = vis_freg_pair(fsrc_i, fsrc_r);
				((mlib_d64 *)dst)[0] = d1;
				dst += 2;
			}

			(*dst++) = (*src_i++);
		} else {
			mlib_f32 fsrc_r, fsrc_i;

#pragma pipeloop(0)
			for (i = 0; i < len; i += 2) {
				fsrc_r = *((mlib_f32 *)src_r);
				src_r++;
				fsrc_i = *((mlib_f32 *)src_i);
				src_i++;
				d1 = vis_freg_pair(fsrc_r, fsrc_i);
				((mlib_d64 *)dst)[0] = d1;
				dst += 2;
			}
		}
	}

	return ((n > 0) ? MLIB_SUCCESS : MLIB_FAILURE);
}
/*
 * 3x3 convolution for 4-channel 8-bit images, "no write" edges (the
 * one-pixel border is left untouched: dw/dh are shrunk by 2 and the
 * destination starts at adr_dst + dlb + 4).
 *
 * Pipeline per output row:
 *   1. three source rows are staged into 8-byte aligned intermediate
 *      buffers (PREPARE_TO_LOAD_LINE / LOAD_LINE_INTO_BUFFER macros);
 *   2. the convolution proper accumulates 8x16 partial products into
 *      out0/out1 via the CONV_AU/CONV_AL macros and packs 8 result
 *      bytes per iteration with vis_fpack16_pair;
 *   3. the packed row is copied from dbuf to the (possibly unaligned)
 *      destination with vis_faligndata plus edge/channel-masked
 *      vis_pst_8 stores.
 *
 * `cmask` selects which of the 4 channels are written; it is replicated
 * below into a 16-bit per-byte mask so one vis_pst_8 mask covers two
 * 4-channel pixels.
 *
 * NOTE(review): tmp0/tmp1/dsa and k9k9 etc. are consumed inside the
 * CONV_* / LOAD_* macro expansions — do not remove "unused" locals here.
 */
mlib_status
mlib_v_conv3x3_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;
/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;
/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2;
/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;
/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3;
/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;
/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3;
/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;
/* data */
	mlib_d64 d1, d2, d_1, d_2, d21, d22;
/* data */
	mlib_d64 d3, d_3, d23;
	mlib_f32 k1k2, k3k4, k5k6, k7k8, k9k9;
/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1, sd00;
	mlib_s32 emask, cmask1;
	mlib_s32 rval, gsr_scale, i, j;

	/* pack scale factor derived from the caller's kernel scaling */
	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	/* replicate the 4-bit channel mask into 16 per-byte mask bits */
	cmask = ((cmask & 0xf) << 4) + (cmask & 0xf);
	cmask = (cmask << 8) + (cmask);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

	dw -= 2;	/* skip the 1-pixel border (no-write edges) */
	dw *= 4;	/* width in bytes (4 channels) */
	dh -= 2;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	d_a = adr_dst + dlb + 4;	/* first interior destination pixel */

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

		/* stage the next source row while convolving this one */
		PREPARE_TO_LOAD_LINE(sbuf3, sa2);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(8);

		/* GSR align offset 4: faligndata below yields data shifted
		 * by one 4-channel pixel */
		vis_alignaddr(s1, 4);
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;	/* start from rounding bias */
			/* column 0 taps */
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k3k4);
			CONV_AU(d3, k7k8);
			/* column 1 taps (shifted by one pixel) */
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			d23 = vis_faligndata(d3, d_3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k5k6);
			CONV_AL(d23, k7k8);
			/* column 2 taps (next aligned word) */
			CONV_AU(d_1, k3k4);
			CONV_AL(d_2, k5k6);
			CONV_AU(d_3, k9k9);
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

		ddst = dbuf;
/* prepare the destination addresses */
		dp = (mlib_d64 *)((mlib_addr)da & (~7));
		i = (mlib_addr)dp - (mlib_addr)da;	/* i <= 0 */
		/* right-shift the channel mask by the (negative) offset so
		 * mask bits line up with the aligned store window */
		cmask1 = cmask >> (-i);
		ddst = vis_alignaddr(ddst, i);

/* generate edge mask for the start point */
		emask = vis_edge8(da, dend);
		sd1 = ddst[0];

		if (emask != 0xff) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd0 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd0, dp++, emask & cmask1);
			ddst++;
			i += 8;
		}
#pragma pipeloop(0)
		for (; i <= (dw - 8); i += 8) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd00 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd00, dp++, cmask1);
			ddst++;
		}

		if (i < dw) {
			sd0 = vis_faligndata(sd1, ddst[1]);
			emask = vis_edge8(dp, dend);
			vis_pst_8(sd0, dp, emask & cmask1);
		}

		sa2 = sa2 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
void mlib_v_ImageChannelExtract_32_32( const mlib_f32 *src, mlib_s32 slb, mlib_f32 *dst, mlib_s32 dlb, mlib_s32 width, mlib_s32 height, mlib_s32 cmask) { mlib_d64 *sp, *dp; mlib_f32 *sa, *sl, *da, *dl, *dend; mlib_d64 sd0, sd1, sd2, sd3, dd0; mlib_s32 soff, xsize, cmask1, emask; mlib_s32 i, j; if (width <= 0) return; if ((8 * width == dlb) && (2 * slb == 3 * dlb)) { width *= height; height = 1; } width *= 2; if (cmask == 3) { src += 1; cmask = 6; } sa = sl = (void *)src; da = dl = dst; for (j = 0; j < height; j++) { cmask1 = cmask; xsize = width; dend = da + width - 1; if (((mlib_addr)da & 7) != 0) { (*da++) = *sa; sa++; xsize -= 1; cmask1 = ((cmask1 << 1) + 1) & 7; } dp = (mlib_d64 *)da; sp = (mlib_d64 *)((mlib_addr)sa & (~7)); soff = (sa - (mlib_f32 *)sp) & 1; if (cmask1 != 5) { if (cmask1 == 3) { cmask1 <<= soff; sp += soff; } else cmask1 >>= soff; vis_write_bmask(0x456789AB, 0); if (cmask1 == 3) { #pragma pipeloop(0) for (i = 0; i < xsize - 3; i += 4) { sd0 = (*sp++); sd1 = (*sp++); (*dp++) = vis_bshuffle(sd0, sd1); (*dp++) = (*sp++); } /* end point handling */ if ((mlib_addr)dp <= (mlib_addr)dend) { emask = vis_edge32(dp, dend); sd0 = vis_ld_d64_nf(sp); sd1 = vis_ld_d64_nf(sp + 1); dd0 = vis_bshuffle(sd0, sd1); vis_pst_32(dd0, dp++, emask); if ((mlib_addr)dp <= (mlib_addr)dend) { emask = vis_edge32(dp, dend); dd0 = vis_ld_d64_nf(sp + 2); vis_pst_32(dd0, dp, emask); } } } else { #pragma pipeloop(0) for (i = 0; i < xsize - 3; i += 4) { (*dp++) = (*sp++); sd0 = (*sp++); sd1 = (*sp++); (*dp++) = vis_bshuffle(sd0, sd1); } /* end point handling */ if ((mlib_addr)dp <= (mlib_addr)dend) { emask = vis_edge32(dp, dend); dd0 = vis_ld_d64_nf(sp); vis_pst_32(dd0, dp++, emask); if ((mlib_addr)dp <= (mlib_addr)dend) { emask = vis_edge32(dp, dend); sd0 = vis_ld_d64_nf(sp + 1); sd1 = vis_ld_d64_nf(sp + 2); dd0 = vis_bshuffle(sd0, sd1); vis_pst_32(dd0, dp, emask); } } } } else { if (soff == 0) { vis_write_bmask(0x012389AB, 0); #pragma pipeloop(0) for (i = 0; i < xsize - 3; i += 
4) { sd0 = (*sp++); sd1 = (*sp++); sd2 = (*sp++); (*dp++) = vis_bshuffle(sd0, sd1); (*dp++) = vis_freg_pair(vis_read_lo(sd1), vis_read_lo(sd2)); } /* end point handling */ if ((mlib_addr)dp <= (mlib_addr)dend) { emask = vis_edge32(dp, dend); sd0 = vis_ld_d64_nf(sp); sd1 = vis_ld_d64_nf(sp + 1); dd0 = vis_bshuffle(sd0, sd1); vis_pst_32(dd0, dp++, emask); if ((mlib_addr)dp <= (mlib_addr)dend) { emask = vis_edge32(dp, dend); sd2 = vis_ld_d64_nf(sp + 2); dd0 = vis_freg_pair(vis_read_lo (sd1), vis_read_lo(sd2)); vis_pst_32(dd0, dp, emask); } } } else { vis_write_bmask(0x4567CDEF, 0); sd0 = vis_ld_d64_nf(sp); sp++; #pragma pipeloop(0) for (i = 0; i < xsize - 3; i += 4) { sd1 = (*sp++); sd2 = (*sp++); sd3 = (*sp++); (*dp++) = vis_bshuffle(sd0, sd1); (*dp++) = vis_freg_pair(vis_read_hi(sd2), vis_read_hi(sd3)); sd0 = sd3; } /* end point handling */ if ((mlib_addr)dp <= (mlib_addr)dend) { emask = vis_edge32(dp, dend); sd1 = vis_ld_d64_nf(sp); dd0 = vis_bshuffle(sd0, sd1); vis_pst_32(dd0, dp++, emask); if ((mlib_addr)dp <= (mlib_addr)dend) { emask = vis_edge32(dp, dend); sd2 = vis_ld_d64_nf(sp + 1); sd3 = vis_ld_d64_nf(sp + 2); dd0 = vis_freg_pair(vis_read_hi (sd2), vis_read_hi(sd3)); vis_pst_32(dd0, dp, emask); } } } } sa = sl = (mlib_f32 *)((mlib_u8 *)sl + slb); da = dl = (mlib_f32 *)((mlib_u8 *)dl + dlb); }
/*
 * Matrix multiply z = x * y (m x l times l x n) with S8 inputs and S16
 * result, modulo arithmetic.
 *
 * Approach: y is transposed into a column-major scratch buffer with each
 * byte biased by +128 (the ^0x80 / ^0x8080 flips), rows padded with
 * zeros to a multiple of 8.  Each x row is expanded to 16-bit lanes
 * (vis_fpmerge with zeros).  The inner product runs over the LOAD / MUL /
 * SUM macros, two y columns at a time; the +128 bias contribution is
 * removed afterwards by subtracting x_sum (= 128 * sum of the x row,
 * hence the << 7).
 *
 * Falls back to the generic mlib_MatrixMul_type when the scratch buffer
 * cannot be allocated.  NOTE(review): the fallback passes type_U8 /
 * mode_Sat even though this entry point is S8/Mod — looks copy-pasted
 * from the U8 variant; confirm against the original mediaLib source.
 *
 * Returns MLIB_FAILURE for non-positive dimensions.
 */
mlib_status
__mlib_MatrixMul_S16_S8_Mod(
    mlib_s16 *z,
    const STYPE * x,
    const STYPE * y,
    mlib_s32 m,
    mlib_s32 l,
    mlib_s32 n)
{
	mlib_d64 *px, *buff_x, *buff_y, *pbuff_x, *pbuff_y;
	mlib_d64 array[MAX_SIZE];	/* on-stack scratch, used if it fits */
	mlib_d64 xx, x0, x1, y0, y1, ds0, ds1, dr0, dr1, dr2, dr3;
	mlib_s32 size, i, j, k, l8;

	if (!((m > 0) && (l > 0) && (n > 0))) {
		return (MLIB_FAILURE);
	}

	l8 = (l + 7) / 8;	/* row length in 8-byte units */
	size = l8 * n + 2 * l8 + 4;

	if (size <= MAX_SIZE) {
		buff_y = array;
	} else {
		buff_y = (mlib_d64 *)__mlib_malloc(size * sizeof (mlib_d64));

		if (buff_y == NULL) {
			/* out of memory: delegate to the generic path */
			return mlib_MatrixMul_type(type_U8, type_U8, mode_Sat,
			    x, y, m, l, n, n, z);
		}
	}

	buff_x = buff_y + l8 * n;
	pbuff_y = buff_y;

/* transpose y matrix */
	for (i = 0; i < n; i++) {
		mlib_u8 *py = (mlib_u8 *)y + i;
		mlib_u8 *pp = (mlib_u8 *)pbuff_y;

		/* 4 column entries at a time, each byte biased by +128 */
		for (j = 0; j <= (l - 4); j += 4) {
			((mlib_s16 *)pp)[0] =
			    ((py[0] << 8) | py[n]) ^ 0x8080;
			((mlib_s16 *)pp)[1] =
			    ((py[2 * n] << 8) | py[3 * n]) ^ 0x8080;
			py += 4 * n;
			pp += 4;
		}

		for (; j < l; j++) {
			(*pp++) = *py ^ 0x80;
			py += n;
		}

		/* zero-pad to the 8-byte row boundary */
		for (; j < 8 * l8; j++) {
			(*pp++) = 0;
		}

		pbuff_y += l8;
	}

	for (j = 0; j < m; j++) {
		/* bias correction: 128 * sum(x row) (<< 7 == *128) */
		mlib_s32 x_sum = 0;

		for (i = 0; i < l; i++) {
			x_sum += x[i];
		}

		x_sum <<= 7;

		pbuff_x = buff_x;
		pbuff_y = buff_y;

/* copy x line */
		/* expand the x row to 16-bit lanes, handling arbitrary
		 * source alignment with alignaddr/faligndata */
		px = vis_alignaddr((void *)x, 0);
		x1 = vis_ld_d64_nf(px);
		px++;
		xx = 0;

		for (i = 0; i < l8; i++) {
			x0 = x1;
			x1 = vis_ld_d64_nf(px);
			px++;
			xx = vis_faligndata(x0, x1);
			pbuff_x[2 * i] =
			    vis_fpmerge(vis_read_hi(xx), vis_fzeros());
			pbuff_x[2 * i + 1] =
			    vis_fpmerge(vis_read_lo(xx), vis_fzeros());
		}

/* loop on y lines */
		/* two y columns per iteration; py1 aliases py0 on the last
		 * odd column so the extra lane result is simply ignored */
		for (i = 0; i < n; i += 2) {
			mlib_d64 *px = pbuff_x;
			mlib_d64 *py0 = pbuff_y;
			mlib_d64 *py1 = (i + 1 < n) ? (py0 + l8) : py0;

			ds0 = ds1 = vis_fzero();
			LOAD;
			MUL;
			LOAD;
#pragma pipeloop(0)
			for (k = 0; k < l8; k++) {
				SUM;
				MUL;
				LOAD;
			}

			/* horizontal add of the four 16-bit partial sums */
			ds0 = vis_freg_pair(vis_fpadd16s(vis_read_hi(ds0),
			    vis_read_lo(ds0)),
			    vis_fpadd16s(vis_read_hi(ds1), vis_read_lo(ds1)));

			z[i] = ((mlib_s16 *)&ds0)[0] + ((mlib_s16 *)&ds0)[1] -
			    x_sum;

			if (i + 1 < n) {
				z[i + 1] =
				    ((mlib_s16 *)&ds0)[2] +
				    ((mlib_s16 *)&ds0)[3] - x_sum;
			}

			pbuff_y += 2 * l8;
		}

		z += n;
		x += l;
	}

	if (size > MAX_SIZE) {
		__mlib_free(buff_y);
	}

	return (MLIB_SUCCESS);
}
/*
 * Convert an S32 vector to S16 with saturation: values above
 * MLIB_S16_MAX clamp high, values below MLIB_S16_MIN clamp low.
 *
 * Layout of the work:
 *   - n < 16: fully scalar via the PACK_S_S macro (which presumably
 *     returns from this function — TODO confirm the macro body);
 *   - scalar prologue until dst is 8-byte aligned;
 *   - vectorized middle: vis_fpackfix packs two S32 lanes to two S16
 *     lanes with saturation; GSR scale 16 (set below) makes that a pure
 *     saturating narrow. Source may be aligned (direct loads) or not
 *     (alignaddr/faligndata path with non-faulting loads at the end);
 *   - scalar epilogue for the last rest_64 (< 4) elements.
 *
 * Always returns MLIB_SUCCESS once past the macro.
 */
mlib_status
__mlib_VectorConvert_S16_S32_Sat(
    mlib_s16 *z,
    const mlib_s32 *x,
    mlib_s32 n)
{
	mlib_s32 *src = (void *)x;
	mlib_s16 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7, d8;
	mlib_s32 c;
	mlib_s32 len_64, even_length, rest_64, length = n, i;

	if (n < 16) {
		PACK_S_S(mlib_s32, mlib_s16, MLIB_S16_MAX, MLIB_S16_MIN);
	}

/*
 * First try to align destination address for 8 bytes.
 */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = *src) > MLIB_S16_MAX ? MLIB_S16_MAX :
		    (c < MLIB_S16_MIN ? MLIB_S16_MIN : c);
		src++;
		length--;
	}

	/* GSR scale 16: fpackfix narrows S32->S16 with saturation only */
	vis_write_gsr(16 << 3);
	rest_64 = length & 3;
	len_64 = length >> 2;	/* groups of 4 elements (one output d64) */
	even_length = len_64 << 2;
	ddst = (mlib_d64 *)dst;

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */
		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d1), vis_fpackfix(d2));
		}

/*
 * Then loop with step==2.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = (*dsrc++);
			d4 = (*dsrc++);
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d1), vis_fpackfix(d2));
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d3), vis_fpackfix(d4));
		}
	} else {

/*
 * Source address is arbitrary aligned. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d4 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */
		if (i = (len_64 & 1)) {
			d1 = d4;
			d2 = (*dsrc++);
			/* non-faulting load: may touch past the vector end */
			d4 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d5 = vis_faligndata(d1, d2);
			d6 = vis_faligndata(d2, d4);
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d5), vis_fpackfix(d6));
		}

/*
 * Then loop with step==2.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d0 = d4;	/* carry the lookahead word */
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = (*dsrc++);
			d4 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d5 = vis_faligndata(d0, d1);
			d6 = vis_faligndata(d1, d2);
			d7 = vis_faligndata(d2, d3);
			d8 = vis_faligndata(d3, d4);
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d5), vis_fpackfix(d6));
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d7), vis_fpackfix(d8));
		}
	}

	/* scalar epilogue for the remaining < 4 elements */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];
		dst[even_length + i] = c > MLIB_S16_MAX ? MLIB_S16_MAX :
		    (c < MLIB_S16_MIN ? MLIB_S16_MIN : c);
	}

	return (MLIB_SUCCESS);
}
/*
 * Pre-emphasis filter for stereo S16 signals with saturation, n sample
 * pairs.  The filter state (previous sample of each channel) is read
 * from and written back to the mlib_emphasize_struct at *filter
 * (v16_last0 / v16_last1); v_alpha is the filter coefficient consumed by
 * the MLIB_MUL8 / MLIB_MIX macros.
 *
 * Structure: the per-8-byte-word math lives in MLIB_MUL8 (first word)
 * and MLIB_MIX (steady state), both operating on the rolling window
 * w_src0/w_src1/w_src2 and the "late" view w_lsrc.  The three top-level
 * branches cover: dst/src with different 8-byte phase where the source
 * window spans three words; same case spanning two words; and dst/src
 * in the same phase.  Throughout, GSR's align offset is switched between
 * t2 (source shifted one extra half-word, for w_lsrc) and t1 (for
 * w_src) before each vis_faligndata — the alternation is load-bearing.
 *
 * The w_maskand*/w_maskor* pairs splice the saved last samples from the
 * filter state into the first source word in place of out-of-range
 * history.  Partial first/last destination words go out via edge-masked
 * vis_pst_16.
 *
 * NOTE(review): the final state write-back indexes [0] of v16_last0 but
 * [1] of v16_last1 — this matches the asymmetric masks (0xFFFF vs
 * 0xFFFF0000) but is worth confirming against the struct layout.
 *
 * Returns MLIB_FAILURE on NULL/invalid arguments, MLIB_SUCCESS
 * otherwise.
 */
mlib_status
__mlib_SignalEmphasize_S16S_S16S_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
	mlib_emphasize_struct *fist = filter;
	/* keep-masks and splice values for the saved last samples */
	mlib_d64 w_maskand0 = vis_to_double(0xFFFFFFFF, 0xFFFF);
	mlib_d64 w_maskor0 = vis_freg_pair(0.f, fist->v16_last0);
	mlib_d64 w_maskand1 = vis_to_double(0xFFFFFFFF, 0xFFFF0000);
	mlib_d64 w_maskor1 = vis_freg_pair(0.f, fist->v16_last1);
	mlib_f32 v_mask = vis_to_float(0x80008000);
	mlib_f32 v_alpha = fist->v_alpha;
	mlib_s16 *fdst = dst + n + n - 1;	/* last output sample */
	mlib_d64 *dpd, *dps, *dsrct1;
	mlib_d64 w_dst, w_src, w_src0, w_src1, w_src2, w_lsrc;
	mlib_d64 dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7;
	mlib_s32 i, times, t1, t2;

/* check for obvious errors */
	if ((fist == NULL) || (n <= 0) || (src == 0) || (dst == 0) ||
	    (fist->type != MLIB_EMPH)) {
		return (MLIB_FAILURE);
	}

	vis_write_gsr(1 << 3);
	w_maskor0 = vis_fand(w_maskor0, w_maskand1);
	w_maskor1 = vis_fand(w_maskor1, w_maskand0);
	/* rotate masks/values into the source's byte phase */
	vis_alignaddr((void *)(-(mlib_addr)src), 0);
	w_maskand0 = vis_faligndata(w_maskand0, w_maskand0);
	w_maskor0 = vis_faligndata(w_maskor0, w_maskor0);
	w_maskand1 = vis_faligndata(w_maskand1, w_maskand1);
	w_maskor1 = vis_faligndata(w_maskor1, w_maskor1);
	dpd = vis_alignaddr(dst, 0);
	/* number of full 8-byte destination words between first and last */
	times = (mlib_d64 *)vis_alignaddr(fdst, 0) - dpd;
	t1 = -((mlib_addr)(dst) & 7);
	t2 = t1 - 4;	/* t1 shifted one sample pair */
	dps = vis_alignaddr((void *)src, t2);
	w_src0 = vis_ld_d64_nf(dps);
	dps++;
	w_src1 = vis_ld_d64_nf(dps);
	dps++;

	if ((((mlib_addr)dst ^ (mlib_addr)src) & 7)) {
		/* dst and src in different 8-byte phase */
		/* splice the saved last samples into the first word(s) */
		if (((mlib_addr)dps - (mlib_addr)src) >= 6) {
			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand0, w_src1);
			w_src1 = vis_for(w_maskor0, w_src1);
		}

		if (((mlib_addr)dps - (mlib_addr)src) >= 8) {
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
		} else {
			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
		}

		w_lsrc = vis_faligndata(w_src0, w_src1);
		dsrct1 = vis_alignaddr((void *)src, t1);

		if (dps - 2 != dsrct1) {
			/* window spans three source words */
			w_src2 = *dps;
			dps++;
			w_src = vis_faligndata(w_src1, w_src2);
			MLIB_MUL8;

			if ((mlib_addr)dst & 7) {
				/* partial leading destination word */
				times--;
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = *dps;
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dps++;
				MLIB_MIX;
				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			/* two priming steps fill the software pipeline */
			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);
			MLIB_MIX;
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = w_src2;
			w_src2 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src1, w_src2);
			dps++;

			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;
				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;
				w_src1 = w_src2;
				w_src2 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src1, w_src2);
				dpd++;
				dps++;
			}
		} else {
			/* window spans only two source words */
			w_src = vis_faligndata(w_src0, w_src1);
			MLIB_MUL8;

			if ((mlib_addr)dst & 7) {
				times--;
				w_src0 = w_src1;
				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;
				MLIB_MIX;
				w_dst = vis_fpackfix_pair(dr2, dr3);
				vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
				dpd++;
			}

			w_src0 = w_src1;
			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			MLIB_MIX;
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			w_src0 = w_src1;
			w_src1 = vis_ld_d64_nf(dps);
			vis_alignaddr((void *)src, t2);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			vis_alignaddr((void *)src, t1);
			w_src = vis_faligndata(w_src0, w_src1);
			dps++;

			for (i = 0; i < times; i++) {
				*dpd = w_dst;
				MLIB_MIX;
				w_dst = vis_fpackfix_pair(dr2, dr3);
				w_src0 = w_src1;
				w_src1 = vis_ld_d64_nf(dps);
				vis_alignaddr((void *)src, t2);
				w_lsrc = vis_faligndata(w_src0, w_src1);
				vis_alignaddr((void *)src, t1);
				w_src = vis_faligndata(w_src0, w_src1);
				dps++;
				dpd++;
			}
		}
	} else {
		/* dst and src share the same 8-byte phase */
		w_src = w_src1;

		if ((mlib_addr)src & 7) {
			/* unaligned start: splice state, handle the partial
			 * leading word with an edge-masked store */
			times--;

			if (((mlib_addr)src & 7) == 2) {
				w_src0 = vis_fand(w_maskand0, w_src0);
				w_src0 = vis_for(w_maskor0, w_src0);
			} else {
				w_src1 = vis_fand(w_maskand0, w_src1);
				w_src1 = vis_for(w_maskor0, w_src1);
			}

			w_src1 = vis_fand(w_maskand1, w_src1);
			w_src1 = vis_for(w_maskor1, w_src1);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			MLIB_MUL8;
			w_src0 = w_src1;
			w_src1 = *dps;
			w_src = w_src1;
			w_lsrc = vis_faligndata(w_src0, w_src1);
			dps++;
			MLIB_MIX;
			w_dst = vis_fpackfix_pair(dr2, dr3);
			vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
			dpd++;
		} else {
			/* fully aligned start: splice both last samples into
			 * the first source word */
			w_src0 = vis_fand(w_maskand0, w_src0);
			w_src0 = vis_for(w_maskor0, w_src0);
			w_src0 = vis_fand(w_maskand1, w_src0);
			w_src0 = vis_for(w_maskor1, w_src0);
			w_lsrc = vis_faligndata(w_src0, w_src1);
			MLIB_MUL8;
		}

		/* prime the pipeline, then the steady-state loop */
		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);
		MLIB_MIX;
		w_src1 = w_src;
		w_dst = vis_fpackfix_pair(dr2, dr3);
		dps++;
		w_src = vis_ld_d64_nf(dps);
		w_lsrc = vis_faligndata(w_src1, w_src);
		dps++;

		for (i = 0; i < times; i++) {
			*dpd = w_dst;
			MLIB_MIX;
			w_src1 = w_src;
			w_src = vis_ld_d64_nf(dps);
			w_lsrc = vis_faligndata(w_src1, w_src);
			w_dst = vis_fpackfix_pair(dr2, dr3);
			dps++;
			dpd++;
		}
	}

	/* trailing partial word */
	if (times >= 0) {
		vis_pst_16(w_dst, dpd, vis_edge16(dpd, fdst));
	}

	/* persist the last sample of each channel as next call's state */
	((mlib_s16 *)&fist->v16_last0)[0] = src[2 * n - 2];
	((mlib_s16 *)&fist->v16_last1)[1] = src[2 * n - 1];
	return (MLIB_SUCCESS);
}
/*
 * 5x5 convolution of an 8-bit multi-channel image, "no wrap" edges,
 * channel-masked variant: only channels selected in cmask are convolved,
 * and the 2-pixel border of dst is left untouched ("edge - no write").
 *
 * Strategy: each selected channel is gathered (stride = nchannel) into
 * five 8-byte-aligned intermediate row buffers sbuf1..sbuf5 and convolved
 * 8 pixels per iteration with VIS 8x16 partitioned multiplies.  The
 * CONV_AU / CONV_AL and SHIFT_U8_* macros are defined elsewhere in this
 * file; presumably CONV_AU/CONV_AL multiply the upper/lower half of the
 * data word by a coefficient pair and accumulate into out0/out1 via
 * tmp0/tmp1 — confirm against the macro definitions.  The loads of the
 * next source row (t1..t8) and the stores of the previous result row
 * (tt1..tt8) are software-pipelined into the two convolution loops, so
 * statement order here is deliberate.
 */
mlib_status
mlib_v_conv5x5_8nw_mask(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
    /* pointers to dst row */
    mlib_u8 *da, *d_a;

    /* pointers to src, dst data */
    mlib_u8 *adr_dst, *dend, *adr_src;

    /* pointers to src rows */
    mlib_u8 *sa, *sa2, *sa3, *sa4, *sa5, *sa6, *sa_6, *prow;

    /* pointers to rows in interm. src buf */
    mlib_u8 *buff_src, *sbuf1, *sbuf2, *sbuf3, *sbuf4, *sbuf5, *s_buf1;

    /* pointers to row in interm. dst buf */
    mlib_u8 *dbuf, *d_buf;

    /* mlib_d64 pointers to rows in interm. src buf */
    mlib_d64 *s1, *s2, *s3, *s4, *s5;

    /* mlib_d64 pointer to row in interm. dst buf */
    mlib_d64 *ddst, *ddst1;

    /* src, dst and interm. buf. strides */
    mlib_s32 dlb, slb, buf_slb;
    mlib_s32 dh, dw;

    /* out0/out1: accumulators; tmp0/tmp1 are used inside the CONV_*
     * macros; rnd is the rounding bias added before accumulation */
    mlib_d64 out0, out1, tmp0, tmp1, rnd;

    /* data */
    mlib_d64 d1, d2, d3, d4, d5, d_1, d_2, d_3, d_4, d_5;

    /* temp. data, used in faligndata */
    mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;

    /* shifted data */
    mlib_d64 d21, d22, d23, d24, d25;

    /* kernel coefficients, packed two per mlib_f32 by
     * LOAD_KERNEL_INTO_FLOAT() */
    mlib_f32 k1k2, k17k18, k19k20, k21k22, k23k24, k25;
    mlib_f32 k3k4, k5k6, k7k8, k9k10, k11k12, k13k14, k15k16;
    mlib_s32 rval, gsr_scale, i, j, nchannel, nchannel1, chan, testchan;

    /* temp, used in load-store */
    mlib_s32 t1, t2, t3, t4, t5, t6, t7, t8,
        tt1, tt2, tt3, tt4, tt5, tt6, tt7, tt8;

    adr_src = mlib_ImageGetData(src);
    adr_dst = mlib_ImageGetData(dst);
    nchannel = mlib_ImageGetChannels(src);
    slb = mlib_ImageGetStride(src);
    dlb = mlib_ImageGetStride(dst);
    dh = mlib_ImageGetHeight(dst);
    dw = mlib_ImageGetWidth(dst);

    /* buf_slb - 8-byte aligned */
    buf_slb = (dw + 16) & (~7);

    /* alloc. interm. src and dst buffer (6 rows used + slack + 8 bytes
     * for the alignment fix-up below) */
    buff_src = (mlib_u8 *)__mlib_malloc(7 * buf_slb * sizeof (mlib_u8) + 8);

    if (buff_src == NULL)
        return (MLIB_FAILURE);

    /* edge - no write: shrink work area by the 2-pixel border each side */
    dw -= 4;
    dh -= 4;

    /*
     * The 8x16 mult has built-in 8-bit R shift, and fpack16 has 7-bit
     * fixed R shift (preceded by variable-bit L shift controlled by GSR
     * scalefactor field). Thus net R shift = (8+7)-(GSR.scalefactor_field),
     * so GSR.scalefactor_field = 15-(net R shift):
     */
    gsr_scale = 31 - scalef_expon;
    /* "+ 1" sets the GSR align-offset field to one byte: every
     * vis_faligndata below shifts the row data by a single sample */
    vis_write_gsr((gsr_scale << 3) + 1);
    rval = mlib_round_8[gsr_scale];
    rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

    /* carve six 8-byte-aligned rows out of buff_src */
    sbuf1 = (mlib_u8 *)((mlib_addr)(buff_src + 8) & (~7));
    sbuf2 = sbuf1 + buf_slb;
    sbuf3 = sbuf2 + buf_slb;
    sbuf4 = sbuf3 + buf_slb;
    sbuf5 = sbuf4 + buf_slb;
    dbuf = sbuf5 + buf_slb;

    LOAD_KERNEL_INTO_FLOAT();

    testchan = 1;

    for (chan = nchannel - 1; chan >= 0; chan--) {
        /* skip channels not selected in cmask */
        if ((cmask & testchan) == 0) {
            testchan <<= 1;
            continue;
        }

        testchan <<= 1;
        sa = adr_src + chan;
        sa2 = sa + slb;
        sa3 = sa2 + slb;
        sa4 = sa3 + slb;
        sa5 = sa4 + slb;
        sa_6 = sa6 = sa5 + slb;
        /* first writable dst sample: 2 rows down, 2 pixels in */
        d_a = adr_dst + (dlb << 1) + (nchannel << 1) + chan;

        /*
         * load interm. src buff; note the rotated order: sbuf1 receives
         * row 5 so that the buffer rotation at the top of the j-loop
         * lines the rows up as sbuf1..sbuf5 = rows 1..5.
         */
        for (i = 0, j = 0; j < (dw + 4); i += nchannel, j++) {
            sbuf1[j] = sa5[i];
            sbuf2[j] = sa[i];
            sbuf3[j] = sa2[i];
            sbuf4[j] = sa3[i];
            sbuf5[j] = sa4[i];
        }

        for (j = 0; j < dh - 1; j++) {
            ddst1 = ddst = (mlib_d64 *)(dbuf);
            /* d_buf trails the packed results written via ddst1 by 8
             * bytes, i.e. by exactly one loop iteration */
            d_buf = (dbuf - 8);
            da = d_a;
            dend = da + (dw - 1) * nchannel;
            /* rotate the five row buffers by one scan line */
            prow = sbuf1;
            sbuf1 = sbuf2;
            sbuf2 = sbuf3;
            sbuf3 = sbuf4;
            sbuf4 = sbuf5;
            sbuf5 = prow;
            s1 = (mlib_d64 *)sbuf1;
            s2 = (mlib_d64 *)sbuf2;
            s3 = (mlib_d64 *)sbuf3;
            s4 = (mlib_d64 *)sbuf4;
            s5 = (mlib_d64 *)sbuf5;
            /* next source row is stored into sbuf1; the reads through
             * s1 stay one 8-byte word ahead of these writes */
            s_buf1 = sbuf1;
            d1 = *s1;
            d2 = *s2;
            d3 = *s3;
            /* 0 for the priming iteration: dst stores do not advance
             * until a real result is available (see second loop) */
            nchannel1 = 0;

/*
 * first pass: kernel rows 1-3 for 8 pixels per iteration, interleaved
 * with loading the next source row (t1..t8) into s_buf1
 */
#pragma pipeloop(0)
            for (i = 0; i < dw; i += 8) {
                d_1 = *(s1 + 1);
                d_2 = *(s2 + 1);
                d_3 = *(s3 + 1);
                out0 = out1 = rnd;
                t1 = vis_ld_u8_nf(sa_6);
                sa_6 += nchannel;
                CONV_AU(d1, k1k2);
                t2 = vis_ld_u8_nf(sa_6);
                sa_6 += nchannel;
                CONV_AL(d2, k5k6);
                t3 = vis_ld_u8_nf(sa_6);
                sa_6 += nchannel;
                CONV_AU(d3, k11k12);
                t4 = vis_ld_u8_nf(sa_6);
                sa_6 += nchannel;
                /* d2x = data shifted 1 byte left; dt_x keeps the byte
                 * that wrapped, for the 2nd shift below */
                d21 = vis_faligndata(d1, d_1);
                dt_1 = vis_faligndata(d_1, d1);
                t5 = vis_ld_u8_nf(sa_6);
                sa_6 += nchannel;
                d22 = vis_faligndata(d2, d_2);
                dt_2 = vis_faligndata(d_2, d2);
                t6 = vis_ld_u8_nf(sa_6);
                sa_6 += nchannel;
                d23 = vis_faligndata(d3, d_3);
                t7 = vis_ld_u8_nf(sa_6);
                sa_6 += nchannel;
                dt_3 = vis_faligndata(d_3, d3);
                t8 = vis_ld_u8_nf(sa_6);
                sa_6 += nchannel;
                CONV_AL(d21, k1k2);
                (*s_buf1++) = t1;
                CONV_AU(d22, k7k8);
                (*s_buf1++) = t2;
                CONV_AL(d23, k11k12);
                (*s_buf1++) = t3;
                SHIFT_U8_1;
                CONV_AU(d21, k3k4);
                (*s_buf1++) = t4;
                CONV_AL(d22, k7k8);
                CONV_AU(d23, k13k14);
                d21 = vis_faligndata(d21, dt_1);
                d22 = vis_faligndata(d22, dt_2);
                (*s_buf1++) = t5;
                d23 = vis_faligndata(d23, dt_3);
                CONV_AL(d21, k3k4);
                (*s_buf1++) = t6;
                CONV_AU(d22, k9k10);
                (*s_buf1++) = t7;
                CONV_AL(d23, k13k14);
                /* kernel column 5: data shifted 4 bytes */
                d21 = vis_freg_pair(vis_read_lo(d1), vis_read_hi(d_1));
                CONV_AU(d21, k5k6);
                d22 = vis_freg_pair(vis_read_lo(d2), vis_read_hi(d_2));
                CONV_AL(d22, k9k10);
                d23 = vis_freg_pair(vis_read_lo(d3), vis_read_hi(d_3));
                CONV_AU(d23, k15k16);
                (*s_buf1++) = t8;
                /* park the partial 16-bit accumulators in dbuf */
                ddst[0] = out0;
                ddst[1] = out1;
                ddst += 2;
                d1 = d_1;
                d2 = d_2;
                d3 = d_3;
                s1++;
                s2++;
                s3++;
            }

            ddst = (mlib_d64 *)(dbuf);
            d4 = *s4;
            d5 = *s5;

/*
 * second pass: kernel rows 4-5; in each iteration store result from
 * prev. iterat. (tt1..tt8, read from d_buf which trails ddst1) and
 * finish the current 8 pixels
 */
#pragma pipeloop(0)
            for (i = 0; i < dw; i += 8) {
                d_4 = *(s4 + 1);
                d_5 = *(s5 + 1);
                out0 = ddst[0];
                out1 = ddst[1];
                ddst += 2;
                tt1 = (*d_buf++);
                CONV_AL(d4, k15k16);
                tt2 = (*d_buf++);
                CONV_AU(d5, k21k22);
                d24 = vis_faligndata(d4, d_4);
                tt3 = (*d_buf++);
                dt_4 = vis_faligndata(d_4, d4);
                d25 = vis_faligndata(d5, d_5);
                tt4 = (*d_buf++);
                dt_5 = vis_faligndata(d_5, d5);
                tt5 = (*d_buf++);
                CONV_AU(d24, k17k18);
                tt6 = (*d_buf++);
                CONV_AL(d25, k21k22);
                tt7 = (*d_buf++);
                SHIFT_U8_2;
                tt8 = (*d_buf++);
                CONV_AL(d24, k17k18);
                /* on the priming iteration nchannel1 == 0, so the eight
                 * (garbage) bytes read from before dbuf all land on the
                 * same pixel and are overwritten by the first real
                 * result on the next iteration */
                *da = tt1;
                da += nchannel1;
                CONV_AU(d25, k23k24);
                *da = tt2;
                da += nchannel1;
                d24 = vis_faligndata(d24, dt_4);
                *da = tt3;
                da += nchannel1;
                d25 = vis_faligndata(d25, dt_5);
                *da = tt4;
                da += nchannel1;
                CONV_AU(d24, k19k20);
                *da = tt5;
                da += nchannel1;
                CONV_AL(d25, k23k24);
                *da = tt6;
                da += nchannel1;
                d24 = vis_freg_pair(vis_read_lo(d4), vis_read_hi(d_4));
                CONV_AL(d24, k19k20);
                *da = tt7;
                da += nchannel1;
                d25 = vis_freg_pair(vis_read_lo(d5), vis_read_hi(d_5));
                CONV_AU(d25, k25);
                *da = tt8;
                da += nchannel1;
                /* pack 8 results to bytes over the consumed accums */
                (*ddst1++) = vis_fpack16_pair(out0, out1);
                d4 = d_4;
                d5 = d_5;
                s4++;
                s5++;
                nchannel1 = nchannel;
            }

            /* drain: last 8 samples of the next source row */
            (*s_buf1++) = vis_ld_u8_nf(sa_6);
            sa_6 += nchannel;
            (*s_buf1++) = vis_ld_u8_nf(sa_6);
            sa_6 += nchannel;
            (*s_buf1++) = vis_ld_u8_nf(sa_6);
            sa_6 += nchannel;
            (*s_buf1++) = vis_ld_u8_nf(sa_6);
            sa_6 += nchannel;
            (*s_buf1++) = vis_ld_u8_nf(sa_6);
            sa_6 += nchannel;
            (*s_buf1++) = vis_ld_u8_nf(sa_6);
            sa_6 += nchannel;
            (*s_buf1++) = vis_ld_u8_nf(sa_6);
            sa_6 += nchannel;
            (*s_buf1++) = vis_ld_u8_nf(sa_6);

            /* drain: last (up to 8) result pixels of this row */
            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = (*d_buf++);
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = (*d_buf++);
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = (*d_buf++);
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = (*d_buf++);
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = (*d_buf++);
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = (*d_buf++);
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = (*d_buf++);
                da += nchannel;
            }

            if ((mlib_addr)da <= (mlib_addr)dend) {
                *da = (*d_buf++);
            }

            sa_6 = sa6 = sa6 + slb;
            d_a += dlb;
        }

        /* process last row - no need to load data */
        ddst1 = ddst = (mlib_d64 *)(dbuf);
        d_buf = (dbuf - 8);
        da = d_a;
        dend = da + (dw - 1) * nchannel;
        prow = sbuf1;
        sbuf1 = sbuf2;
        sbuf2 = sbuf3;
        sbuf3 = sbuf4;
        sbuf4 = sbuf5;
        sbuf5 = prow;
        s1 = (mlib_d64 *)sbuf1;
        s2 = (mlib_d64 *)sbuf2;
        s3 = (mlib_d64 *)sbuf3;
        s4 = (mlib_d64 *)sbuf4;
        s5 = (mlib_d64 *)sbuf5;
        d1 = *s1;
        d2 = *s2;
        d3 = *s3;
        nchannel1 = 0;

/* first pass, kernel rows 1-3 (no next-row prefetch this time) */
#pragma pipeloop(0)
        for (i = 0; i < dw; i += 8) {
            d_1 = *(s1 + 1);
            d_2 = *(s2 + 1);
            d_3 = *(s3 + 1);
            out0 = out1 = rnd;
            CONV_AU(d1, k1k2);
            CONV_AL(d2, k5k6);
            CONV_AU(d3, k11k12);
            d21 = vis_faligndata(d1, d_1);
            dt_1 = vis_faligndata(d_1, d1);
            d22 = vis_faligndata(d2, d_2);
            dt_2 = vis_faligndata(d_2, d2);
            d23 = vis_faligndata(d3, d_3);
            dt_3 = vis_faligndata(d_3, d3);
            CONV_AL(d21, k1k2);
            CONV_AU(d22, k7k8);
            CONV_AL(d23, k11k12);
            SHIFT_U8_1;
            CONV_AU(d21, k3k4);
            CONV_AL(d22, k7k8);
            CONV_AU(d23, k13k14);
            d21 = vis_faligndata(d21, dt_1);
            d22 = vis_faligndata(d22, dt_2);
            d23 = vis_faligndata(d23, dt_3);
            CONV_AL(d21, k3k4);
            CONV_AU(d22, k9k10);
            CONV_AL(d23, k13k14);
            d21 = vis_freg_pair(vis_read_lo(d1), vis_read_hi(d_1));
            CONV_AU(d21, k5k6);
            d22 = vis_freg_pair(vis_read_lo(d2), vis_read_hi(d_2));
            CONV_AL(d22, k9k10);
            d23 = vis_freg_pair(vis_read_lo(d3), vis_read_hi(d_3));
            CONV_AU(d23, k15k16);
            ddst[0] = out0;
            ddst[1] = out1;
            ddst += 2;
            d1 = d_1;
            d2 = d_2;
            d3 = d_3;
            s1++;
            s2++;
            s3++;
        }

        ddst = (mlib_d64 *)(dbuf);
        d4 = *s4;
        d5 = *s5;

/*
 * in each iteration store result from prev. iterat.
 * and load data for processing next row
 */
#pragma pipeloop(0)
        for (i = 0; i < dw; i += 8) {
            d_4 = *(s4 + 1);
            d_5 = *(s5 + 1);
            out0 = ddst[0];
            out1 = ddst[1];
            ddst += 2;
            tt1 = (*d_buf++);
            CONV_AL(d4, k15k16);
            tt2 = (*d_buf++);
            CONV_AU(d5, k21k22);
            d24 = vis_faligndata(d4, d_4);
            tt3 = (*d_buf++);
            dt_4 = vis_faligndata(d_4, d4);
            d25 = vis_faligndata(d5, d_5);
            tt4 = (*d_buf++);
            dt_5 = vis_faligndata(d_5, d5);
            tt5 = (*d_buf++);
            CONV_AU(d24, k17k18);
            tt6 = (*d_buf++);
            CONV_AL(d25, k21k22);
            tt7 = (*d_buf++);
            SHIFT_U8_2;
            tt8 = (*d_buf++);
            CONV_AL(d24, k17k18);
            *da = tt1;
            da += nchannel1;
            CONV_AU(d25, k23k24);
            *da = tt2;
            da += nchannel1;
            d24 = vis_faligndata(d24, dt_4);
            *da = tt3;
            da += nchannel1;
            d25 = vis_faligndata(d25, dt_5);
            *da = tt4;
            da += nchannel1;
            CONV_AU(d24, k19k20);
            *da = tt5;
            da += nchannel1;
            CONV_AL(d25, k23k24);
            *da = tt6;
            da += nchannel1;
            d24 = vis_freg_pair(vis_read_lo(d4), vis_read_hi(d_4));
            CONV_AL(d24, k19k20);
            *da = tt7;
            da += nchannel1;
            d25 = vis_freg_pair(vis_read_lo(d5), vis_read_hi(d_5));
            CONV_AU(d25, k25);
            *da = tt8;
            da += nchannel1;
            (*ddst1++) = vis_fpack16_pair(out0, out1);
            d4 = d_4;
            d5 = d_5;
            s4++;
            s5++;
            nchannel1 = nchannel;
        }

        /* drain the last results of the final row */
        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = (*d_buf++);
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = (*d_buf++);
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = (*d_buf++);
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = (*d_buf++);
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = (*d_buf++);
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = (*d_buf++);
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = (*d_buf++);
            da += nchannel;
        }

        if ((mlib_addr)da <= (mlib_addr)dend) {
            *da = (*d_buf++);
        }
    }

    __mlib_free(buff_src);

    return (MLIB_SUCCESS);
}
/*
 * 5x5 convolution of an 8-bit 4-channel image, "no wrap" edges
 * (the 2-pixel border of dst is not written).
 *
 * Unlike the channel-masked variant, the 4 channels stay interleaved:
 * rows are copied into 8-byte-aligned intermediate buffers by the
 * LOAD_LINE_* macros and processed 8 bytes (two 4-channel pixels) per
 * iteration.  Horizontal kernel taps are produced by vis_faligndata with
 * a GSR align offset of 4, i.e. a shift of one whole pixel.  The CONV_AU
 * / CONV_AL macros (defined elsewhere in this file) presumably multiply
 * the upper/lower data half by a packed coefficient pair and accumulate
 * into out0/out1 via tmp0/tmp1 — confirm against the macro definitions.
 * GET_SRC_DST_PARAMETERS, PREPARE_INTERM_BUFFERS, LOOP_INI and the
 * COPY_* macros supply the remaining plumbing and use the dsa/dp/sd0/
 * sd1/emask locals declared below.
 */
mlib_status
mlib_v_conv5x5_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
    /* pointers to dst row */
    mlib_u8 *da, *d_a;

    /* pointers to src, dst data */
    mlib_u8 *adr_dst, *adr_src, *dend;

    /* pointers to src rows */
    mlib_u8 *sa, *sa1, *sa2, *sa3, *sa4;

    /* pointers to rows in interm. src buf */
    mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

    /* pointers to rows in interm. src buf */
    mlib_d64 *sbuf3, *sbuf4, *sbuf5;

    /* pointer to row in interm. dst buf */
    mlib_d64 *dbuf, *dbuf1;

    /* mlib_d64 pointers to rows in interm. src buf */
    mlib_d64 *s1, *s2, *s3, *s4, *s5;

    /* mlib_d64 pointer to row in interm. dst buf */
    mlib_d64 *ddst;

    /* current data words, kernel rows 1-5 */
    mlib_d64 d1, d2, d3, d4, d5;

    /* next data words (one mlib_d64 ahead) */
    mlib_d64 d11, d12, d13, d14, d15;

    /* data words two mlib_d64 ahead */
    mlib_d64 d21, d22, d23, d24, d25;

    /* pixel-shifted data, produced by faligndata */
    mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;

    /* kernel coefficients packed two per mlib_f32 */
    mlib_f32 k1k2, k3k4, k5k6, k7k8;
    mlib_f32 k9k10, k11k12, k13k14, k15k16;
    mlib_f32 k17k18, k19k20, k21k22, k23k24, k25;

    /* src, dst and interm. buf. strides */
    mlib_s32 dlb, slb, buf_slb;
    mlib_s32 dh, dw;
    mlib_d64 out0, out1;
    /* tmp0/tmp1 are used inside the CONV_* macros; rnd = rounding bias */
    mlib_d64 tmp0, tmp1, rnd;
    /* dsa, dp, sd0, sd1, emask: used by the LOAD_LINE_* / COPY_* macros */
    mlib_d64 *dsa, *dp;
    mlib_d64 sd0, sd1;
    mlib_s32 emask;
    mlib_s32 rval, gsr_scale, i, j;

    /* scale factor for fpack16; see the shift-accounting comment in the
     * masked variant above */
    gsr_scale = 31 - scalef_expon;
    vis_write_gsr((gsr_scale << 3));
    rval = mlib_round_8[gsr_scale];
    rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

    GET_SRC_DST_PARAMETERS();
    LOAD_KERNEL_INTO_FLOAT();

    /* 4 bytes per pixel; >> 3 suggests buf_slb counts mlib_d64 units —
     * verify against PREPARE_INTERM_BUFFERS */
    buf_slb = (4 * dw + 24) >> 3;
    PREPARE_INTERM_BUFFERS();

    /* edge - no write; after "dw *= 4" dw counts bytes, not pixels */
    dw -= 4;
    dw *= 4;
    dh -= 4;

    sa = adr_src;
    sa1 = sa + slb;
    sa2 = sa1 + slb;
    sa3 = sa2 + slb;
    sa4 = sa3 + slb;
    /* first writable dst pixel: 2 rows down, 2 pixels (8 bytes) in */
    d_a = adr_dst + 2 * dlb + 8;

    /* load interm. src buff */
    PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
    LOAD_LINE_INTO_BUFFER(16);

    /* load interm. src buff */
    PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
    LOAD_LINE_INTO_BUFFER(16);

    /* load interm. src buff */
    PREPARE_TO_LOAD_LINE(sbuf4, sa2);
#pragma pipeloop(0)
    LOAD_LINE_INTO_BUFFER(16);

    /* load interm. src buff */
    PREPARE_TO_LOAD_LINE(sbuf5, sa3);
#pragma pipeloop(0)
    LOAD_LINE_INTO_BUFFER(16);

#pragma pipeloop(0)
    for (j = 0; j < dh; j++) {
        /* LOOP_INI presumably rotates the row buffers and resets the
         * s1..s5 / ddst pointers — confirm against macro definition */
        LOOP_INI();

        /* prefetch the next source row into the recycled buffer */
        PREPARE_TO_LOAD_LINE(sbuf5, sa4);
#pragma pipeloop(0)
        LOAD_LINE_INTO_BUFFER_NF(16);

        /* GSR align offset = 4: faligndata shifts by one 4-byte pixel */
        vis_alignaddr(s1, 4);
        dbuf1 = dbuf;
        d1 = *s1;
        d2 = *s2;
        d3 = *s3;
        d11 = *(s1 + 1);
        d12 = *(s2 + 1);
        d13 = *(s3 + 1);

/* first pass: kernel rows 1-3, partial sums parked in dbuf */
#pragma pipeloop(0)
        for (i = 0; i < dw; i += 8) {
            d21 = *(s1 + 2);
            d22 = *(s2 + 2);
            d23 = *(s3 + 2);
            out0 = out1 = rnd;
            CONV_AU(d1, k1k2);
            CONV_AL(d2, k5k6);
            CONV_AU(d3, k11k12);
            /* taps at +1 pixel */
            dt_1 = vis_faligndata(d1, d11);
            dt_2 = vis_faligndata(d2, d12);
            dt_3 = vis_faligndata(d3, d13);
            CONV_AL(dt_1, k1k2);
            CONV_AU(dt_2, k7k8);
            CONV_AL(dt_3, k11k12);
            /* taps at +2 pixels (next d64 word) */
            CONV_AU(d11, k3k4);
            CONV_AL(d12, k7k8);
            CONV_AU(d13, k13k14);
            /* taps at +3 pixels */
            dt_1 = vis_faligndata(d11, d21);
            dt_2 = vis_faligndata(d12, d22);
            dt_3 = vis_faligndata(d13, d23);
            CONV_AL(dt_1, k3k4);
            CONV_AU(dt_2, k9k10);
            CONV_AL(dt_3, k13k14);
            /* taps at +4 pixels */
            CONV_AU(d21, k5k6);
            CONV_AL(d22, k9k10);
            CONV_AU(d23, k15k16);
            dbuf1[0] = out0;
            dbuf1[1] = out1;
            dbuf1 += 2;
            d1 = d11;
            d2 = d12;
            d3 = d13;
            d11 = d21;
            d12 = d22;
            d13 = d23;
            s1++;
            s2++;
            s3++;
        }

        dbuf1 = dbuf;
        d4 = *s4;
        d5 = *s5;
        d14 = *(s4 + 1);
        d15 = *(s5 + 1);

/* second pass: kernel rows 4-5, then pack the finished sums */
#pragma pipeloop(0)
        for (i = 0; i < dw; i += 8) {
            d24 = *(s4 + 2);
            d25 = *(s5 + 2);
            /* resume from the partial sums of the first pass */
            out0 = dbuf1[0];
            out1 = dbuf1[1];
            CONV_AL(d4, k15k16);
            CONV_AU(d5, k21k22);
            dt_4 = vis_faligndata(d4, d14);
            dt_5 = vis_faligndata(d5, d15);
            CONV_AU(dt_4, k17k18);
            CONV_AL(dt_5, k21k22);
            CONV_AL(d14, k17k18);
            CONV_AU(d15, k23k24);
            dt_4 = vis_faligndata(d14, d24);
            dt_5 = vis_faligndata(d15, d25);
            CONV_AU(dt_4, k19k20);
            CONV_AL(dt_5, k23k24);
            CONV_AL(d24, k19k20);
            CONV_AU(d25, k25);
            dbuf1 += 2;
            (*ddst++) = vis_fpack16_pair(out0, out1);
            d4 = d14;
            d5 = d15;
            d14 = d24;
            d15 = d25;
            s4++;
            s5++;
        }

        /* copy the packed row from the intermediate buffer to dst */
        PREPARE_TO_COPY_INTERM_BUF_TO_DST();
#pragma pipeloop(0)
        COPY_INTERM_BUF_TO_DST();
        COPY_TAIL();
        sa4 = sa4 + slb;
        d_a += dlb;
    }

    __mlib_free(buff_src);

    return (MLIB_SUCCESS);
}
DIV_ALPHA_3CH(); } if (i < ww) { GET_ALPHA_3CH_2_NF(); DIV_ALPHA_3CH_NF(); } } } else { /* if (channel == 2) */ #pragma pipeloop(0) for (i = 0; i < ww; i++) { ss = *sp; a0 = vis_freg_pair(*(mlib_f32 *)(p_tbl + ap[0]), *(mlib_f32 *)(p_tbl + vis_ld_u8_nf(ap + 2))); a1 = vis_freg_pair(*(mlib_f32 *)(p_tbl + vis_ld_u8_nf(ap + 4)), *(mlib_f32 *)(p_tbl + vis_ld_u8_nf(ap + 6))); DIV_ALPHA(d0, vis_read_hi(ss), a0); DIV_ALPHA(d1, vis_read_lo(ss), a1); *dp = vis_fpack16_pair(d0, d1); ap += 8; sp++; dp++; } } if (dflag) {