void ADD_SUFF(ByteGrayToIntArgbConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 d0, d1, d2, d3; mlib_f32 ff, aa = vis_fones(); mlib_s32 i, j, x; if (width < 8) { for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; for (i = 0; i < width; i++) { x = src[i]; dst[i] = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } return; } if (srcScan == width && dstScan == 4*width) { width *= height; height = 1; } for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; mlib_s32 *dst_end; dst_end = dst + width; while (((mlib_s32)src & 3) && dst < dst_end) { x = *src++; *dst++ = Gray2Argb(x); } #pragma pipeloop(0) for (; dst <= (dst_end - 4); dst += 4) { ff = *(mlib_f32*)src; d0 = vis_fpmerge(aa, ff); d1 = vis_fpmerge(ff, ff); d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1)); d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1)); ((mlib_f32*)dst)[0] = vis_read_hi(d2); ((mlib_f32*)dst)[1] = vis_read_lo(d2); ((mlib_f32*)dst)[2] = vis_read_hi(d3); ((mlib_f32*)dst)[3] = vis_read_lo(d3); src += 4; } while (dst < dst_end) { x = *src++; *dst++ = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); PTR_ADD(srcBase, srcScan); } }
void ADD_SUFF(ByteGrayToIntArgbScaleConvert)(SCALE_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 d0, d1, d2, d3, dd; mlib_f32 ff, aa = vis_fones(); mlib_s32 i, j, x; if (width < 16) { for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; mlib_s32 tmpsxloc = sxloc; PTR_ADD(src, (syloc >> shift) * srcScan); for (i = 0; i < width; i++) { x = src[tmpsxloc >> shift]; tmpsxloc += sxinc; dst[i] = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); syloc += syinc; } return; } vis_alignaddr(NULL, 7); for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; mlib_s32 *dst_end; mlib_s32 tmpsxloc = sxloc; PTR_ADD(src, (syloc >> shift) * srcScan); dst_end = dst + width; #pragma pipeloop(0) for (; dst <= (dst_end - 4); dst += 4) { LOAD_NEXT_U8(dd, src + ((tmpsxloc + 3*sxinc) >> shift)); LOAD_NEXT_U8(dd, src + ((tmpsxloc + 2*sxinc) >> shift)); LOAD_NEXT_U8(dd, src + ((tmpsxloc + sxinc) >> shift)); LOAD_NEXT_U8(dd, src + ((tmpsxloc ) >> shift)); tmpsxloc += 4*sxinc; ff = vis_read_hi(dd); d0 = vis_fpmerge(aa, ff); d1 = vis_fpmerge(ff, ff); d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1)); d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1)); ((mlib_f32*)dst)[0] = vis_read_hi(d2); ((mlib_f32*)dst)[1] = vis_read_lo(d2); ((mlib_f32*)dst)[2] = vis_read_hi(d3); ((mlib_f32*)dst)[3] = vis_read_lo(d3); } while (dst < dst_end) { x = src[tmpsxloc >> shift]; tmpsxloc += sxinc; *dst++ = Gray2Argb(x); } PTR_ADD(dstBase, dstScan); syloc += syinc; } }
/*
 * __mlib_VideoH263OverlappedMC_S16_U8
 *
 * Builds the 64-entry 16-bit block mc_block from five regions of ref_frame
 * and always returns MLIB_SUCCESS.
 *
 *   mc_block    output; written as 16 mlib_d64 words, two words per
 *               output row, eight rows (dp[0] .. dp[15])
 *   ref_frame   base of the 8-bit reference data
 *   mch, mcv    horizontal / vertical offset of the central region
 *   mah, mav    offset of the "above" region
 *   mbh, mbv    offset of the "below" region
 *   mlh, mlv    offset of the "left" region
 *   mrh, mrv    offset of the "right" region
 *   ref_stride  byte distance between adjacent rows of ref_frame
 *
 * Processing order:
 *   1. central: d0..d15 are initialised from 8 rows x 16 bytes (ACCSET)
 *   2. left:    added into the first word of every row (d0, d2, .. d14)
 *   3. right:   added into the second word of every row (d1, d3, .. d15)
 *   4. above:   combined with rows 0..3 and stored to dp[0..7]  (ACCPUT)
 *   5. below:   combined with rows 4..7 and stored to dp[8..15] (ACCPUT)
 *
 * Every region is walked in steps of 2 * ref_stride; the right region
 * starts 8 bytes to the right and the below region 8 rows down.
 * NOTE(review): this looks like sampling every second byte / row of a
 * 2x-interpolated reference frame (half-pel addressing) -- confirm against
 * the ACC* macro definitions and the API documentation.
 *
 * NOTE(review): ACCSET / ACCADD / ACCPUT are defined elsewhere; from their
 * use here they presumably (a) set an accumulator to weight * pixels,
 * (b) add weight * pixels to it, and (c) add, round/scale and store.
 * The reg_H* constants hold one weight byte per lane (0x40/0x50/0x60 for
 * central, 0x10/0x20 for the neighbours), which matches the H.263
 * overlapped-MC weighting matrices scaled by 16 -- TODO confirm.
 *
 * Sub-8-byte source alignment is handled two ways: vis_alignaddr +
 * vis_faligndata (central, right, below) and vis_write_bmask +
 * vis_bshuffle (left, above).  The trailing word of the last row of each
 * region is fetched with vis_ld_d64_nf, presumably because it may lie
 * beyond the readable end of the buffer.
 */
mlib_status __mlib_VideoH263OverlappedMC_S16_U8(
	mlib_s16 mc_block[64],
	const mlib_u8 *ref_frame,
	mlib_s32 mch,
	mlib_s32 mcv,
	mlib_s32 mah,
	mlib_s32 mav,
	mlib_s32 mbh,
	mlib_s32 mbv,
	mlib_s32 mlh,
	mlib_s32 mlv,
	mlib_s32 mrh,
	mlib_s32 mrv,
	mlib_s32 ref_stride) {
	/* d(2r) / d(2r+1): accumulators for the two words of output row r */
	mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9;
	mlib_d64 d10, d11, d12, d13, d14, d15;
	mlib_d64 tmp1, tmp2, tmp3;
	/*
	 * dmask, denom and frnd are not referenced directly in this body;
	 * presumably they are consumed by the ACC* macros -- confirm.
	 */
	mlib_d64 dmask = vis_fexpand(vis_fones());
	mlib_d64 denom = vis_fandnot(dmask, vis_fpadd16(dmask, dmask));
	/* reg_H0_*: central weights, reg_H1_*: above/below, reg_H2_*: left/right */
	mlib_f32 reg_H0_00, reg_H0_01, reg_H0_10, reg_H0_20, reg_H0_21;
	mlib_f32 reg_H1_00, reg_H1_10, reg_H1_11, reg_H1_20, reg_H2_00;
	mlib_f32 reg_H2_01, reg_H2_10, reg_H2_11;
	mlib_f32 frnd;
	mlib_d64 *dp, *sd;
	const mlib_u8 *sp1, *sp2, *sp3, *sp4, *sp5;
	mlib_s32 ref_stride2 = ref_stride << 1, off;

	/* Start addresses: central, above, left, right (+8 bytes), below (+8 rows) */
	sp1 = (ref_frame + mch + mcv * ref_stride);
	sp2 = (ref_frame + mah + mav * ref_stride);
	sp3 = (ref_frame + mlh + mlv * ref_stride);
	sp4 = (ref_frame + mrh + 8 + mrv * ref_stride);
	sp5 = (ref_frame + mbh + (mbv + 8) * ref_stride);
	dp = (mlib_d64 *)mc_block;

	/* Central weights: 4/5/6 per lane, stored in the high nibble of each byte */
	reg_H0_00 = vis_to_float(0x40505050);
	reg_H0_01 = vis_to_float(0x50505040);
	reg_H0_10 = vis_to_float(0x50505050);
	reg_H0_20 = vis_to_float(0x50506060);
	reg_H0_21 = vis_to_float(0x60605050);
	frnd = vis_to_float(0x20202020);

	/*
	 * Central region: 8 rows, 16 source bytes per row, realigned with
	 * vis_faligndata.  Initialises all sixteen accumulators.
	 */
	/* row 0 */
	sd = (mlib_d64 *)vis_alignaddr((void *)sp1, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d0, tmp1, reg_H0_00);
	ACCSET(d1, tmp2, reg_H0_01);

	/* row 1 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d2, tmp1, reg_H0_10);
	ACCSET(d3, tmp2, reg_H0_10);

	/* row 2 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d4, tmp1, reg_H0_20);
	ACCSET(d5, tmp2, reg_H0_21);

	/* row 3 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d6, tmp1, reg_H0_20);
	ACCSET(d7, tmp2, reg_H0_21);

	/* row 4 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d8, tmp1, reg_H0_20);
	ACCSET(d9, tmp2, reg_H0_21);

	/* row 5 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d10, tmp1, reg_H0_20);
	ACCSET(d11, tmp2, reg_H0_21);

	/* row 6 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d12, tmp1, reg_H0_10);
	ACCSET(d13, tmp2, reg_H0_10);

	/* row 7: trailing word fetched with the non-faulting load */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = vis_ld_d64_nf(sd + 2);
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d14, tmp1, reg_H0_00);
	ACCSET(d15, tmp2, reg_H0_01);

	/*
	 * Left region: 8 rows, 8 source bytes per row, added into the first
	 * word of each row.  Alignment is done with vis_bshuffle: adding
	 * off * 0x11111111 to 0x01234567 turns the mask nibbles into
	 * off, off+1, .. off+7, i.e. eight consecutive bytes starting at
	 * byte `off` of the tmp1:tmp2 pair.
	 */
	reg_H2_00 = vis_to_float(0x20101010);
	reg_H2_01 = vis_to_float(0x10101020);
	reg_H2_10 = vis_to_float(0x20201010);
	reg_H2_11 = vis_to_float(0x10102020);
	off = (mlib_addr)sp3 & 7;
	sd = (mlib_d64 *)((mlib_u8 *)sp3 - off);
	vis_write_bmask(0x11111111 * off + 0x01234567, 0);

	/* row 0 */
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d0, tmp1, reg_H2_00);

	/* row 1 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d2, tmp1, reg_H2_10);

	/* row 2 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d4, tmp1, reg_H2_10);

	/* row 3 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d6, tmp1, reg_H2_10);

	/* row 4 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d8, tmp1, reg_H2_10);

	/* row 5 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d10, tmp1, reg_H2_10);

	/* row 6 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d12, tmp1, reg_H2_10);

	/* row 7: trailing word fetched with the non-faulting load */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = vis_ld_d64_nf(sd + 1);
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d14, tmp1, reg_H2_00);

	/*
	 * Right region: 8 rows, 8 source bytes per row, added into the
	 * second word of each row.  Realigned with vis_faligndata.
	 */
	/* row 0 */
	sd = (mlib_d64 *)vis_alignaddr((void *)sp4, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d1, tmp1, reg_H2_01);

	/* row 1 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d3, tmp1, reg_H2_11);

	/* row 2 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d5, tmp1, reg_H2_11);

	/* row 3 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d7, tmp1, reg_H2_11);

	/* row 4 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d9, tmp1, reg_H2_11);

	/* row 5 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d11, tmp1, reg_H2_11);

	/* row 6 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d13, tmp1, reg_H2_11);

	/* row 7: trailing word fetched with the non-faulting load */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = vis_ld_d64_nf(sd + 1);
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d15, tmp1, reg_H2_01);

	/*
	 * Above region: 4 rows, 16 source bytes per row, combined with the
	 * accumulators of output rows 0..3 and stored to dp[0..7].  Uses
	 * vis_bshuffle with a byte mask rebuilt for sp2's alignment.
	 */
	reg_H1_10 = vis_to_float(0x10102020);
	reg_H1_11 = vis_to_float(0x20201010);
	reg_H1_20 = vis_to_float(0x10101010);
	off = (mlib_addr)sp2 & 7;
	sd = (mlib_d64 *)((mlib_u8 *)sp2 - off);
	vis_write_bmask(0x11111111 * off + 0x01234567, 0);
	reg_H1_00 = vis_to_float(0x20202020);

	/* row 0 */
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[0], d0, tmp1, reg_H1_00);
	ACCPUT(dp[1], d1, tmp2, reg_H1_00);

	/* row 1 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[2], d2, tmp1, reg_H1_10);
	ACCPUT(dp[3], d3, tmp2, reg_H1_11);

	/* row 2 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[4], d4, tmp1, reg_H1_20);
	ACCPUT(dp[5], d5, tmp2, reg_H1_20);

	/* row 3: trailing word fetched with the non-faulting load */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = vis_ld_d64_nf(sd + 2);
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[6], d6, tmp1, reg_H1_20);
	ACCPUT(dp[7], d7, tmp2, reg_H1_20);

	/*
	 * Below region: 4 rows, 16 source bytes per row, combined with the
	 * accumulators of output rows 4..7 and stored to dp[8..15].  The
	 * weight rows are used in the reverse order of the above region.
	 */
	/* row 4 */
	sd = (mlib_d64 *)vis_alignaddr((void *)sp5, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[8], d8, tmp1, reg_H1_20);
	ACCPUT(dp[9], d9, tmp2, reg_H1_20);

	/* row 5 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[10], d10, tmp1, reg_H1_20);
	ACCPUT(dp[11], d11, tmp2, reg_H1_20);

	/* row 6 */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[12], d12, tmp1, reg_H1_10);
	ACCPUT(dp[13], d13, tmp2, reg_H1_11);

	/* row 7: trailing word fetched with the non-faulting load */
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = vis_ld_d64_nf(sd + 2);
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[14], d14, tmp1, reg_H1_00);
	ACCPUT(dp[15], d15, tmp2, reg_H1_00);

	return (MLIB_SUCCESS);
}