Ejemplo n.º 1
0
void ADD_SUFF(ByteGrayToIntArgbConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 d0, d1, d2, d3;
    mlib_f32 ff, aa = vis_fones();
    mlib_s32 i, j, x;

    if (width < 8) {
        for (j = 0; j < height; j++) {
            mlib_u8  *src = srcBase;
            mlib_s32 *dst = dstBase;

            for (i = 0; i < width; i++) {
                x = src[i];
                dst[i] = Gray2Argb(x);
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
        return;
    }

    if (srcScan == width && dstScan == 4*width) {
        width *= height;
        height = 1;
    }

    for (j = 0; j < height; j++) {
        mlib_u8  *src = srcBase;
        mlib_s32 *dst = dstBase;
        mlib_s32 *dst_end;

        dst_end = dst + width;

        while (((mlib_s32)src & 3) && dst < dst_end) {
            x = *src++;
            *dst++ = Gray2Argb(x);
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 4); dst += 4) {
            ff = *(mlib_f32*)src;
            d0 = vis_fpmerge(aa, ff);
            d1 = vis_fpmerge(ff, ff);
            d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1));
            d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1));
            ((mlib_f32*)dst)[0] = vis_read_hi(d2);
            ((mlib_f32*)dst)[1] = vis_read_lo(d2);
            ((mlib_f32*)dst)[2] = vis_read_hi(d3);
            ((mlib_f32*)dst)[3] = vis_read_lo(d3);
            src += 4;
        }

        while (dst < dst_end) {
            x = *src++;
            *dst++ = Gray2Argb(x);
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}
Ejemplo n.º 2
0
void ADD_SUFF(ByteGrayToIntArgbScaleConvert)(SCALE_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 d0, d1, d2, d3, dd;
    mlib_f32 ff, aa = vis_fones();
    mlib_s32 i, j, x;

    if (width < 16) {
        for (j = 0; j < height; j++) {
            mlib_u8  *src = srcBase;
            mlib_s32 *dst = dstBase;
            mlib_s32 tmpsxloc = sxloc;

            PTR_ADD(src, (syloc >> shift) * srcScan);

            for (i = 0; i < width; i++) {
                x = src[tmpsxloc >> shift];
                tmpsxloc += sxinc;
                dst[i] = Gray2Argb(x);
            }

            PTR_ADD(dstBase, dstScan);
            syloc += syinc;
        }
        return;
    }

    vis_alignaddr(NULL, 7);

    for (j = 0; j < height; j++) {
        mlib_u8  *src = srcBase;
        mlib_s32 *dst = dstBase;
        mlib_s32 *dst_end;
        mlib_s32 tmpsxloc = sxloc;

        PTR_ADD(src, (syloc >> shift) * srcScan);

        dst_end = dst + width;

#pragma pipeloop(0)
        for (; dst <= (dst_end - 4); dst += 4) {
            LOAD_NEXT_U8(dd, src + ((tmpsxloc + 3*sxinc) >> shift));
            LOAD_NEXT_U8(dd, src + ((tmpsxloc + 2*sxinc) >> shift));
            LOAD_NEXT_U8(dd, src + ((tmpsxloc +   sxinc) >> shift));
            LOAD_NEXT_U8(dd, src + ((tmpsxloc          ) >> shift));
            tmpsxloc += 4*sxinc;
            ff = vis_read_hi(dd);
            d0 = vis_fpmerge(aa, ff);
            d1 = vis_fpmerge(ff, ff);
            d2 = vis_fpmerge(vis_read_hi(d0), vis_read_hi(d1));
            d3 = vis_fpmerge(vis_read_lo(d0), vis_read_lo(d1));
            ((mlib_f32*)dst)[0] = vis_read_hi(d2);
            ((mlib_f32*)dst)[1] = vis_read_lo(d2);
            ((mlib_f32*)dst)[2] = vis_read_hi(d3);
            ((mlib_f32*)dst)[3] = vis_read_lo(d3);
        }

        while (dst < dst_end) {
            x = src[tmpsxloc >> shift];
            tmpsxloc += sxinc;
            *dst++ = Gray2Argb(x);
        }

        PTR_ADD(dstBase, dstScan);
        syloc += syinc;
    }
}
/*
 * Fills mc_block (64 mlib_s16 values, written here as 16 mlib_d64 stores
 * dp[0]..dp[15]) with a weighted combination of five regions of ref_frame:
 * the central region and the regions addressed by the above, below, left
 * and right vectors.  The function name indicates H.263 overlapped motion
 * compensation.
 *
 * The arithmetic itself lives in the ACCSET / ACCADD / ACCPUT macros, which
 * are defined outside this function.  NOTE(review): presumably ACCSET starts
 * an accumulator from weight * pixels, ACCADD adds a further weighted term,
 * and ACCPUT adds the last term, normalizes and stores the result -- confirm
 * against the macro definitions.
 *
 * Layout visible in this code:
 *   - accumulators d0..d15 are paired per step: an even one fed from the
 *     first 8 source bytes and an odd one fed from the next 8 bytes;
 *   - every step advances the source by 2 * ref_stride bytes;
 *   - central contributes to all 16 accumulators, left only to the even
 *     ones, right only to the odd ones (its pointer is offset by +8 bytes),
 *     above finishes dp[0..7] and below (its pointer is offset by +8 rows)
 *     finishes dp[8..15].
 *
 * Parameters:
 *   mc_block    destination block; accessed through an mlib_d64 pointer, so
 *               it presumably has to be 8-byte aligned (TODO confirm).
 *   ref_frame   base pointer of the 8-bit reference data.
 *   mch, mcv    horizontal (bytes) / vertical (rows) offset of the central
 *               region.
 *   mah, mav    same, for the region used in the "above" pass.
 *   mbh, mbv    same, for the region used in the "below" pass.
 *   mlh, mlv    same, for the region used in the "left" pass.
 *   mrh, mrv    same, for the region used in the "right" pass.
 *   ref_stride  distance in bytes between consecutive rows of ref_frame.
 *
 * Returns MLIB_SUCCESS unconditionally.
 */
mlib_status
__mlib_VideoH263OverlappedMC_S16_U8(
	mlib_s16 mc_block[64],
	const mlib_u8 *ref_frame,
	mlib_s32 mch,
	mlib_s32 mcv,
	mlib_s32 mah,
	mlib_s32 mav,
	mlib_s32 mbh,
	mlib_s32 mbv,
	mlib_s32 mlh,
	mlib_s32 mlv,
	mlib_s32 mrh,
	mlib_s32 mrv,
	mlib_s32 ref_stride)
{
	/* d0..d15: one accumulator per output doubleword dp[0]..dp[15]. */
	mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9;
	mlib_d64 d10, d11, d12, d13, d14, d15;
	mlib_d64 tmp1, tmp2, tmp3;
	/*
	 * dmask, denom and frnd (below) are not referenced by name in this
	 * body; presumably they are consumed inside the ACC* macros -- confirm.
	 */
	mlib_d64 dmask = vis_fexpand(vis_fones());
	mlib_d64 denom = vis_fandnot(dmask, vis_fpadd16(dmask, dmask));
	/* Per-byte weight vectors: H0 = central, H1 = above/below, H2 = left/right. */
	mlib_f32 reg_H0_00, reg_H0_01, reg_H0_10, reg_H0_20, reg_H0_21;
	mlib_f32 reg_H1_00, reg_H1_10, reg_H1_11, reg_H1_20, reg_H2_00;
	mlib_f32 reg_H2_01, reg_H2_10, reg_H2_11;
	mlib_f32 frnd;
	mlib_d64 *dp, *sd;
	const mlib_u8 *sp1, *sp2, *sp3, *sp4, *sp5;
	/* Each step below moves down two reference rows. */
	mlib_s32 ref_stride2 = ref_stride << 1, off;

	/* Start addresses of the central, above, left, right and below regions. */
	sp1 = (ref_frame + mch + mcv * ref_stride);
	sp2 = (ref_frame + mah + mav * ref_stride);
	sp3 = (ref_frame + mlh + mlv * ref_stride);
	/* right: shifted 8 bytes horizontally; it only feeds the odd accumulators. */
	sp4 = (ref_frame + mrh + 8 + mrv * ref_stride);
	/* below: shifted 8 rows down; it only feeds dp[8..15]. */
	sp5 = (ref_frame + mbh + (mbv + 8) * ref_stride);
	dp = (mlib_d64 *)mc_block;

	/*
	 * Central weights, one byte per lane.  Read as multiples of 0x10 they
	 * are 4, 5 and 6; the xx0 / xx1 pairs are mirror images used for the
	 * first and second 8 source bytes of a step.
	 */
	reg_H0_00 = vis_to_float(0x40505050);
	reg_H0_01 = vis_to_float(0x50505040);
	reg_H0_10 = vis_to_float(0x50505050);
	reg_H0_20 = vis_to_float(0x50506060);
	reg_H0_21 = vis_to_float(0x60605050);

	/* NOTE(review): looks like a rounding term for the ACC* macros -- confirm. */
	frnd = vis_to_float(0x20202020);

/*
 * central: eight steps, 16 source bytes each, initializing d0..d15.
 * vis_alignaddr yields the 8-byte aligned address at or below sp1 and
 * records the byte offset; vis_faligndata then extracts the unaligned
 * 8 bytes from each pair of adjacent aligned doublewords.
 */
	sd = (mlib_d64 *)vis_alignaddr((void *)sp1, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d0, tmp1, reg_H0_00);
	ACCSET(d1, tmp2, reg_H0_01);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d2, tmp1, reg_H0_10);
	ACCSET(d3, tmp2, reg_H0_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d4, tmp1, reg_H0_20);
	ACCSET(d5, tmp2, reg_H0_21);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d6, tmp1, reg_H0_20);
	ACCSET(d7, tmp2, reg_H0_21);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d8, tmp1, reg_H0_20);
	ACCSET(d9, tmp2, reg_H0_21);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d10, tmp1, reg_H0_20);
	ACCSET(d11, tmp2, reg_H0_21);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d12, tmp1, reg_H0_10);
	ACCSET(d13, tmp2, reg_H0_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	/*
	 * Last step: the trailing doubleword is fetched with the non-faulting
	 * load, presumably because it may lie past the end of the readable
	 * frame when the data is already aligned -- confirm.
	 */
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = vis_ld_d64_nf(sd + 2);
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCSET(d14, tmp1, reg_H0_00);
	ACCSET(d15, tmp2, reg_H0_01);

/*
 * left: eight steps, 8 source bytes each, added to the even accumulators
 * d0, d2, ..., d14.  Weights read as multiples of 0x10 are 1 and 2.
 */
	reg_H2_00 = vis_to_float(0x20101010);
	reg_H2_01 = vis_to_float(0x10101020);
	reg_H2_10 = vis_to_float(0x20201010);
	reg_H2_11 = vis_to_float(0x10102020);

	/*
	 * Unaligned access through the byte-shuffle mask instead of the align
	 * offset: the mask nibbles become off, off+1, ..., off+7, so
	 * vis_bshuffle picks the 8 bytes starting at byte `off` of the 16-byte
	 * pair (tmp1, tmp2).
	 */
	off = (mlib_addr)sp3 & 7;
	sd = (mlib_d64 *)((mlib_u8 *)sp3 - off);
	vis_write_bmask(0x11111111 * off + 0x01234567, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d0, tmp1, reg_H2_00);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d2, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d4, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d6, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d8, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d10, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d12, tmp1, reg_H2_10);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	/* Last step: trailing doubleword via the non-faulting load. */
	tmp1 = sd[0];
	tmp2 = vis_ld_d64_nf(sd + 1);
	tmp1 = vis_bshuffle(tmp1, tmp2);
	ACCADD(d14, tmp1, reg_H2_00);

/*
 * right: eight steps, 8 source bytes each (sp4 already includes the +8
 * byte shift), added to the odd accumulators d1, d3, ..., d15 with the
 * mirrored H2 weights.  Uses the align offset again, re-set for sp4.
 */
	sd = (mlib_d64 *)vis_alignaddr((void *)sp4, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d1, tmp1, reg_H2_01);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d3, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d5, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d7, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d9, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d11, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d13, tmp1, reg_H2_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	/* Last step: trailing doubleword via the non-faulting load. */
	tmp1 = sd[0];
	tmp2 = vis_ld_d64_nf(sd + 1);
	tmp1 = vis_faligndata(tmp1, tmp2);
	ACCADD(d15, tmp1, reg_H2_01);

/*
 * above: four steps, 16 source bytes each; ACCPUT takes accumulators
 * d0..d7 together with this last term and targets dp[0]..dp[7].
 * Weights read as multiples of 0x10 are 2 for the first step, a 1/2 mix
 * for the second and 1 for the remaining two.
 */
	reg_H1_10 = vis_to_float(0x10102020);
	reg_H1_11 = vis_to_float(0x20201010);
	reg_H1_20 = vis_to_float(0x10101010);

	/* Byte-shuffle mask re-programmed for the alignment of sp2. */
	off = (mlib_addr)sp2 & 7;
	sd = (mlib_d64 *)((mlib_u8 *)sp2 - off);
	vis_write_bmask(0x11111111 * off + 0x01234567, 0);
	reg_H1_00 = vis_to_float(0x20202020);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[0], d0, tmp1, reg_H1_00);
	ACCPUT(dp[1], d1, tmp2, reg_H1_00);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[2], d2, tmp1, reg_H1_10);
	ACCPUT(dp[3], d3, tmp2, reg_H1_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[4], d4, tmp1, reg_H1_20);
	ACCPUT(dp[5], d5, tmp2, reg_H1_20);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	/* Last step: trailing doubleword via the non-faulting load. */
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = vis_ld_d64_nf(sd + 2);
	tmp1 = vis_bshuffle(tmp1, tmp2);
	tmp2 = vis_bshuffle(tmp2, tmp3);
	ACCPUT(dp[6], d6, tmp1, reg_H1_20);
	ACCPUT(dp[7], d7, tmp2, reg_H1_20);

/*
 * below: four steps, 16 source bytes each (sp5 already includes the +8
 * row shift); ACCPUT takes accumulators d8..d15 and targets
 * dp[8]..dp[15].  The H1 weights are applied in the reverse order of the
 * "above" pass.
 */
	sd = (mlib_d64 *)vis_alignaddr((void *)sp5, 0);
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[8], d8, tmp1, reg_H1_20);
	ACCPUT(dp[9], d9, tmp2, reg_H1_20);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[10], d10, tmp1, reg_H1_20);
	ACCPUT(dp[11], d11, tmp2, reg_H1_20);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = sd[2];
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[12], d12, tmp1, reg_H1_10);
	ACCPUT(dp[13], d13, tmp2, reg_H1_11);
	sd = (mlib_d64 *)((mlib_u8 *)sd + ref_stride2);

	/* Last step: trailing doubleword via the non-faulting load. */
	tmp1 = sd[0];
	tmp2 = sd[1];
	tmp3 = vis_ld_d64_nf(sd + 2);
	tmp1 = vis_faligndata(tmp1, tmp2);
	tmp2 = vis_faligndata(tmp2, tmp3);
	ACCPUT(dp[14], d14, tmp1, reg_H1_00);
	ACCPUT(dp[15], d15, tmp2, reg_H1_00);

	return (MLIB_SUCCESS);
}