/* Exemplo 1: pix_multiply */
/*
 * pix_multiply: component-wise multiply of two vectors of sixteen 8-bit
 * channels, computing approximately (p[i] * a[i]) / 255 for each byte —
 * the standard per-channel multiply used in alpha compositing.
 *
 * The 16-bit products are formed through an unsigned-int -> float -> unsigned-int
 * round trip (presumably because this VMX variant lacks an integer halfword
 * multiply such as vmladduhm — TODO confirm for the target ISA).
 *
 * NOTE(review): zeroVectori, zeroEightVectori, permVecLoi, permVecHii and
 * permVec1Vec2i are file-scope constants defined elsewhere; the names suggest
 * all-zero bytes, 0x0080 halfwords, and the permute masks described in the
 * comments below — confirm at their definitions.
 */
static force_inline __vector4
pix_multiply (__vector4 p, __vector4 a)
{
	__vector4 hi, lo, mod;
	
	__vector4 hiLow, hiHigh, modLow, modHigh, loLow, loHigh;
	__vector4 hiLowFP, hiHighFP, modLowFP, modHighFP, loLowFP, loHighFP;
	__vector4 himodLow, himodHigh, lomodLow, lomodHigh;
	
	/* Reinterpret the integer constant vectors as __vector4. */
 	__vector4 zeroVector = *(__vector4*)(&zeroVectori);
 	__vector4 zeroEightVector = *(__vector4*)(&zeroEightVectori);
	__vector4 permVecLo = *(__vector4*)(&permVecLoi);
	__vector4 permVecHi = *(__vector4*)(&permVecHii);
	__vector4 permVec1Vec2 = *(__vector4*)(&permVec1Vec2i);

	/* unpack the high 8 bytes of p and a to zero-extended shorts */
	hi  = __vmrghb(zeroVector,p);
	mod = __vmrghb(zeroVector,a);
	
	//+ What we want to do here is to multiply 8 unsigned shorts of the hi with 8 unsigned shorts of mod.
	/* Extract the hi vector into 4 Unsigned int by using 4 Lower unsigned shorts*/
	hiLow	= __vperm(hi,zeroVector, permVecLo);
	/* Extract the hi vector into 4 Unsigned int by using 4 Upper unsigned shorts*/
	hiHigh	= __vperm(hi,zeroVector, permVecHi);

	/* Extract the mod vector into 4 Unsigned int by using 4 Lower unsigned shorts*/
	modLow	= __vperm(mod,zeroVector, permVecLo);
	/* Extract the mod vector into 4 Unsigned int by using 4 Upper unsigned shorts*/
	modHigh = __vperm(mod,zeroVector, permVecHi);

	/* Convert the 4 unsigned ints to floating point by treating them as Fixed point*/ 
	hiLowFP		= __vcfux(hiLow,0);
	hiHighFP	= __vcfux(hiHigh,0);
	modLowFP	= __vcfux(modLow,0);
	modHighFP	= __vcfux(modHigh,0);

	/* Multiply the floating points (fused multiply-add with a zero addend) */
	himodLow	= __vmaddfp(hiLowFP, modLowFP,zeroVector);
	himodHigh	= __vmaddfp(hiHighFP, modHighFP,zeroVector);
	
	/* Convert the floating points to Fixed Point with zero digits after radix point - Effectively an unsigned int*/
	himodLow	= __vctuxs(himodLow,0);
	himodHigh	= __vctuxs(himodHigh,0);

	/* Fuse the multiplication together to get the final product:
	   repack the eight 32-bit products into eight 16-bit lanes */
	hi = __vperm(himodLow,himodHigh,permVec1Vec2);

	//-



	/* add the rounding bias (zeroEightVector — presumably 0x0080 per halfword,
	   i.e. 128; confirm at its definition) */
	hi = __vadduhm(hi,zeroEightVector);
	
	/* divide by 255 with rounding: x/255 ~= (x + (x >> 8)) >> 8 */
	hi = __vadduhs(hi, __vsrh(hi, __vspltish (8)));
	hi = __vsrh (hi, __vspltish (8));


	/* same pipeline for the low 8 bytes of p and a: unpack to short */
	lo  = __vmrglb(zeroVector,p);
	mod = __vmrglb(zeroVector,a);
	
	//+ Comments from few lines above applicable here.
	loLow	= __vperm(lo,zeroVector,permVecLo);
	loHigh	= __vperm(lo,zeroVector, permVecHi); 

	modLow	= __vperm(mod,zeroVector, permVecLo);
	modHigh = __vperm(mod,zeroVector, permVecHi);

	loLowFP		= __vcfux(loLow,0);
	loHighFP	= __vcfux(loHigh,0);
	modLowFP	= __vcfux(modLow,0);
	modHighFP	= __vcfux(modHigh,0);

	lomodLow	= __vmaddfp(loLowFP, modLowFP,zeroVector);
	lomodHigh	= __vmaddfp(loHighFP, modHighFP,zeroVector);
	
	lomodLow	= __vctuxs(lomodLow,0);
	lomodHigh	= __vctuxs(lomodHigh,0);

	lo = __vperm(lomodLow,lomodHigh,permVec1Vec2);
	//-

	lo = __vadduhm(lo,zeroEightVector);
	
	lo = __vadduhs (lo, __vsrh (lo, __vspltish (8)));
	lo = __vsrh (lo, __vspltish (8));

	/* saturating pack of both halves back to sixteen unsigned bytes */
	return __vpkuhus (hi, lo);
}
/* Exemplo 2: biweight_h264_WxH_vmx */
/*
 * H.264 explicit biweighted prediction (VMX port of the AltiVec version):
 * for each pixel, dst = clip(((dst*weightd + src*weights + offset') >>
 * (log2_denom+1))), with offset' = ((offset+1)|1) << log2_denom as required
 * by the spec's rounding rule. Processes h rows of width w (8 or 16);
 * intermediate math is done on zero-extended 16-bit lanes with saturating
 * adds/pack.
 *
 * dst/src     destination and source pixel rows (dst is updated in place)
 * stride      row stride in bytes for both dst and src
 * log2_denom  weight denominator exponent
 * weightd/weights  weights applied to dst and src samples
 * offset      rounding offset before biasing
 * w, h        block width (8 or 16) and height
 *
 * Fix vs. the previous revision: the port had left vec_mladd() unconverted
 * while every sibling AltiVec call was already replaced by its __v*
 * intrinsic; it now uses __vmladduhm (vmladduhm is the opcode vec_mladd
 * maps to for these operand types). Dead commented-out vec_* code removed.
 */
void biweight_h264_WxH_vmx(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
                               int weightd, int weights, int offset, int w, int h)
{
    int y, dst_aligned, src_aligned;

    __vector4 vsrc, vdst;
    __vector4 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3;
    __vector4 vlog2_denom;

    DECLARE_ALIGNED(16, int32_t, temp)[4];

    /* spec rounding: force offset odd, then pre-scale by the denominator */
    offset = ((offset + 1) | 1) << log2_denom;
    temp[0] = log2_denom+1;
    temp[1] = weights;
    temp[2] = weightd;
    temp[3] = offset;

    vtemp = __lvx(temp, 0);
    /* splat the low halfword of each int32 (big-endian halfword indices
       1,3,5,7 — presumably a big-endian target; confirm) */
    vlog2_denom = __vsplth(vtemp,1);
    vweights = __vsplth(vtemp,3);
    vweightd = __vsplth(vtemp,5);
    voffset = __vsplth(vtemp,7);

    dst_aligned = !((unsigned long)dst & 0xf);
    src_aligned = !((unsigned long)src & 0xf);

    for (y=0; y<h; y++) {
        vdst = __lvx(dst, 0);
        vsrc = __lvx(src, 0);

        /* zero-extend bytes to shorts: v0/v1 = dst high/low, v2/v3 = src */
        v0 = __vmrghb(__vzero(), vdst);
        v1 = __vmrglb(__vzero(), vdst);
        v2 = __vmrghb(__vzero(), vsrc);
        v3 = __vmrglb(__vzero(), vsrc);

        if (w == 8) {
            /* an unaligned 8-wide block lives in the low half of the load;
               mirror the valid src half so both code paths below see it */
            if (src_aligned)
                v3 = v2;
            else
                v2 = v3;
        }

        if (w == 16 || dst_aligned) {
            v0 = __vmladduhm(v0, vweightd, __vzero());
            v2 = __vmladduhm(v2, vweights, __vzero());

            v0 = __vaddshs (v0, voffset);   /* saturating adds */
            v0 = __vaddshs(v0, v2);
            v0 = __vsrah(v0, vlog2_denom);  /* arithmetic shift by log2_denom+1 */
        }
        if (w == 16 || !dst_aligned) {
            v1 = __vmladduhm(v1, vweightd, __vzero());
            v3 = __vmladduhm(v3, vweights, __vzero());

            v1 = __vaddshs(v1, voffset);
            v1 = __vaddshs(v1, v3);
            v1 = __vsrah(v1, vlog2_denom);
        }
        /* saturating pack back to unsigned bytes and store the row */
        vdst = __vpkshus(v0, v1);
        __stvx(vdst, dst, 0);

        dst += stride;
        src += stride;
    }
}