Example #1
0
static force_inline __vector4
pix_multiply (__vector4 p, __vector4 a)
{
	__vector4 hi, lo, mod;
	
	__vector4 hiLow, hiHigh, modLow, modHigh, loLow, loHigh;
	__vector4 hiLowFP, hiHighFP, modLowFP, modHighFP, loLowFP, loHighFP;
	__vector4 himodLow, himodHigh, lomodLow, lomodHigh;
	
 	__vector4 zeroVector = *(__vector4*)(&zeroVectori);
 	__vector4 zeroEightVector = *(__vector4*)(&zeroEightVectori);
	__vector4 permVecLo = *(__vector4*)(&permVecLoi);
	__vector4 permVecHi = *(__vector4*)(&permVecHii);
	__vector4 permVec1Vec2 = *(__vector4*)(&permVec1Vec2i);

	/* unpack to short */
	hi  = __vmrghb(zeroVector,p);
	mod = __vmrghb(zeroVector,a);
	
	//+ What we want to do here is to multiply 8 unsigned shorts of the hi with 8 unsigned shorts of mod.
	/* Extract the hi vector into 4 Unsigned int by using 4 Lower unsigned shorts*/
	hiLow	= __vperm(hi,zeroVector, permVecLo);
	/* Extract the hi vector into 4 Unsigned int by using 4 Upper unsigned shorts*/
	hiHigh	= __vperm(hi,zeroVector, permVecHi);

	/* Extract the mod vector into 4 Unsigned int by using 4 Lower unsigned shorts*/
	modLow	= __vperm(mod,zeroVector, permVecLo);
	/* Extract the mod vector into 4 Unsigned int by using 4 Upper unsigned shorts*/
	modHigh = __vperm(mod,zeroVector, permVecHi);

	/* Convert the 4 unsigned ints to floating point by treating them as Fixed point*/ 
	hiLowFP		= __vcfux(hiLow,0);
	hiHighFP	= __vcfux(hiHigh,0);
	modLowFP	= __vcfux(modLow,0);
	modHighFP	= __vcfux(modHigh,0);

	/* Multiply the floating points */
	himodLow	= __vmaddfp(hiLowFP, modLowFP,zeroVector);
	himodHigh	= __vmaddfp(hiHighFP, modHighFP,zeroVector);
	
	/* Convert the floating points to Fixed Point with zero digits after radix point - Effectively an unsigned int*/
	himodLow	= __vctuxs(himodLow,0);
	himodHigh	= __vctuxs(himodHigh,0);

	/* Fuse the multiplication together to get the final product*/
	hi = __vperm(himodLow,himodHigh,permVec1Vec2);

	//-



	hi = __vadduhm(hi,zeroEightVector);
	
	hi = __vadduhs(hi, __vsrh(hi, __vspltish (8)));
	hi = __vsrh (hi, __vspltish (8));


	/* unpack to short */
	lo  = __vmrglb(zeroVector,p);
	mod = __vmrglb(zeroVector,a);
	
	//+ Comments from few lines above applicable here.
	loLow	= __vperm(lo,zeroVector,permVecLo);
	loHigh	= __vperm(lo,zeroVector, permVecHi); 

	modLow	= __vperm(mod,zeroVector, permVecLo);
	modHigh = __vperm(mod,zeroVector, permVecHi);

	loLowFP		= __vcfux(loLow,0);
	loHighFP	= __vcfux(loHigh,0);
	modLowFP	= __vcfux(modLow,0);
	modHighFP	= __vcfux(modHigh,0);

	lomodLow	= __vmaddfp(loLowFP, modLowFP,zeroVector);
	lomodHigh	= __vmaddfp(loHighFP, modHighFP,zeroVector);
	
	lomodLow	= __vctuxs(lomodLow,0);
	lomodHigh	= __vctuxs(lomodHigh,0);

	lo = __vperm(lomodLow,lomodHigh,permVec1Vec2);
	//-

	lo = __vadduhm(lo,zeroEightVector);
	
	lo = __vadduhs (lo, __vsrh (lo, __vspltish (8)));
	lo = __vsrh (lo, __vspltish (8));

	return __vpkuhus (hi, lo);
}
Example #2
0
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    __vector4 fperm;
    const __vector4 vABCD = __lvx(ABCD,0);
    const __vector4 vA = __vsplth(vABCD, 1);
    const __vector4 vB = __vsplth(vABCD, 3);
    const __vector4 vC = __vsplth(vABCD, 5);
    const __vector4 vD = __vsplth(vABCD, 7);
    
	//LOAD_ZERO;
	__vector4 zerov  = __vzero();
    
	const __vector4 v32ss = __vslh(__vspltish(1),__vspltish(5));
    const __vector4 v6us = __vspltish(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    __vector4 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    __vector4 vsrc0uc, vsrc1uc;
    __vector4 vsrc0ssH, vsrc1ssH;
    __vector4 vsrcCuc, vsrc2uc, vsrc3uc;
    __vector4 vsrc2ssH, vsrc3ssH, psum;
    __vector4 vdst, ppsum, vfdst, fsum;
	
    if (((unsigned long)dst) % 16 == 0) {
        fperm = {
			0x10, 0x11, 0x12, 0x13,
			0x14, 0x15, 0x16, 0x17,