/*
 * Multiply the bytes of p by the corresponding bytes of a and return the
 * results packed back into bytes.
 *
 * Each half of the inputs is widened to 16-bit lanes, the lanes are
 * multiplied, zeroEightVector is added, and the sum is reduced with
 * t = (t + (t >> 8)) >> 8 before the two halves are packed together with
 * unsigned saturation.
 *
 * The 16-bit products are formed through a float round trip: the lanes are
 * spread into 32-bit unsigned ints, converted to float, multiplied,
 * converted back to unsigned ints and then narrowed again.
 *
 * NOTE(review): the permute tables (permVecLoi, permVecHii, permVec1Vec2i)
 * and the bias table (zeroEightVectori) are defined outside this block.
 * The descriptions below assume the first two tables spread the lower and
 * upper four shorts into four unsigned ints, that the third gathers the
 * eight products back into shorts, and that the bias is 0x0080 per lane
 * (so the reduction approximates division by 255) - confirm against the
 * table definitions.
 */
static force_inline __vector4
pix_multiply (__vector4 p, __vector4 a)
{
    /* Constant vectors, reinterpreted from the file-level tables. */
    __vector4 zero       = *(__vector4*)(&zeroVectori);
    __vector4 round_bias = *(__vector4*)(&zeroEightVectori);
    __vector4 spread_lo  = *(__vector4*)(&permVecLoi);
    __vector4 spread_hi  = *(__vector4*)(&permVecHii);
    __vector4 gather     = *(__vector4*)(&permVec1Vec2i);
    /* Per-halfword shift count of 8, shared by every shift below. */
    __vector4 shift8     = __vspltish (8);

    /* Scratch values, reused for the high half and then the low half. */
    __vector4 px, alpha;
    __vector4 px_f0, px_f1, alpha_f0, alpha_f1;
    __vector4 prod0, prod1;
    __vector4 res_hi, res_lo;

    /* ---- high half ---- */

    /* Unpack to short by interleaving zero bytes with the input bytes. */
    px    = __vmrghb (zero, p);
    alpha = __vmrghb (zero, a);

    /* Spread the shorts into unsigned ints (four per vector) and convert
     * them to float; a scale of 0 means no fractional bits. */
    px_f0    = __vcfux (__vperm (px,    zero, spread_lo), 0);
    px_f1    = __vcfux (__vperm (px,    zero, spread_hi), 0);
    alpha_f0 = __vcfux (__vperm (alpha, zero, spread_lo), 0);
    alpha_f1 = __vcfux (__vperm (alpha, zero, spread_hi), 0);

    /* Multiply in float (the addend is the zero vector) and convert the
     * products back to unsigned ints. */
    prod0 = __vctuxs (__vmaddfp (px_f0, alpha_f0, zero), 0);
    prod1 = __vctuxs (__vmaddfp (px_f1, alpha_f1, zero), 0);

    /* Fuse the two product vectors back into one vector of shorts. */
    res_hi = __vperm (prod0, prod1, gather);

    /* Add the bias, then t = (t + (t >> 8)) >> 8 on every halfword. */
    res_hi = __vadduhm (res_hi, round_bias);
    res_hi = __vadduhs (res_hi, __vsrh (res_hi, shift8));
    res_hi = __vsrh (res_hi, shift8);

    /* ---- low half: same sequence on the other half of the bytes ---- */

    px    = __vmrglb (zero, p);
    alpha = __vmrglb (zero, a);

    px_f0    = __vcfux (__vperm (px,    zero, spread_lo), 0);
    px_f1    = __vcfux (__vperm (px,    zero, spread_hi), 0);
    alpha_f0 = __vcfux (__vperm (alpha, zero, spread_lo), 0);
    alpha_f1 = __vcfux (__vperm (alpha, zero, spread_hi), 0);

    prod0 = __vctuxs (__vmaddfp (px_f0, alpha_f0, zero), 0);
    prod1 = __vctuxs (__vmaddfp (px_f1, alpha_f1, zero), 0);

    res_lo = __vperm (prod0, prod1, gather);

    res_lo = __vadduhm (res_lo, round_bias);
    res_lo = __vadduhs (res_lo, __vsrh (res_lo, shift8));
    res_lo = __vsrh (res_lo, shift8);

    /* Pack both halves back to bytes with unsigned saturation. */
    return __vpkuhus (res_hi, res_lo);
}
{((8 - x) * (8 - y)), (( x) * (8 - y)), ((8 - x) * ( y)), (( x) * ( y))}; register int i; __vector4 fperm; const __vector4 vABCD = __lvx(ABCD,0); const __vector4 vA = __vsplth(vABCD, 1); const __vector4 vB = __vsplth(vABCD, 3); const __vector4 vC = __vsplth(vABCD, 5); const __vector4 vD = __vsplth(vABCD, 7); //LOAD_ZERO; __vector4 zerov = __vzero(); const __vector4 v32ss = __vslh(__vspltish(1),__vspltish(5)); const __vector4 v6us = __vspltish(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; __vector4 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; __vector4 vsrc0uc, vsrc1uc; __vector4 vsrc0ssH, vsrc1ssH; __vector4 vsrcCuc, vsrc2uc, vsrc3uc; __vector4 vsrc2ssH, vsrc3ssH, psum; __vector4 vdst, ppsum, vfdst, fsum; if (((unsigned long)dst) % 16 == 0) { fperm = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,