static force_inline __vector4 pix_multiply (__vector4 p, __vector4 a) { __vector4 hi, lo, mod; __vector4 hiLow, hiHigh, modLow, modHigh, loLow, loHigh; __vector4 hiLowFP, hiHighFP, modLowFP, modHighFP, loLowFP, loHighFP; __vector4 himodLow, himodHigh, lomodLow, lomodHigh; __vector4 zeroVector = *(__vector4*)(&zeroVectori); __vector4 zeroEightVector = *(__vector4*)(&zeroEightVectori); __vector4 permVecLo = *(__vector4*)(&permVecLoi); __vector4 permVecHi = *(__vector4*)(&permVecHii); __vector4 permVec1Vec2 = *(__vector4*)(&permVec1Vec2i); /* unpack to short */ hi = __vmrghb(zeroVector,p); mod = __vmrghb(zeroVector,a); //+ What we want to do here is to multiply 8 unsigned shorts of the hi with 8 unsigned shorts of mod. /* Extract the hi vector into 4 Unsigned int by using 4 Lower unsigned shorts*/ hiLow = __vperm(hi,zeroVector, permVecLo); /* Extract the hi vector into 4 Unsigned int by using 4 Upper unsigned shorts*/ hiHigh = __vperm(hi,zeroVector, permVecHi); /* Extract the mod vector into 4 Unsigned int by using 4 Lower unsigned shorts*/ modLow = __vperm(mod,zeroVector, permVecLo); /* Extract the mod vector into 4 Unsigned int by using 4 Upper unsigned shorts*/ modHigh = __vperm(mod,zeroVector, permVecHi); /* Convert the 4 unsigned ints to floating point by treating them as Fixed point*/ hiLowFP = __vcfux(hiLow,0); hiHighFP = __vcfux(hiHigh,0); modLowFP = __vcfux(modLow,0); modHighFP = __vcfux(modHigh,0); /* Multiply the floating points */ himodLow = __vmaddfp(hiLowFP, modLowFP,zeroVector); himodHigh = __vmaddfp(hiHighFP, modHighFP,zeroVector); /* Convert the floating points to Fixed Point with zero digits after radix point - Effectively an unsigned int*/ himodLow = __vctuxs(himodLow,0); himodHigh = __vctuxs(himodHigh,0); /* Fuse the multiplication together to get the final product*/ hi = __vperm(himodLow,himodHigh,permVec1Vec2); //- hi = __vadduhm(hi,zeroEightVector); hi = __vadduhs(hi, __vsrh(hi, __vspltish (8))); hi = __vsrh (hi, __vspltish (8)); /* unpack to short */ lo = __vmrglb(zeroVector,p); mod = __vmrglb(zeroVector,a); //+ Comments from few lines above applicable here. loLow = __vperm(lo,zeroVector,permVecLo); loHigh = __vperm(lo,zeroVector, permVecHi); modLow = __vperm(mod,zeroVector, permVecLo); modHigh = __vperm(mod,zeroVector, permVecHi); loLowFP = __vcfux(loLow,0); loHighFP = __vcfux(loHigh,0); modLowFP = __vcfux(modLow,0); modHighFP = __vcfux(modHigh,0); lomodLow = __vmaddfp(loLowFP, modLowFP,zeroVector); lomodHigh = __vmaddfp(loHighFP, modHighFP,zeroVector); lomodLow = __vctuxs(lomodLow,0); lomodHigh = __vctuxs(lomodHigh,0); lo = __vperm(lomodLow,lomodHigh,permVec1Vec2); //- lo = __vadduhm(lo,zeroEightVector); lo = __vadduhs (lo, __vsrh (lo, __vspltish (8))); lo = __vsrh (lo, __vspltish (8)); return __vpkuhus (hi, lo); }
void biweight_h264_WxH_vmx(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h) { int y, dst_aligned, src_aligned; //vec_u8 vsrc, vdst; //vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3; //vec_u16 vlog2_denom; __vector4 vsrc, vdst; __vector4 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3; __vector4 vlog2_denom; DECLARE_ALIGNED(16, int32_t, temp)[4]; //LOAD_ZERO; offset = ((offset + 1) | 1) << log2_denom; temp[0] = log2_denom+1; temp[1] = weights; temp[2] = weightd; temp[3] = offset; vtemp = __lvx(temp, 0); /* vlog2_denom = (vec_u16)vec_splat(vtemp, 1); vweights = vec_splat(vtemp, 3); vweightd = vec_splat(vtemp, 5); voffset = vec_splat(vtemp, 7); */ vlog2_denom = __vsplth(vtemp,1); vweights = __vsplth(vtemp,3); vweightd = __vsplth(vtemp,5); voffset = __vsplth(vtemp,7); dst_aligned = !((unsigned long)dst & 0xf); src_aligned = !((unsigned long)src & 0xf); for (y=0; y<h; y++) { vdst = __lvx(dst, 0); vsrc = __lvx(src, 0); v0 = __vmrghb(__vzero(), vdst); v1 = __vmrglb(__vzero(), vdst); v2 = __vmrghb(__vzero(), vsrc); v3 = __vmrglb(__vzero(), vsrc); if (w == 8) { if (src_aligned) v3 = v2; else v2 = v3; } if (w == 16 || dst_aligned) { v0 = vec_mladd(v0, vweightd, __vzero()); v2 = vec_mladd(v2, vweights, __vzero()); v0 = __vaddshs (v0, voffset); v0 = __vaddshs(v0, v2); v0 = __vsrah(v0, vlog2_denom); } if (w == 16 || !dst_aligned) { v1 = vec_mladd(v1, vweightd, __vzero()); v3 = vec_mladd(v3, vweights, __vzero()); v1 = __vaddshs(v1, voffset); v1 = __vaddshs(v1, v3); v1 = __vsrah(v1, vlog2_denom); } vdst = __vpkshus(v0, v1); __stvx(vdst, dst, 0); dst += stride; src += stride; } }