/////////////////////////////////////////////////////////
// processYUVAltivec
//
// AltiVec movement detector for packed images whose luma samples sit in
// the odd bytes (the original comments describe the layout as U Y V Y).
//
// Every luma sample of `image` is compared with the sample stored in
// `buffer` by the previous call. The image is then overwritten in place:
// luma becomes 255 where |current - previous| > threshold and 0 elsewhere,
// and every even (chroma) byte becomes 127. The unmodified incoming frame
// is written to `buffer` so it serves as the history for the next call.
//
// Vectors are handled in pairs; when the vector count is odd the trailing
// vector is left untouched, in both the image and the history buffer.
/////////////////////////////////////////////////////////
void pix_movement :: processYUVAltivec(imageStruct &image)
{
  // Resize the history buffer whenever the pixel count changes (2 bytes/pixel).
  if (buffer.xsize * buffer.ysize != image.xsize * image.ysize) {
    buffer.xsize = image.xsize;
    buffer.ysize = image.ysize;
    buffer.reallocate(buffer.xsize * buffer.ysize * 2);
  }

  // One 16-byte vector carries 8 two-byte pixels; the loop consumes two vectors per pass.
  const int vectorCount = image.ysize * image.xsize / 8;
  const int pairCount   = vectorCount / 2;

  // Broadcast the scalar threshold into all eight signed 16-bit lanes.
  union {
    signed short        c[8];
    vector signed short v;
  } threshLanes;
  threshLanes.c[0] = threshold;
  const vector signed short thresh = (vector signed short)vec_splat(threshLanes.v, 0);

  // Constant written to every chroma position of the output.
  union {
    unsigned short        c[8];
    vector unsigned short v;
  } chromaLanes;
  chromaLanes.c[0] = 127;
  const vector unsigned short chroma = (vector unsigned short)vec_splat(chromaLanes.v, 0);

  // Multiplying the odd bytes by 1 widens the luma samples to 16 bits.
  const vector unsigned char one = vec_splat_u8(1);

  vector unsigned char *frame   = (vector unsigned char *) image.data;   // current image, rewritten in place
  vector unsigned char *history = (vector unsigned char *) buffer.data;  // previous frame, refreshed here

#ifndef PPC970
  //setup the cache prefetch -- A MUST!!!
  UInt32 prefetchSize = GetPrefetchConstant( 16, 0, 256 );
  vec_dst( frame, prefetchSize, 0 );
  vec_dst( history, prefetchSize, 1 );
#endif

  const int prefetchBlock = 16;

  for (int n = 0; n < pairCount; n++) {
#ifndef PPC970
    // Re-arm the four prefetch streams for the data at and ahead of the cursors.
    UInt32 streamCtl = GetPrefetchConstant( prefetchBlock, 0, prefetchBlock * 16 );
    vec_dst( frame, streamCtl, 0 );
    vec_dst( history, streamCtl, 1 );
    vec_dst( frame + 16, streamCtl, 2 );
    vec_dst( history + 16, streamCtl, 3 );
#endif

    // Fetch the two current vectors and widen their luma samples.
    const vector unsigned char cur0 = frame[0];
    const vector unsigned char cur1 = frame[1];
    const vector unsigned short lumaNow0 = (vector unsigned short)vec_mulo(cur0, one);
    const vector unsigned short lumaNow1 = (vector unsigned short)vec_mulo(cur1, one);

    // Widen the matching luma samples of the previous frame.
    const vector unsigned short lumaOld0 = (vector unsigned short)vec_mulo(history[0], one);
    const vector unsigned short lumaOld1 = (vector unsigned short)vec_mulo(history[1], one);

    // The old samples have been read, so the history can take the current frame.
    history[0] = cur0;
    history[1] = cur1;
    history += 2;

    // All-ones lanes where |now - old| > threshold, zero lanes elsewhere.
    const vector signed short delta0 =
      vec_abs(vec_sub((vector signed short)lumaNow0, (vector signed short)lumaOld0));
    const vector unsigned short mask0 = (vector unsigned short)vec_cmpgt(delta0, thresh);
    const vector signed short delta1 =
      vec_abs(vec_sub((vector signed short)lumaNow1, (vector signed short)lumaOld1));
    const vector unsigned short mask1 = (vector unsigned short)vec_cmpgt(delta1, thresh);

    // Interleave chroma constant and mask, then saturate back to bytes
    // (an all-ones 16-bit lane packs to 255).
    const vector unsigned short hi0 = vec_mergeh(chroma, mask0);
    const vector unsigned short lo0 = vec_mergel(chroma, mask0);
    const vector unsigned short hi1 = vec_mergeh(chroma, mask1);
    const vector unsigned short lo1 = vec_mergel(chroma, mask1);

    frame[0] = vec_packsu(hi0, lo0);
    frame[1] = vec_packsu(hi1, lo1);
    frame += 2;
  }

#ifndef PPC970
  // Stop the prefetch streams started above.
  vec_dss(0);
  vec_dss(1);
  vec_dss(2);
  vec_dss(3);
#endif
}
src += stride; #define noop(a) a #define add28(a) vec_add(v28ss, a) static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { DECLARE_ALIGNED(16, signed int, ABCD)[4] = {((8 - x) * (8 - y)), (( x) * (8 - y)), ((8 - x) * ( y)), (( x) * ( y))}; register int i; vec_u8 fperm; const vec_s32 vABCD = vec_ld(0, ABCD); const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); LOAD_ZERO; const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); const vec_u16 v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; vec_u8 vsrc0uc, vsrc1uc; vec_s16 vsrc0ssH, vsrc1ssH; vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; vec_s16 vsrc2ssH, vsrc3ssH, psum; vec_u8 vdst, ppsum, vfdst, fsum;
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; int s __attribute__((aligned(16))); uint8_t *pix3 = pix2 + line_size; const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2); vector unsigned char *tv, avgv, t5; vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; vector unsigned short avghv, avglv; vector unsigned short t1, t2, t3, t4; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); s = 0; /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one iteration becomes pix2 in the next iteration. We can use this fact to avoid a potentially expensive unaligned read, as well as some splitting, and vector addition each time around the loop. Read unaligned pixels into our vectors. The vectors are as follows: pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] Split the pixel vectors into shorts */ tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); tv = (vector unsigned char *) &pix2[1]; pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); t1 = vec_add(pix2hv, pix2ihv); t2 = vec_add(pix2lv, pix2ilv); for(i=0;i<h;i++) { /* Read unaligned pixels into our vectors. 
The vectors are as follows: pix1v: pix1[0]-pix1[15] pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix3[0]; pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); tv = (vector unsigned char *) &pix3[1]; pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); /* Note that Altivec does have vec_avg, but this works on vector pairs and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. Instead, we have to split the pixel vectors into vectors of shorts, and do the averaging by hand. */ /* Split the pixel vectors into shorts */ pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); /* Do the averaging on them */ t3 = vec_add(pix3hv, pix3ihv); t4 = vec_add(pix3lv, pix3ilv); avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); /* Pack the shorts back into a result */ avgv = vec_pack(avghv, avglv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix3 += line_size; /* Transfer the calculated values for pix3 into pix2 */ t1 = t3; t2 = t4; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }