static void predict_16x16_dc_128_altivec( uint8_t *src )
{
    /* test if generating the constant is faster than loading it.
    vector unsigned int bc_v = (vector unsigned int)CV(0x80808080, 0x80808080,
                                                       0x80808080, 0x80808080);
    */
    vec_u8_t bc_v = vec_vslb((vec_u8_t)vec_splat_u8(1), (vec_u8_t)vec_splat_u8(7));
    PREDICT_16x16_DC_ALTIVEC(bc_v);
}
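/*
 * The commented-out alternative in predict_16x16_dc_128_altivec raises a fair
 * question: vec_splat_u8() only accepts a 5-bit signed literal (-16..15), so a
 * 0x80 splat has to be either synthesised in two instructions or loaded from
 * memory.  A minimal sketch of the two options (hypothetical helper names,
 * using plain vec_sl instead of the GCC-specific vec_vslb):
 */
#include <altivec.h>

/* Option 1: synthesise the constant, no memory traffic. */
static vector unsigned char splat_0x80_generate(void)
{
    /* splat 0x01 into every byte, then shift each byte left by 7 -> 0x80 */
    return vec_sl(vec_splat_u8(1), vec_splat_u8(7));
}

/* Option 2: one aligned load from a static constant. */
static vector unsigned char splat_0x80_load(void)
{
    static const vector unsigned char c128 =
        { 128, 128, 128, 128, 128, 128, 128, 128,
          128, 128, 128, 128, 128, 128, 128, 128 };
    return c128;
}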
int foo(volatile vector float &i, int &j)
{
    vector unsigned char zero = vec_splat_u8(0);
    vector unsigned char one  = vec_splat_u8(1);
    i = vec_add((vector float)zero, (vector float)one);
    j = 5;
    return 0;
}
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
                                                   register vector unsigned char p1,
                                                   register vector unsigned char p2,
                                                   register vector unsigned char q0,
                                                   register vector unsigned char tc0)
{
    register vector unsigned char average = vec_avg(p0, q0);
    register vector unsigned char temp;
    register vector unsigned char unclipped;
    register vector unsigned char ones;
    register vector unsigned char max;
    register vector unsigned char min;
    register vector unsigned char newp1;

    temp      = vec_xor(average, p2);
    average   = vec_avg(average, p2);     /* avg(p2, avg(p0, q0))     */
    ones      = vec_splat_u8(1);
    temp      = vec_and(temp, ones);      /* (p2 ^ avg(p0, q0)) & 1   */
    unclipped = vec_subs(average, temp);  /* (p2 + ((p0+q0+1)>>1))>>1 */
    max       = vec_adds(p1, tc0);
    min       = vec_subs(p1, tc0);
    newp1     = vec_max(min, unclipped);
    newp1     = vec_min(max, newp1);
    return newp1;
}
void ff_vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64]) { LOAD_ZERO; vec_u8 t, vdst; vec_s16 vdst_16; vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst)); IDCT_START IDCT_1D(NOP, NOP) TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); IDCT_1D(ADD8, SHIFT4) #define ADD(a)\ vdst = vec_ld(0, dst);\ vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\ vdst_16 = vec_adds(a, vdst_16);\ t = vec_packsu(vdst_16, vdst_16);\ vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ vec_ste((vec_u32)t, 4, (unsigned int *)dst); ADD(b0) dst += stride; ADD(b1) dst += stride; ADD(b2) dst += stride; ADD(b3) dst += stride; ADD(b4) dst += stride; ADD(b5) dst += stride; ADD(b6) dst += stride; ADD(b7) }
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, register vec_u8_t p1,
                                       register vec_u8_t p2, register vec_u8_t q0,
                                       register vec_u8_t tc0)
{
    register vec_u8_t average = vec_avg(p0, q0);
    register vec_u8_t temp;
    register vec_u8_t unclipped;
    register vec_u8_t ones;
    register vec_u8_t max;
    register vec_u8_t min;
    register vec_u8_t newp1;

    temp      = vec_xor(average, p2);
    average   = vec_avg(average, p2);     /* avg(p2, avg(p0, q0))     */
    ones      = vec_splat_u8(1);
    temp      = vec_and(temp, ones);      /* (p2 ^ avg(p0, q0)) & 1   */
    unclipped = vec_subs(average, temp);  /* (p2 + ((p0+q0+1)>>1))>>1 */
    max       = vec_adds(p1, tc0);
    min       = vec_subs(p1, tc0);
    newp1     = vec_max(min, unclipped);
    newp1     = vec_min(max, newp1);
    return newp1;
}
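/*
 * Why the xor/and/subs sequence in the two h264_deblock_q1 variants above
 * matches the commented formula is not obvious at a glance: vec_avg rounds up,
 * so subtracting the parity bit (p2 ^ avg(p0,q0)) & 1 turns the rounded
 * average back into the floored one.  A scalar reference for a single sample,
 * as I read the vector code (hypothetical helper; it ignores the 16-lane
 * parallelism, and the saturation in vec_adds/vec_subs does not change the
 * clipped result here):
 */
#include <stdint.h>

static inline uint8_t deblock_q1_scalar(uint8_t p0, uint8_t p1, uint8_t p2,
                                        uint8_t q0, uint8_t tc0)
{
    int avg_p0q0  = (p0 + q0 + 1) >> 1;    /* vec_avg(p0, q0), rounds up        */
    int unclipped = (p2 + avg_p0q0) >> 1;  /* rounded avg minus parity == floor */
    int lo = p1 - tc0;                     /* vec_subs(p1, tc0)                 */
    int hi = p1 + tc0;                     /* vec_adds(p1, tc0)                 */

    if (unclipped < lo) unclipped = lo;    /* vec_max(min, unclipped) */
    if (unclipped > hi) unclipped = hi;    /* vec_min(max, newp1)     */
    return (uint8_t)unclipped;
}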
void transfer_8to16subro_altivec_c(int16_t * dct, const uint8_t * cur,
                                   const uint8_t * ref, const uint32_t stride)
{
    register vector unsigned char c;
    register vector unsigned char r;
    register vector unsigned char z;
    register vector signed short cs;
    register vector signed short rs;

#ifdef DEBUG
    /* Check the alignment assumptions if this is on */
    if ((long)dct & 0xf)
        fprintf(stderr, "transfer_8to16subro_altivec_c:incorrect align, dct: %lx\n", (long)dct);
#endif

    /* initialize */
    z = vec_splat_u8(0);

    SUBRO8TO16();
    SUBRO8TO16();
    SUBRO8TO16();
    SUBRO8TO16();
    SUBRO8TO16();
    SUBRO8TO16();
    SUBRO8TO16();
    SUBRO8TO16();
}
void transfer_8to16sub_altivec_c(int16_t * dct, uint8_t * cur, uint8_t * ref,
                                 const uint32_t stride)
{
    register vector unsigned char c, r;
    register vector unsigned char ox00;
    register vector unsigned char mask_00ff;
    register vector unsigned char mask;
    register vector signed short cs, rs;

#ifdef DEBUG
    if ((long)dct & 0xf)
        fprintf(stderr, "transfer_8to16sub_altivec_c:incorrect align, dct: %lx\n", (long)dct);
    if ((long)cur & 0x7)
        fprintf(stderr, "transfer_8to16sub_altivec_c:incorrect align, cur: %lx\n", (long)cur);
    if (stride & 0x7)
        fprintf(stderr, "transfer_8to16sub_altivec_c:incorrect stride, stride: %lu\n", (long)stride);
#endif

    /* initialize */
    ox00 = vec_splat_u8(0);
    mask_00ff = vec_pack((vector unsigned short)ox00, vec_splat_u16(-1));

    SUB8TO16();
    SUB8TO16();
    SUB8TO16();
    SUB8TO16();
    SUB8TO16();
    SUB8TO16();
    SUB8TO16();
    SUB8TO16();
}
void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) { vec_s16 *block = (vec_s16*)blk; vec_u8 tmp; vec_s16 tmp2, tmp3; vec_u8 perm0; vec_u8 perm1; vec_u8 p0, p1, p; IDCT p0 = vec_lvsl (0, dest); p1 = vec_lvsl (stride, dest); p = vec_splat_u8 (-1); perm0 = vec_mergeh (p, p0); perm1 = vec_mergeh (p, p1); #define ADD(dest,src,perm) \ /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ tmp = vec_ld (0, dest); \ tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \ tmp3 = vec_adds (tmp2, src); \ tmp = vec_packsu (tmp3, tmp3); \ vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); ADD (dest, vx0, perm0) dest += stride; ADD (dest, vx1, perm1) dest += stride; ADD (dest, vx2, perm0) dest += stride; ADD (dest, vx3, perm1) dest += stride; ADD (dest, vx4, perm0) dest += stride; ADD (dest, vx5, perm1) dest += stride; ADD (dest, vx6, perm0) dest += stride; ADD (dest, vx7, perm1) }
void transfer_8to16copy_altivec_c(int16_t *dst, uint8_t * src, uint32_t stride)
{
    register vector unsigned char s;
    register vector unsigned char zerovec;

#ifdef DEBUG
    /* Check the alignment */
    if ((long)dst & 0xf)
        fprintf(stderr, "transfer_8to16copy_altivec_c:incorrect align, dst: %lx\n", (long)dst);
#endif

    /* initialization */
    zerovec = vec_splat_u8(0);

    COPY8TO16();
    COPY8TO16();
    COPY8TO16();
    COPY8TO16();
    COPY8TO16();
    COPY8TO16();
    COPY8TO16();
    COPY8TO16();
}
void imageFilterMean_Altivec(unsigned char *src1, unsigned char *src2,
                             unsigned char *dst, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while ((((long)dst & 0xF) > 0) && (n > 0)) {
        MEAN_PIXEL();
        --n; ++dst; ++src1; ++src2;
    }

    // Do bulk of processing using Altivec (find the mean of 16 8-bit unsigned integers, with saturation)
    vector unsigned char rshft = vec_splat_u8(0x1);
    while (n >= 16) {
        vector unsigned char s1 = vec_ld(0, src1);
        s1 = vec_sr(s1, rshft); // shift right 1
        vector unsigned char s2 = vec_ld(0, src2);
        s2 = vec_sr(s2, rshft); // shift right 1
        vector unsigned char r = vec_adds(s1, s2);
        vec_st(r, 0, dst);

        n -= 16; src1 += 16; src2 += 16; dst += 16;
    }

    // If any bytes are left over, deal with them individually
    ++n;
    BASIC_MEAN();
}
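/*
 * The vector loop in imageFilterMean_Altivec averages by shifting each operand
 * right by one before the saturating add, which sidesteps 8-bit overflow but
 * drops the two low bits.  A one-lane scalar sketch (hypothetical helper; the
 * result can come out one less than the exactly rounded mean):
 */
static inline unsigned char mean_pixel_scalar(unsigned char a, unsigned char b)
{
    /* (a >> 1) + (b >> 1) never exceeds 254, so the saturating add never
     * actually saturates; the result differs from (a + b + 1) / 2 by at most 1. */
    return (unsigned char)((a >> 1) + (b >> 1));
}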
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix3v, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; uint8_t *pix3 = pix2 + line_size; s = 0; sad = (vector unsigned int)vec_splat_u32(0); /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one iteration becomes pix2 in the next iteration. We can use this fact to avoid a potentially expensive unaligned read, each time around the loop. Read unaligned pixels into our vectors. The vectors are as follows: pix2v: pix2[0]-pix2[15] Split the pixel vectors into shorts */ tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix3v: pix3[0]-pix3[15] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix3[0]; pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix3v); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2v = pix3v; pix3 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
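/*
 * pix_abs16x16_y2_altivec, like the other SAD kernels in this collection,
 * relies on one identity to get a byte-wise absolute difference without a
 * signed subtract: for unsigned values |a - b| == max(a, b) - min(a, b),
 * which is what the vec_sub(vec_max(...), vec_min(...)) pair computes per
 * lane.  A scalar sketch of one lane (hypothetical helper):
 */
static inline unsigned abs_diff_u8(unsigned a, unsigned b)
{
    return (a > b) ? (a - b) : (b - a);
}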
/* next one assumes that ((line_size % 8) == 0) */ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) { register int i; register vector unsigned char pixelsv1, pixelsv2, pixelsavg; register vector unsigned char blockv, temp1, temp2; register vector unsigned short pixelssum1, pixelssum2, temp3; register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { pixelsv2 = temp2; } else { pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); } pixelsv1 = vec_mergeh(vczero, pixelsv1); pixelsv2 = vec_mergeh(vczero, pixelsv2); pixelssum1 = vec_add((vector unsigned short)pixelsv1, (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vcone); for (i = 0; i < h ; i++) { int rightside = ((unsigned long)block & 0x0000000F); blockv = vec_ld(0, block); temp1 = vec_ld(line_size, pixels); temp2 = vec_ld(line_size + 16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { pixelsv2 = temp2; } else { pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); } pixelsv1 = vec_mergeh(vczero, pixelsv1); pixelsv2 = vec_mergeh(vczero, pixelsv2); pixelssum2 = vec_add((vector unsigned short)pixelsv1, (vector unsigned short)pixelsv2); temp3 = vec_add(pixelssum1, pixelssum2); temp3 = vec_sra(temp3, vctwo); pixelssum1 = vec_add(pixelssum2, vcone); pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); if (rightside) { blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); } else { blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); } vec_st(blockv, 0, block); block += line_size; pixels += line_size; } }
uint32_t quant_h263_inter_altivec_c(int16_t *coeff, int16_t *data,
                                    const uint32_t quant,
                                    const uint16_t *mpeg_quant_matrices)
{
    vector unsigned char zerovec;
    vector unsigned short mult;
    vector unsigned short quant_m_2;
    vector unsigned short quant_d_2;
    vector unsigned short sum_short;
    vector signed short acLevel;
    vector unsigned int even;
    vector unsigned int odd;
    vector bool short m2_mask;
    vector bool short zero_mask;
    uint32_t result;

#ifdef DEBUG
    if (((unsigned)coeff) & 0xf)
        fprintf(stderr, "quant_h263_inter_altivec_c:incorrect align, coeff: %lx\n", (long)coeff);
#endif

    /* initialisation stuff */
    zerovec = vec_splat_u8(0);

    *((unsigned short*)&mult) = (unsigned short)multipliers[quant];
    mult = vec_splat(mult, 0);

    *((unsigned short*)&quant_m_2) = (unsigned short)quant;
    quant_m_2 = vec_splat(quant_m_2, 0);
    quant_m_2 = vec_sl(quant_m_2, vec_splat_u16(1));

    *((unsigned short*)&quant_d_2) = (unsigned short)quant;
    quant_d_2 = vec_splat(quant_d_2, 0);
    quant_d_2 = vec_sr(quant_d_2, vec_splat_u16(1));

    sum_short = (vector unsigned short)zerovec;

    /* Quantize */
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();

    /* Calculate the return value */
    even = (vector unsigned int)vec_sum4s((vector signed short)sum_short, (vector signed int)zerovec);
    even = (vector unsigned int)vec_sums((vector signed int)even, (vector signed int)zerovec);
    even = vec_splat(even, 3);
    vec_ste(even, 0, &result);
    return result;
}
void YV12_422_Altivec(uint8_t *in, uint8_t *out, uint32_t w, uint32_t h)
{
    uint8_t *y, *y2, *u, *v, *out2;
    uint32_t dx, dy;
    vector unsigned char vecy, vecy2, vecu, vecv, MSQ, mask;
    vector unsigned char zero;

#define VEC16 vector unsigned short
#define VEC8  vector unsigned char
#define VECS8 vector signed char

    out2 = out + w*2;
    y  = in;
    y2 = in + w;
    u  = in + w*h;
    v  = in + ((w*h*5) >> 2);
    zero = vec_splat_u8(0);

    if ((long int)out & 15) {
        printf("Alignment issue in yv12 to 422 altivec!\n");
    }

    for (dy = h>>1; dy > 0; dy--) {
        // We do 4 pix in a row
        for (dx = w>>3; dx > 0; dx--) {
            LOAD_ALIGN(vecy, y);   // expand
            LOAD_ALIGN(vecy2, y2); // expand
            LOAD_ALIGN(vecu, v);   // expand
            LOAD_ALIGN(vecv, u);   // expand

            vecu  = (VEC8)vec_mergeh(vecu, vecv);
            vecy  = (VEC8)vec_mergeh(vecy, vecu);
            vecy2 = (VEC8)vec_mergeh(vecy2, vecu);

            // Store
            vec_st(vecy, 0, out);
            vec_st(vecy2, 0, out2);

            // next
            out2 += 16; out += 16;
            y += 8; y2 += 8;
            u += 4; v += 4;
        }
        out  += w*2;
        out2 += w*2;
        y  += w;
        y2 += w;
    }
}
void foo (void)
{
    vector bool int   boolVec1 = (vector bool int)   vec_splat_u32(3);
    vector bool short boolVec2 = (vector bool short) vec_splat_u16(3);
    vector bool char  boolVec3 = (vector bool char)  vec_splat_u8(3);

    boolVec1 = vec_sld( boolVec1, boolVec1, 4 );
    boolVec2 = vec_sld( boolVec2, boolVec2, 2 );
    boolVec3 = vec_sld( boolVec3, boolVec3, 1 );
}
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; int s; const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char perm1 = vec_lvsl(0, pix2); vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); vector unsigned char pix2l, pix2r; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < h; i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ pix1v = vec_ld( 0, pix1); pix2l = vec_ld( 0, pix2); pix2r = vec_ld(16, pix2); pix2v = vec_perm(pix2l, pix2r, perm1); pix2iv = vec_perm(pix2l, pix2r, perm2); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix2iv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
static inline void abcd2cbad_internal( register const vector unsigned char p, unsigned char *data, unsigned int length, unsigned char *newdata ) { register vector unsigned char d0,d1,d2,z; z = vec_splat_u8(0); length = eround16(length); if( length >= 3 ) { length -= 3; d2 = vec_ld(32,data); d1 = vec_ld(16,data); d0 = vec_ld(0,data); while( length >= 3 ) { d0 = vec_perm(d0,z,p); d1 = vec_perm(d1,z,p); d2 = vec_perm(d2,z,p); vec_st(d0,0,newdata); vec_st(d1,16,newdata); vec_st(d2,32,newdata); length -= 3; data += 16*3; newdata += 16*3; d2 = vec_ld(32,data); d1 = vec_ld(16,data); d0 = vec_ld(0,data); } d0 = vec_perm(d0,z,p); d1 = vec_perm(d1,z,p); d2 = vec_perm(d2,z,p); vec_st(d0,0,newdata); vec_st(d1,16,newdata); vec_st(d2,32,newdata); } if( length == 2 ) { d0 = vec_ld(0,data); d1 = vec_ld(16,data); d0 = vec_perm(d0,z,p); d1 = vec_perm(d1,z,p); vec_st(d0,0,newdata); vec_st(d1,16,newdata); } else if( length == 1 ) { d0 = vec_ld(0,data); d0 = vec_perm(d0,d0,z); vec_st(d0,0,newdata); } }
uint32_t dequant_h263_inter_altivec_c(int16_t *data, int16_t *coeff,
                                      const uint32_t quant,
                                      const uint16_t *mpeg_quant_matrices)
{
    vector signed short acLevel;
    vector signed short vec_2048;
    vector unsigned short quant_m_2;
    vector unsigned short quant_add;
    vector unsigned short t;
    register vector unsigned int even;
    register vector unsigned int odd;
    register vector unsigned int high;
    register vector unsigned int low;
    register vector unsigned char zerovec;
    vector bool short equal_zero;
    vector bool short less_zero;
    vector bool short overflow;

#ifdef DEBUG
    /* print alignment errors if this is on */
    if (((unsigned)data) & 0xf)
        fprintf(stderr, "dequant_h263_inter_altivec_c:incorrect align, data: %lx\n", (long)data);
#endif

    /* initialize */
    *((unsigned short*)&quant_m_2) = (unsigned short)(quant << 1);
    quant_m_2 = vec_splat(quant_m_2, 0);

    *((unsigned short*)&quant_add) = (unsigned short)(quant & 1 ? quant : quant - 1);
    quant_add = vec_splat(quant_add, 0);

    vec_2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
    zerovec = vec_splat_u8(0);

    /* dequant */
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();

    return 0;
}
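/*
 * For reference, the arithmetic the DEQUANT_H263_INTER_ALTIVEC() macro
 * vectorises appears to be the standard H.263 inverse quantisation, with
 * quant_add selected above and vec_2048 acting as the clamp.  A scalar
 * sketch of one coefficient under that assumption (hypothetical helper):
 */
#include <stdint.h>

static inline int16_t dequant_h263_inter_scalar(int16_t coeff, uint32_t quant)
{
    int add = (quant & 1) ? (int)quant : (int)quant - 1;  /* quant_add above */
    int level;

    if (coeff == 0)
        return 0;                                         /* zero stays zero */

    level = 2 * (int)quant * (coeff < 0 ? -coeff : coeff) + add;
    if (coeff < 0)
        level = -level;

    /* clamp to the legal coefficient range, cf. vec_2048 above */
    if (level >  2047) level =  2047;
    if (level < -2048) level = -2048;
    return (int16_t)level;
}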
uint32_t sad16bi_altivec_c(vector unsigned char *cur, vector unsigned char *ref1,
                           vector unsigned char *ref2, uint32_t stride)
{
    vector unsigned char t1, t2;
    vector unsigned char mask1, mask2;
    vector unsigned char sad;
    vector unsigned int sum;
    uint32_t result;

#ifdef DEBUG
    /* print alignment errors if this is on */
    if ((long)cur & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (long)cur);
    if (stride & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect stride, stride: %lu\n", (unsigned long)stride);
#endif

    /* Initialisation stuff */
    stride >>= 4;
    mask1 = vec_lvsl(0, (unsigned char*)ref1);
    mask2 = vec_lvsl(0, (unsigned char*)ref2);
    sad = vec_splat_u8(0);
    sum = (vector unsigned int)sad;

    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();

    sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0));
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, (uint32_t*)&result);
    return result;
}
uint32_t quant_h263_intra_altivec_c(int16_t *coeff, int16_t *data,
                                    const uint32_t quant, const uint32_t dcscalar,
                                    const uint16_t *mpeg_quant_matrices)
{
    vector unsigned char zerovec;
    vector unsigned short mult;
    vector unsigned short quant_m_2;
    vector signed short acLevel;
    register vector unsigned int even;
    register vector unsigned int odd;
    vector bool short zero_mask;
    vector bool short m2_mask;
    register int16_t *origin_coeff = coeff;
    register int16_t *origin_data = data;

#ifdef DEBUG
    if (((unsigned)coeff) & 15)
        fprintf(stderr, "quant_h263_intra_altivec_c:incorrect align, coeff: %lx\n", (long)coeff);
#endif

    zerovec = vec_splat_u8(0);

    *((unsigned short*)&mult) = (unsigned short)multipliers[quant];
    mult = vec_splat(mult, 0);

    *((unsigned short*)&quant_m_2) = (unsigned short)quant;
    quant_m_2 = vec_splat(quant_m_2, 0);
    quant_m_2 = vec_sl(quant_m_2, vec_splat_u16(1));

    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();

    // finally, set the first (DC) coefficient
    origin_coeff[0] = DIV_DIV(origin_data[0], (int32_t)dcscalar);
    return 0;
}
void pix_diff :: processRGBA_Altivec(imageStruct &image, imageStruct &right) { int datasize = image.xsize * image.ysize / 4; vector signed short hiImage, loImage, hiRight, loRight; vector unsigned char zero = vec_splat_u8(0); vector unsigned char *inData = (vector unsigned char *)image.data; vector unsigned char *rightData = (vector unsigned char *)right.data; #ifndef PPC970 UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 ); vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); vec_dst( inData+256, prefetchSize, 2 ); vec_dst( rightData+256, prefetchSize, 3 ); #endif do { #ifndef PPC970 vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); vec_dst( inData+256, prefetchSize, 2 ); vec_dst( rightData+256, prefetchSize, 3 ); #endif hiImage = (vector signed short)vec_mergeh(zero,inData[0]); loImage = (vector signed short)vec_mergel(zero,inData[0]); hiRight = (vector signed short)vec_mergeh(zero,rightData[0]); loRight = (vector signed short)vec_mergel(zero,rightData[0]); hiImage = vec_subs(hiImage,hiRight); loImage = vec_subs(loImage,loRight); hiImage = vec_abs(hiImage); loImage = vec_abs(loImage); inData[0] = vec_packsu(hiImage,loImage); inData++; rightData++; } while (--datasize); #ifndef PPC970 vec_dss( 0 ); vec_dss( 1 ); vec_dss( 2 ); vec_dss( 3 ); #endif }
static av_always_inline void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int w, int is6tap) { LOAD_H_SUBPEL_FILTER(mx-1); vec_u8 align_vec0, align_vec8, permh0, permh8, filt; vec_u8 perm_6tap0, perm_6tap8, perml0, perml8; vec_u8 a, b, pixh, pixl, outer; vec_s16 f16h, f16l; vec_s32 filth, filtl; vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 }; vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 }; vec_u8 perm_inner = is6tap ? perm_inner6 : perm_inner4; vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 }; vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6)); vec_u16 c7 = vec_splat_u16(7); align_vec0 = vec_lvsl( -is6tap-1, src); align_vec8 = vec_lvsl(8-is6tap-1, src); permh0 = vec_perm(align_vec0, align_vec0, perm_inner); permh8 = vec_perm(align_vec8, align_vec8, perm_inner); perm_inner = vec_add(perm_inner, vec_splat_u8(4)); perml0 = vec_perm(align_vec0, align_vec0, perm_inner); perml8 = vec_perm(align_vec8, align_vec8, perm_inner); perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer); perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer); while (h --> 0) { FILTER_H(f16h, 0); if (w == 16) { FILTER_H(f16l, 8); filt = vec_packsu(f16h, f16l); vec_st(filt, 0, dst); } else { filt = vec_packsu(f16h, f16h); vec_ste((vec_u32)filt, 0, (uint32_t*)dst); if (w == 8) vec_ste((vec_u32)filt, 4, (uint32_t*)dst); } src += src_stride; dst += dst_stride; } }
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); tv = (vector unsigned char *) &pix2[1]; pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix2iv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) { POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); vec_s16 *block = (vec_s16*)blk; vec_u8 tmp; vec_s16 tmp2, tmp3; vec_u8 perm0; vec_u8 perm1; vec_u8 p0, p1, p; #if CONFIG_POWERPC_PERF POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); #endif IDCT p0 = vec_lvsl (0, dest); p1 = vec_lvsl (stride, dest); p = vec_splat_u8 (-1); perm0 = vec_mergeh (p, p0); perm1 = vec_mergeh (p, p1); #define ADD(dest,src,perm) \ /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ tmp = vec_ld (0, dest); \ tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \ tmp3 = vec_adds (tmp2, src); \ tmp = vec_packsu (tmp3, tmp3); \ vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); ADD (dest, vx0, perm0) dest += stride; ADD (dest, vx1, perm1) dest += stride; ADD (dest, vx2, perm0) dest += stride; ADD (dest, vx3, perm1) dest += stride; ADD (dest, vx4, perm0) dest += stride; ADD (dest, vx5, perm1) dest += stride; ADD (dest, vx6, perm0) dest += stride; ADD (dest, vx7, perm1) POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); }
static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64]) { LOAD_ZERO; vec_u8 t, vdst; vec_s16 vdst_16; vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst)); IDCT_START IDCT_1D(NOP, NOP) TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); IDCT_1D(ADD8, SHIFT4) #if HAVE_BIGENDIAN #define GET_VDST16\ vdst = vec_ld(0, dst);\ vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask); #else #define GET_VDST16\ vdst = vec_vsx_ld(0,dst);\ vdst_16 = (vec_s16)vec_mergeh(vdst, zero_u8v); #endif #define ADD(a)\ GET_VDST16;\ vdst_16 = vec_adds(a, vdst_16);\ t = vec_packsu(vdst_16, vdst_16);\ vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ vec_ste((vec_u32)t, 4, (unsigned int *)dst); ADD(b0) dst += stride; ADD(b1) dst += stride; ADD(b2) dst += stride; ADD(b3) dst += stride; ADD(b4) dst += stride; ADD(b5) dst += stride; ADD(b6) dst += stride; ADD(b7) memset(block, 0, sizeof(*block) * 64); }
/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8 * to preserve proper dst alignment. */ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) { int i; const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder; const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = { (16 - x16) * (16 - y16), /* A */ (x16) * (16 - y16), /* B */ (16 - x16) * (y16), /* C */ (x16) * (y16), /* D */ 0, 0, 0, 0 /* padding */ }; register const vector unsigned char vczero = (const vector unsigned char) vec_splat_u8(0); register const vector unsigned short vcsr8 = (const vector unsigned short) vec_splat_u16(8); register vector unsigned char dstv, dstv2, srcvB, srcvC, srcvD; register vector unsigned short tempB, tempC, tempD; unsigned long dst_odd = (unsigned long) dst & 0x0000000F; unsigned long src_really_odd = (unsigned long) src & 0x0000000F; register vector unsigned short tempA = vec_ld(0, (const unsigned short *) ABCD); register vector unsigned short Av = vec_splat(tempA, 0); register vector unsigned short Bv = vec_splat(tempA, 1); register vector unsigned short Cv = vec_splat(tempA, 2); register vector unsigned short Dv = vec_splat(tempA, 3); register vector unsigned short rounderV = vec_splat((vec_u16) vec_lde(0, &rounder_a), 0); /* we'll be able to pick-up our 9 char elements at src from those * 32 bytes we load the first batch here, as inside the loop we can * reuse 'src + stride' from one iteration as the 'src' of the next. */ register vector unsigned char src_0 = vec_ld(0, src); register vector unsigned char src_1 = vec_ld(16, src); register vector unsigned char srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); if (src_really_odd != 0x0000000F) /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned * on the second vector. */ srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); else srcvB = src_1; srcvA = vec_mergeh(vczero, srcvA); srcvB = vec_mergeh(vczero, srcvB); for (i = 0; i < h; i++) { dst_odd = (unsigned long) dst & 0x0000000F; src_really_odd = (((unsigned long) src) + stride) & 0x0000000F; dstv = vec_ld(0, dst); /* We'll be able to pick-up our 9 char elements at src + stride from * those 32 bytes then reuse the resulting 2 vectors srvcC and srcvD * as the next srcvA and srcvB. */ src_0 = vec_ld(stride + 0, src); src_1 = vec_ld(stride + 16, src); srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); if (src_really_odd != 0x0000000F) /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned * on the second vector. */ srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); else srcvD = src_1; srcvC = vec_mergeh(vczero, srcvC); srcvD = vec_mergeh(vczero, srcvD); /* OK, now we (finally) do the math :-) * Those four instructions replace 32 int muls & 32 int adds. * Isn't AltiVec nice? */ tempA = vec_mladd((vector unsigned short) srcvA, Av, rounderV); tempB = vec_mladd((vector unsigned short) srcvB, Bv, tempA); tempC = vec_mladd((vector unsigned short) srcvC, Cv, tempB); tempD = vec_mladd((vector unsigned short) srcvD, Dv, tempC); srcvA = srcvC; srcvB = srcvD; tempD = vec_sr(tempD, vcsr8); dstv2 = vec_pack(tempD, (vector unsigned short) vczero); if (dst_odd) dstv2 = vec_perm(dstv, dstv2, vcprm(0, 1, s0, s1)); else dstv2 = vec_perm(dstv, dstv2, vcprm(s0, s1, 2, 3)); vec_st(dstv2, 0, dst); dst += stride; src += stride; } }
void pix_background :: processYUVAltivec(imageStruct &image) { register int h,w,i,j,width; int pixsize = image.xsize * image.ysize * image.csize; h = image.ysize; w = image.xsize/8; width = image.xsize/8; //check to see if the buffer isn't 16byte aligned (highly unlikely) if (image.ysize*image.xsize % 16 != 0){ error("image not properly aligned for Altivec - try something SD or HD maybe?"); return; } union{ unsigned short s[8]; vector unsigned short v; }shortBuffer; if(m_savedImage.xsize!=image.xsize || m_savedImage.ysize!=image.ysize || m_savedImage.format!=image.format)m_reset=1; m_savedImage.xsize=image.xsize; m_savedImage.ysize=image.ysize; m_savedImage.setCsizeByFormat(image.format); m_savedImage.reallocate(); if (m_reset){ memcpy(m_savedImage.data,image.data,pixsize); m_reset = 0; } register vector unsigned short UVres1, Yres1, UVres2, Yres2;//interleave; register vector unsigned short hiImage, loImage; register vector unsigned short Yrange, UVrange, Yblank,UVblank,blank; register vector bool short Ymasklo,Ymaskhi, UVmaskhi; register vector unsigned short Yhi,Ylo,UVhi,UVlo; register vector unsigned char one = vec_splat_u8(1); register vector unsigned short sone = vec_splat_u16(1); register vector unsigned int Uhi, Ulo, Vhi, Vlo,Ures,Vres; register vector bool int Umasklo, Umaskhi, Vmaskhi, Vmasklo; vector unsigned char *inData = (vector unsigned char*) image.data; vector unsigned char *rightData = (vector unsigned char*) m_savedImage.data; shortBuffer.s[0] = m_Yrange; Yrange = shortBuffer.v; Yrange = vec_splat(Yrange,0); shortBuffer.s[0] = 128; shortBuffer.s[1] = 0; shortBuffer.s[2] = 128; shortBuffer.s[3] = 0; shortBuffer.s[4] = 128; shortBuffer.s[5] = 0; shortBuffer.s[6] = 128; shortBuffer.s[7] = 0; blank = shortBuffer.v; shortBuffer.s[0] = 0; Yblank = shortBuffer.v; Yblank = vec_splat(Yblank,0); shortBuffer.s[0] = 128; UVblank = shortBuffer.v; UVblank = vec_splat(UVblank,0); shortBuffer.s[0] = m_Urange; shortBuffer.s[1] = m_Vrange; shortBuffer.s[2] = m_Urange; shortBuffer.s[3] = m_Vrange; shortBuffer.s[4] = m_Urange; shortBuffer.s[5] = m_Vrange; shortBuffer.s[6] = m_Urange; shortBuffer.s[7] = m_Vrange; UVrange = shortBuffer.v; //setup the cache prefetch -- A MUST!!! UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 ); #ifndef PPC970 vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); vec_dst( inData+32, prefetchSize, 2 ); vec_dst( rightData+32, prefetchSize, 3 ); #endif //PPC970 for ( i=0; i<h; i++){ for (j=0; j<w; j++) { #ifndef PPC970 //this function is probably memory bound on most G4's -- what else is new? 
vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); vec_dst( inData+32, prefetchSize, 2 ); vec_dst( rightData+32, prefetchSize, 3 ); #endif //separate the U and V from Y UVres1 = (vector unsigned short)vec_mule(one,inData[0]); UVres2 = (vector unsigned short)vec_mule(one,rightData[0]); //vec_mulo Y * 1 to short vector Y Y Y Y shorts Yres1 = (vector unsigned short)vec_mulo(one,inData[0]); Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]); Yhi = vec_adds(Yres2,Yrange); Ylo = vec_subs(Yres2,Yrange); //go to ints for comparison UVhi = vec_adds(UVres2,UVrange); UVlo = vec_subs(UVres2,UVrange); Uhi = vec_mule(sone,UVhi); Ulo = vec_mule(sone,UVlo); Vhi = vec_mulo(sone,UVhi); Vlo = vec_mulo(sone,UVlo); Ures = vec_mule(sone,UVres1); Vres = vec_mulo(sone,UVres1); Umasklo = vec_cmpgt(Ures,Ulo); Umaskhi = vec_cmplt(Ures,Uhi); Vmasklo = vec_cmpgt(Vres,Vlo); Vmaskhi = vec_cmplt(Vres,Vhi); Umaskhi = vec_and(Umaskhi,Umasklo); Vmaskhi = vec_and(Vmaskhi,Vmasklo); Umasklo = vec_and(Umaskhi,Vmaskhi); Vmasklo = vec_and(Umaskhi,Vmaskhi); hiImage = (vector unsigned short)vec_mergeh(Umasklo,Vmasklo); loImage = (vector unsigned short)vec_mergel(Umasklo,Vmasklo); //pack it back down to bool short UVmaskhi = (vector bool short)vec_packsu(hiImage,loImage); Ymasklo = vec_cmpgt(Yres1,Ylo); Ymaskhi = vec_cmplt(Yres1,Yhi); Ymaskhi = vec_and(Ymaskhi,Ymasklo); Ymaskhi = vec_and(Ymaskhi,UVmaskhi); UVmaskhi = vec_and(Ymaskhi,UVmaskhi); //bitwise comparison and move using the result of the comparison as a mask Yres1 = vec_sel(Yres1,Yblank,Ymaskhi); //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi); UVres1 = vec_sel(UVres1,UVblank,UVmaskhi); //merge the Y and UV back together hiImage = vec_mergeh(UVres1,Yres1); loImage = vec_mergel(UVres1,Yres1); //pack it back down to unsigned char to store inData[0] = vec_packsu(hiImage,loImage); inData++; rightData++; } #ifndef PPC970 vec_dss(0); vec_dss(1); vec_dss(2); vec_dss(3); #endif } }
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */,
                  int stride, int h, int x16, int y16, int rounder)
{
    POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
    const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) =
        {rounder, rounder, rounder, rounder,
         rounder, rounder, rounder, rounder};
    const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) =
        {
            (16-x16)*(16-y16), /* A */
            (   x16)*(16-y16), /* B */
            (16-x16)*(   y16), /* C */
            (   x16)*(   y16), /* D */
            0, 0, 0, 0         /* padding */
        };
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcsr8 = (const_vector unsigned short)vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
    register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
    int i;
    unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
    unsigned long src_really_odd = (unsigned long)src & 0x0000000F;

    POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

    tempA = vec_ld(0, (unsigned short*)ABCD);
    Av = vec_splat(tempA, 0);
    Bv = vec_splat(tempA, 1);
    Cv = vec_splat(tempA, 2);
    Dv = vec_splat(tempA, 3);

    rounderV = vec_ld(0, (unsigned short*)rounder_a);

    // we'll be able to pick up our 9 char elements at src from those 32 bytes
    // we load the first batch here, as inside the loop we can re-use 'src+stride'
    // from one iteration as the 'src' of the next.
    src_0 = vec_ld(0, src);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F) {
        // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector.
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    } else {
        srcvB = src_1;
    }
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

    for(i=0; i<h; i++) {
        dst_odd = (unsigned long)dst & 0x0000000F;
        src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;

        dstv = vec_ld(0, dst);

        // we'll be able to pick up our 9 char elements at src + stride from those
        // 32 bytes, then reuse the resulting 2 vectors srcvC and srcvD
        // as the next srcvA and srcvB
        src_0 = vec_ld(stride + 0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F) {
            // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector.
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        } else {
            srcvD = src_1;
        }

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);

        // OK, now we (finally) do the math :-)
        // those four instructions replace 32 int muls & 32 int adds.
        // isn't AltiVec nice?
        tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);

        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        dstv2 = vec_pack(tempD, (vector unsigned short)vczero);

        if (dst_odd) {
            dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
        } else {
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
        }

        vec_st(dstv2, 0, dst);

        dst += stride;
        src += stride;
    }

    POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
}
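/*
 * Both gmc1 implementations above (ff_gmc1_altivec and this older variant)
 * vectorise the same 1/16-pel bilinear blend: the weights A..D are built from
 * x16/y16, and every output byte is a weighted sum of its four source
 * neighbours plus the rounder, scaled back down by >> 8.  A per-pixel scalar
 * sketch (hypothetical helper mirroring the plain C reference):
 */
#include <stdint.h>

static inline uint8_t gmc1_pixel_scalar(const uint8_t *src, int stride,
                                        int x16, int y16, int rounder)
{
    int A = (16 - x16) * (16 - y16);
    int B = x16        * (16 - y16);
    int C = (16 - x16) * y16;
    int D = x16        * y16;

    return (uint8_t)((A * src[0]      + B * src[1] +
                      C * src[stride] + D * src[stride + 1] + rounder) >> 8);
}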
vector unsigned char testuc_3 () { return vec_splat_u8 (15); }
vector unsigned char testuc_2 () { return vec_splat_u8 (-5); }
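/*
 * The two testsuite functions above exercise the literal argument of
 * vec_splat_u8(): like the underlying vspltisb instruction, it takes a 5-bit
 * signed immediate, so only arguments from -16 to 15 are accepted, and
 * negative values simply produce the corresponding wrapped byte pattern.
 * A small sketch of what the two calls return (hypothetical function names):
 */
#include <altivec.h>

vector unsigned char all_0x0f(void)
{
    return vec_splat_u8(15);    /* every byte == 0x0F, the largest literal */
}

vector unsigned char all_0xfb(void)
{
    return vec_splat_u8(-5);    /* every byte == 0xFB (251); -16 is the low end */
}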