void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) { vec_s16 *block = (vec_s16*)blk; vec_u8 tmp; vec_s16 tmp2, tmp3; vec_u8 perm0; vec_u8 perm1; vec_u8 p0, p1, p; IDCT p0 = vec_lvsl (0, dest); p1 = vec_lvsl (stride, dest); p = vec_splat_u8 (-1); perm0 = vec_mergeh (p, p0); perm1 = vec_mergeh (p, p1); #define ADD(dest,src,perm) \ /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ tmp = vec_ld (0, dest); \ tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \ tmp3 = vec_adds (tmp2, src); \ tmp = vec_packsu (tmp3, tmp3); \ vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); ADD (dest, vx0, perm0) dest += stride; ADD (dest, vx1, perm1) dest += stride; ADD (dest, vx2, perm0) dest += stride; ADD (dest, vx3, perm1) dest += stride; ADD (dest, vx4, perm0) dest += stride; ADD (dest, vx5, perm1) dest += stride; ADD (dest, vx6, perm0) dest += stride; ADD (dest, vx7, perm1) }
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix3v, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; uint8_t *pix3 = pix2 + line_size; s = 0; sad = (vector unsigned int)vec_splat_u32(0); /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one iteration becomes pix2 in the next iteration. We can use this fact to avoid a potentially expensive unaligned read, each time around the loop. Read unaligned pixels into our vectors. The vectors are as follows: pix2v: pix2[0]-pix2[15] Split the pixel vectors into shorts */ tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix3v: pix3[0]-pix3[15] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix3[0]; pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix3v); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2v = pix3v; pix3 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
/* next one assumes that ((line_size % 8) == 0) */ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) { register int i; register vector unsigned char pixelsv1, pixelsv2, pixelsavg; register vector unsigned char blockv, temp1, temp2; register vector unsigned short pixelssum1, pixelssum2, temp3; register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { pixelsv2 = temp2; } else { pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); } pixelsv1 = vec_mergeh(vczero, pixelsv1); pixelsv2 = vec_mergeh(vczero, pixelsv2); pixelssum1 = vec_add((vector unsigned short)pixelsv1, (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vcone); for (i = 0; i < h ; i++) { int rightside = ((unsigned long)block & 0x0000000F); blockv = vec_ld(0, block); temp1 = vec_ld(line_size, pixels); temp2 = vec_ld(line_size + 16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { pixelsv2 = temp2; } else { pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); } pixelsv1 = vec_mergeh(vczero, pixelsv1); pixelsv2 = vec_mergeh(vczero, pixelsv2); pixelssum2 = vec_add((vector unsigned short)pixelsv1, (vector unsigned short)pixelsv2); temp3 = vec_add(pixelssum1, pixelssum2); temp3 = vec_sra(temp3, vctwo); pixelssum1 = vec_add(pixelssum2, vcone); pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); if (rightside) { blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); } else { blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 
3)); } vec_st(blockv, 0, block); block += line_size; pixels += line_size; } }
/**
 * Sum of absolute differences for 16 rows, evaluated by 16 expansions of
 * the SAD16BI() macro.
 *
 * SAD16BI() is defined outside this function; it works on the locals declared
 * here (t1, t2, mask1, mask2, sad, sum) and on the parameters, so none of
 * those names may be changed. NOTE(review): from the name and the two
 * reference pointers it presumably compares cur against the average of ref1
 * and ref2 — confirm at the macro definition.
 *
 * @param cur    current block; the DEBUG check expects 16-byte alignment
 * @param ref1   first reference block (realigned through mask1)
 * @param ref2   second reference block (realigned through mask2)
 * @param stride row distance in bytes; the DEBUG check expects a multiple
 *               of 16. It is divided by 16 before the rows are processed.
 * @return the total of the four partial sums held in `sum`
 */
uint32_t sad16bi_altivec_c(vector unsigned char *cur, vector unsigned char *ref1, vector unsigned char *ref2, uint32_t stride)
{
    vector unsigned char t1, t2;
    vector unsigned char mask1, mask2;
    vector unsigned char sad;
    vector unsigned int sum;
    uint32_t result;

#ifdef DEBUG
    /* print alignment errors if this is on */
    if ((unsigned long)cur & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (unsigned long)cur);
    if (stride & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect align, stride: %lu\n", (unsigned long)stride);
#endif

    /* Initialisation stuff */
    stride >>= 4;
    mask1 = vec_lvsl(0, (unsigned char*)ref1);
    mask2 = vec_lvsl(0, (unsigned char*)ref2);
    sad = vec_splat_u8(0);
    sum = (vector unsigned int)sad;

    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();

    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();

    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();

    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();

    /* fold the four partial sums into element 3, then broadcast and store */
    sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0));
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, (uint32_t*)&result);

    return result;
}
/** * Sum of Squared Errors for a 8x8 block. * AltiVec-enhanced. * It's the pix_abs8x8_altivec code above w/ squaring added. */ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sum; vector signed int sumsqr; sum = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Since we want to use unsigned chars, we can take advantage of the fact that abs(a-b)^2 = (a-b)^2. */ /* Calculate abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Square the values and add them to our sum */ sum = vec_msum(t5, t5, sum); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); sumsqr = vec_splat(sumsqr, 3); vec_ste(sumsqr, 0, &s); return s; }
int pix_norm1_altivec(uint8_t *pix, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char *tv; vector unsigned char pixv; vector unsigned int sv; vector signed int sum; sv = (vector unsigned int)vec_splat_u32(0); s = 0; for (i = 0; i < 16; i++) { /* Read in the potentially unaligned pixels */ tv = (vector unsigned char *) pix; pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); /* Square the values, and add them to our sum */ sv = vec_msum(pixv, pixv, sv); pix += line_size; } /* Sum up the four partial sums, and put the result into s */ sum = vec_sums((vector signed int) sv, (vector signed int) zero); sum = vec_splat(sum, 3); vec_ste(sum, 0, &s); return s; }
/**
 * Copy one horizontal span of 32-bit pixels into the framebuffer.
 *
 * The copy runs in three phases: single pixels until the destination is
 * 16-byte aligned, then four pixels per iteration with aligned vector stores
 * (the source is realigned with vec_perm), then single pixels for the rest.
 *
 * The remaining width is only decremented when a pixel was actually copied
 * and is tested with "> 0". Decrementing inside the loop condition could
 * leave the counter at -1 when the span ended before the destination became
 * aligned, and the trailing loop would then run far past both buffers.
 *
 * @param left    first column of the span
 * @param top     row of the span
 * @param right   column one past the end of the span
 * @param pixels  source pixels, right - left entries
 * @param closure the twin_fbdev_t the span is written to
 */
static void _twin_fbdev_vec_put_span (twin_coord_t left, twin_coord_t top, twin_coord_t right, twin_argb32_t *pixels, void *closure)
{
    twin_fbdev_t *tf = closure;
    twin_coord_t width = right - left;
    unsigned int *dest;
    vector unsigned char edgeperm;
    vector unsigned char src0v, src1v, srcv;

    if (!tf->active || tf->fb_base == MAP_FAILED)
        return;

    dest = (unsigned int *)(tf->fb_ptr + top * tf->fb_fix.line_length);
    dest += left;

    /* head: scalar copies until dest reaches a 16-byte boundary */
    while ((((unsigned long)dest) & 0xf) && width > 0) {
        *(dest++) = *(pixels++);
        width--;
    }

    /* body: dest is aligned now, the source may still be unaligned */
    if (width >= 4) {
        edgeperm = vec_lvsl (0, pixels);
        src0v = vec_ld (0, pixels);
        while (width >= 4) {
            src1v = vec_ld (16, pixels);
            srcv = vec_perm (src0v, src1v, edgeperm);
            vec_st ((vector unsigned int)srcv, 0, dest);
            src0v = src1v;
            dest += 4;
            pixels += 4;
            width -= 4;
        }
    }

    /* tail: fewer than four pixels left */
    while (width > 0) {
        *(dest++) = *(pixels++);
        width--;
    }
}
/**
 * Inverse-transform a block of coefficients and add the result onto the
 * 8x8 pixel area at dst, saturating to the unsigned byte range.
 *
 * The transform is expressed through macros defined outside this function
 * (LOAD_ZERO, IDCT_START, IDCT_1D, TRANSPOSE8, NOP, ADD8, SHIFT4): two 1-D
 * passes with a transpose in between. From their use here they leave the
 * eight result rows in b0..b7 and provide zero_u8v.
 *
 * @param dst    top-left destination pixel
 * @param stride byte distance between destination rows
 * @param block  the 64 coefficients
 */
void ff_vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
{
    LOAD_ZERO;
    vec_u8 t;
    vec_u8 vdst;
    vec_s16 vdst_16;
    /*
     * 0xFF interleaved with the lvsl pattern: permuting against zero with it
     * zero-extends eight destination bytes to 16 bit. It is computed once
     * from dst and reused for every row, i.e. all rows are assumed to share
     * dst's offset within a 16-byte line.
     */
    vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst));

    IDCT_START

    IDCT_1D(NOP, NOP)
    TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
    IDCT_1D(ADD8, SHIFT4)

/* One row: load, widen, saturating add, saturating pack, store 2 x 32 bit. */
#define ADD(a)\
    vdst = vec_ld(0, dst);\
    vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\
    vdst_16 = vec_adds(a, vdst_16);\
    t = vec_packsu(vdst_16, vdst_16);\
    vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
    vec_ste((vec_u32)t, 4, (unsigned int *)dst);

    ADD(b0)
    dst += stride;
    ADD(b1)
    dst += stride;
    ADD(b2)
    dst += stride;
    ADD(b3)
    dst += stride;
    ADD(b4)
    dst += stride;
    ADD(b5)
    dst += stride;
    ADD(b6)
    dst += stride;
    ADD(b7)
}
static void float_to_int16_altivec(int16_t *dst, const float *src, long len) { int i; vector signed short d0, d1, d; vector unsigned char align; if(((long)dst) & 15) //FIXME for(i = 0; i < len - 7; i += 8) { d0 = vec_ld(0, dst + i); d = float_to_int16_one_altivec(src + i); d1 = vec_ld(15, dst + i); d1 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); align = vec_lvsr(0, dst + i); d0 = vec_perm(d1, d, align); d1 = vec_perm(d, d1, align); vec_st(d0, 0, dst + i); vec_st(d1, 15, dst + i); } else for(i = 0; i < len - 7; i += 8) { d = float_to_int16_one_altivec(src + i); vec_st(d, 0, dst + i); } }
void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[16] ) { vec_u16_t onev = vec_splat_u16(1); dct[0] += 32; // rounding for the >>6 at the end vec_s16_t s0, s1, s2, s3; s0 = vec_ld( 0x00, dct ); s1 = vec_sld( s0, s0, 8 ); s2 = vec_ld( 0x10, dct ); s3 = vec_sld( s2, s2, 8 ); vec_s16_t d0, d1, d2, d3; IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 ); vec_s16_t tr0, tr1, tr2, tr3; VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 ); vec_s16_t idct0, idct1, idct2, idct3; IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 ); vec_u8_t perm_ldv = vec_lvsl( 0, dst ); vec_u16_t sixv = vec_splat_u16(6); LOAD_ZERO; ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv ); ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv ); ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv ); ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv ); }
/**
 * Copy eight rows of 16-bit values to 8-bit destination pixels by expanding
 * the COPY16TO8() macro once per row.
 *
 * COPY16TO8() is defined outside this function and operates on the locals
 * declared here (s, packed, mask, mask_stencil, load_src_perm) and on the
 * parameters, so those names must stay as they are.
 *
 * @param dst    destination pixels; the DEBUG check expects 8-byte alignment
 * @param src    source rows of 16-bit values
 * @param stride destination stride; the DEBUG check expects a multiple of 8
 */
void transfer_16to8copy_altivec_c(uint8_t *dst, vector signed short *src, uint32_t stride)
{
    register vector signed short s;
    register vector unsigned char packed;
    register vector unsigned char mask_stencil;
    register vector unsigned char mask;
    register vector unsigned char load_src_perm;

#ifdef DEBUG
    /* if this is on, print alignment errors */
    if (((unsigned long) dst) & 0x7)
        fprintf(stderr, "transfer_16to8copy_altivec:incorrect align, dst %lx\n", (long)dst);
    if (stride & 0x7)
        fprintf(stderr, "transfer_16to8copy_altivec:incorrect align, stride %u\n", stride);
#endif

    /* Initialisation stuff */
    load_src_perm = vec_lvsl(0, (unsigned char*)src);
    /* bytes 0-7 are 0x00, bytes 8-15 are 0xFF */
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
    COPY16TO8();

    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
}
static int pix_norm1_altivec(uint8_t *pix, int line_size) { int i, s = 0; const vector unsigned int zero = (const vector unsigned int) vec_splat_u32(0); vector unsigned char perm = vec_lvsl(0, pix); vector unsigned int sv = (vector unsigned int) vec_splat_u32(0); vector signed int sum; for (i = 0; i < 16; i++) { /* Read the potentially unaligned pixels. */ vector unsigned char pixl = vec_ld(0, pix); vector unsigned char pixr = vec_ld(15, pix); vector unsigned char pixv = vec_perm(pixl, pixr, perm); /* Square the values, and add them to our sum. */ sv = vec_msum(pixv, pixv, sv); pix += line_size; } /* Sum up the four partial sums, and put the result into s. */ sum = vec_sums((vector signed int) sv, (vector signed int) zero); sum = vec_splat(sum, 3); vec_ste(sum, 0, &s); return s; }
void float_to_int16_altivec(int16_t *dst, const float *src, int len) { int i; vector float s0, s1; vector signed int t0, t1; vector signed short d0, d1, d; vector unsigned char align; if(((long)dst)&15) //FIXME for(i=0; i<len-7; i+=8) { s0 = vec_ld(0, src+i); s1 = vec_ld(16, src+i); t0 = vec_cts(s0, 0); d0 = vec_ld(0, dst+i); t1 = vec_cts(s1, 0); d1 = vec_ld(15, dst+i); d = vec_packs(t0,t1); d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); align = vec_lvsr(0, dst+i); d0 = vec_perm(d1, d, align); d1 = vec_perm(d, d1, align); vec_st(d0, 0, dst+i); vec_st(d1,15, dst+i); } else for(i=0; i<len-7; i+=8) { s0 = vec_ld(0, src+i); s1 = vec_ld(16, src+i); t0 = vec_cts(s0, 0); t1 = vec_cts(s1, 0); d = vec_packs(t0,t1); vec_st(d, 0, dst+i); } }
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul) { LOAD_ZERO; vec_s16 *pv1 = (vec_s16*)v1; vec_s16 *pv2 = (vec_s16*)v2; vec_s16 *pv3 = (vec_s16*)v3; register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul}; register vec_s16 t0, t1, i0, i1; register vec_s16 i2 = pv2[0], i3 = pv3[0]; register vec_s32 res = zero_s32v; register vec_u8 align = vec_lvsl(0, v2); int32_t ires; order >>= 4; do { t0 = vec_perm(i2, pv2[1], align); i2 = pv2[2]; t1 = vec_perm(pv2[1], i2, align); i0 = pv1[0]; i1 = pv1[1]; res = vec_msum(t0, i0, res); res = vec_msum(t1, i1, res); t0 = vec_perm(i3, pv3[1], align); i3 = pv3[2]; t1 = vec_perm(pv3[1], i3, align); pv1[0] = vec_mladd(t0, muls, i0); pv1[1] = vec_mladd(t1, muls, i1); pv1 += 2; pv2 += 2; pv3 += 2; } while(--order); res = vec_splat(vec_sums(res, zero_s32v), 3); vec_ste(res, 0, &ires); return ires; }
static int pix_sum_altivec(uint8_t * pix, int line_size) { const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm = vec_lvsl(0, pix); vector unsigned char t1; vector unsigned int sad; vector signed int sumdiffs; int i; int s; sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < 16; i++) { /* Read the potentially unaligned 16 pixels into t1 */ vector unsigned char pixl = vec_ld( 0, pix); vector unsigned char pixr = vec_ld(15, pix); t1 = vec_perm(pixl, pixr, perm); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t1, sad); pix += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
static int32_t scalarproduct_int16_altivec(const int16_t * v1, const int16_t * v2, int order, const int shift) { int i; LOAD_ZERO; register vec_s16 vec1, *pv; register vec_s32 res = vec_splat_s32(0), t; register vec_u32 shifts; int32_t ires; shifts = zero_u32v; if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1))); if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08)); if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04)); if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02)); if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01)); for(i = 0; i < order; i += 8){ pv = (vec_s16*)v1; vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1)); t = vec_msum(vec1, vec_ld(0, v2), zero_s32v); t = vec_sr(t, shifts); res = vec_sums(t, res); v1 += 8; v2 += 8; } res = vec_splat(res, 3); vec_ste(res, 0, &ires); return ires; }
/* next one assumes that ((line_size % 8) == 0) */ static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) { register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; int i; for (i = 0; i < h; i++) { /* block is 8 bytes-aligned, so we're either in the left block (16 bytes-aligned) or in the right block (not) */ int rightside = ((unsigned long)block & 0x0000000F); blockv = vec_ld(0, block); pixelsv1 = vec_ld( 0, pixels); pixelsv2 = vec_ld(16, pixels); pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); if (rightside) { pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); } else { pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); } blockv = vec_avg(blockv, pixelsv); vec_st(blockv, 0, block); pixels += line_size; block += line_size; } }
static av_always_inline void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int w, int is6tap) { LOAD_H_SUBPEL_FILTER(mx-1); vec_u8 align_vec0, align_vec8, permh0, permh8, filt; vec_u8 perm_6tap0, perm_6tap8, perml0, perml8; vec_u8 a, b, pixh, pixl, outer; vec_s16 f16h, f16l; vec_s32 filth, filtl; vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 }; vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 }; vec_u8 perm_inner = is6tap ? perm_inner6 : perm_inner4; vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 }; vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6)); vec_u16 c7 = vec_splat_u16(7); align_vec0 = vec_lvsl( -is6tap-1, src); align_vec8 = vec_lvsl(8-is6tap-1, src); permh0 = vec_perm(align_vec0, align_vec0, perm_inner); permh8 = vec_perm(align_vec8, align_vec8, perm_inner); perm_inner = vec_add(perm_inner, vec_splat_u8(4)); perml0 = vec_perm(align_vec0, align_vec0, perm_inner); perml8 = vec_perm(align_vec8, align_vec8, perm_inner); perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer); perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer); while (h --> 0) { FILTER_H(f16h, 0); if (w == 16) { FILTER_H(f16l, 8); filt = vec_packsu(f16h, f16l); vec_st(filt, 0, dst); } else { filt = vec_packsu(f16h, f16h); vec_ste((vec_u32)filt, 0, (uint32_t*)dst); if (w == 8) vec_ste((vec_u32)filt, 4, (uint32_t*)dst); } src += src_stride; dst += dst_stride; } }
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); tv = (vector unsigned char *) &pix2[1]; pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix2iv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
static void float_to_int16_interleave_altivec(int16_t *dst, const float **src, long len, int channels) { int i; vector signed short d0, d1, d2, c0, c1, t0, t1; vector unsigned char align; if(channels == 1) float_to_int16_altivec(dst, src[0], len); else if (channels == 2) { if(((long)dst) & 15) for(i = 0; i < len - 7; i += 8) { d0 = vec_ld(0, dst + i); t0 = float_to_int16_one_altivec(src[0] + i); d1 = vec_ld(31, dst + i); t1 = float_to_int16_one_altivec(src[1] + i); c0 = vec_mergeh(t0, t1); c1 = vec_mergel(t0, t1); d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); align = vec_lvsr(0, dst + i); d0 = vec_perm(d2, c0, align); d1 = vec_perm(c0, c1, align); vec_st(d0, 0, dst + i); d0 = vec_perm(c1, d2, align); vec_st(d1, 15, dst + i); vec_st(d0, 31, dst + i); dst += 8; } else for(i = 0; i < len - 7; i += 8) { t0 = float_to_int16_one_altivec(src[0] + i); t1 = float_to_int16_one_altivec(src[1] + i); d0 = vec_mergeh(t0, t1); d1 = vec_mergel(t0, t1); vec_st(d0, 0, dst + i); vec_st(d1, 16, dst + i); dst += 8; } } else { DECLARE_ALIGNED(16, int16_t, tmp)[len]; int c, j; for (c = 0; c < channels; c++) { float_to_int16_altivec(tmp, src[c], len); for (i = 0, j = c; i < len; i++, j += channels) { dst[j] = tmp[i]; } } } }
/**
 * Sum of absolute differences for 16 rows, evaluated by 16 expansions of
 * the SAD16() macro.
 *
 * SAD16() is defined outside this function; it works on the locals declared
 * here (perm, t1, t2, sad, sumdiffs, best_vec) and on the parameters.
 * NOTE(review): the `bail` label and best_vec suggest the macro jumps out
 * early once the running sum exceeds best_sad — confirm at the macro
 * definition.
 *
 * @param cur      current block; the DEBUG check expects 16-byte alignment
 * @param ref      reference block (realigned through perm)
 * @param stride   row distance in bytes; the DEBUG check expects a multiple
 *                 of 16. It is divided by 16 before the rows are processed.
 * @param best_sad value broadcast into best_vec for use by SAD16()
 * @return element 3 of sumdiffs as left behind by SAD16()
 */
uint32_t sad16_altivec_c(vector unsigned char *cur, vector unsigned char *ref, uint32_t stride, const uint32_t best_sad)
{
    vector unsigned char perm;
    vector unsigned char t1, t2;
    vector unsigned int sad;
    vector unsigned int sumdiffs;
    vector unsigned int best_vec;
    uint32_t result;

#ifdef DEBUG
    /* print alignment errors if DEBUG is on */
    if (((unsigned long) cur) & 0xf)
        fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (unsigned long)cur);
    if (stride & 0xf)
        fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", (unsigned long)stride);
#endif

    /* initialization */
    sad = vec_splat_u32(0);
    sumdiffs = sad;
    stride >>= 4;
    perm = vec_lvsl(0, (unsigned char *) ref);
    /* write the scalar into element 0, then broadcast it */
    *((uint32_t*)&best_vec) = best_sad;
    best_vec = vec_splat(best_vec, 0);

    /* perform sum of differences between current and previous */
    SAD16();
    SAD16();
    SAD16();
    SAD16();

    SAD16();
    SAD16();
    SAD16();
    SAD16();

    SAD16();
    SAD16();
    SAD16();
    SAD16();

    SAD16();
    SAD16();
    SAD16();
    SAD16();

bail:
    /* copy vector sum into unaligned result */
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, (uint32_t*) &result);
    return result;
}
static unsigned reg_sad_altivec(const kvz_pixel * const data1, const kvz_pixel * const data2, const int width, const int height, const unsigned stride1, const unsigned stride2) { vector unsigned int vsad = {0,0,0,0}, vzero = {0,0,0,0}; vector signed int sumdiffs; int tmpsad, sad = 0; int y, x; for (y = 0; y < height; ++y) { vector unsigned char perm1, perm2; perm1 = vec_lvsl(0, &data1[y * stride1]); perm2 = vec_lvsl(0, &data2[y * stride2]); for (x = 0; x <= width-16; x+=16) { vector unsigned char t1, t2, t3, t4, t5; vector unsigned char *current, *previous; current = (vector unsigned char *) &data1[y * stride1 + x]; previous = (vector unsigned char *) &data2[y * stride2 + x]; t1 = vec_perm(current[0], current[1], perm1 ); /* align current vector */ t2 = vec_perm(previous[0], previous[1], perm2 );/* align previous vector */ t3 = vec_max(t1, t2 ); /* find largest of two */ t4 = vec_min(t1, t2 ); /* find smaller of two */ t5 = vec_sub(t3, t4); /* find absolute difference */ vsad = vec_sum4s(t5, vsad); /* accumulate sum of differences */ } for (; x < width; ++x) { sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]); } } sumdiffs = vec_sums((vector signed int) vsad, (vector signed int) vzero); /* copy vector sum into unaligned result */ sumdiffs = vec_splat( sumdiffs, 3); vec_ste( sumdiffs, 0, &tmpsad ); sad += tmpsad; return sad; }
/**
 * For h rows of 16 pixels: dst = vec_avg(dst, vec_avg(src1, src2)).
 *
 * src1 rows are src_stride1 bytes apart; src2 rows are always 16 bytes
 * apart, which is why its realignment pattern can be computed once.
 * dst may be unaligned: both aligned lines it straddles are read, the bytes
 * outside the 16 output bytes are preserved, and both lines are written back.
 *
 * @param dst         destination, also the second operand of the outer average
 * @param src1        first source
 * @param src2        second source, stored with a row pitch of 16 bytes
 * @param dst_stride  byte distance between destination rows
 * @param src_stride1 byte distance between rows of src1
 * @param h           number of rows
 */
static inline void avg_pixels16_l2_altivec( uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int h)
{
    int row;
    vec_u8 s1v, s2v, outv, lo, hi, perm, perm_src2, keep, rot;

    perm_src2 = vec_lvsl(0, src2);

    for (row = 0; row < h; row++) {
        const uint8_t *p1 = src1 + row * src_stride1;
        const uint8_t *p2 = src2 + row * 16;

        /* row of src1 */
        lo   = vec_ld(0, p1);
        perm = vec_lvsl(0, p1);
        hi   = vec_ld(15, p1);
        s1v  = vec_perm(lo, hi, perm);

        /* row of src2 */
        lo  = vec_ld(0, p2);
        hi  = vec_ld(15, p2);
        s2v = vec_perm(lo, hi, perm_src2);

        /* current destination row and the result */
        lo   = vec_ld(0, dst);
        perm = vec_lvsl(0, dst);
        hi   = vec_ld(15, dst);
        outv = vec_avg(vec_perm(lo, hi, perm), vec_avg(s1v, s2v));

        /* bytes around the output that have to survive the two stores */
        keep = vec_perm(hi, lo, perm);

        rot = vec_lvsr(0, dst);
        hi  = vec_perm(outv, keep, rot);
        lo  = vec_perm(keep, outv, rot);

        /* upper line first: for an aligned dst both stores hit the same
           line and the second one must be the one carrying the result */
        vec_st(hi, 15, dst);
        vec_st(lo, 0 , dst);

        dst += dst_stride;
    }
}
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x) { register int i, j; DECLARE_ALIGNED(16, int, val)[16]; vector signed int vo1, vo2, vo3, vo4; vector unsigned short vs1, vs2; vector unsigned char vf; vector unsigned int altivec_vectorShiftInt19 = vec_add(vec_splat_u32(10), vec_splat_u32(9)); for (i = 0; i < 16; i++) val[i] = dither[(x + i + offset) & 7] << 12; vo1 = vec_ld(0, val); vo2 = vec_ld(16, val); vo3 = vec_ld(32, val); vo4 = vec_ld(48, val); for (j = 0; j < filterSize; j++) { vector signed short l1, vLumFilter = vec_ld(j << 1, filter); vector unsigned char perm, perm0 = vec_lvsl(j << 1, filter); vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0); vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter perm = vec_lvsl(x << 1, src[j]); l1 = vec_ld(x << 1, src[j]); yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter); yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter); } vo1 = vec_sra(vo1, altivec_vectorShiftInt19); vo2 = vec_sra(vo2, altivec_vectorShiftInt19); vo3 = vec_sra(vo3, altivec_vectorShiftInt19); vo4 = vec_sra(vo4, altivec_vectorShiftInt19); vs1 = vec_packsu(vo1, vo2); vs2 = vec_packsu(vo3, vo4); vf = vec_packsu(vs1, vs2); vec_st(vf, 0, dest); }
void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) { POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); vec_s16 *block = (vec_s16*)blk; vec_u8 tmp; vec_s16 tmp2, tmp3; vec_u8 perm0; vec_u8 perm1; vec_u8 p0, p1, p; #if CONFIG_POWERPC_PERF POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); #endif IDCT p0 = vec_lvsl (0, dest); p1 = vec_lvsl (stride, dest); p = vec_splat_u8 (-1); perm0 = vec_mergeh (p, p0); perm1 = vec_mergeh (p, p1); #define ADD(dest,src,perm) \ /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ tmp = vec_ld (0, dest); \ tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \ tmp3 = vec_adds (tmp2, src); \ tmp = vec_packsu (tmp3, tmp3); \ vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); ADD (dest, vx0, perm0) dest += stride; ADD (dest, vx1, perm1) dest += stride; ADD (dest, vx2, perm0) dest += stride; ADD (dest, vx3, perm1) dest += stride; ADD (dest, vx4, perm0) dest += stride; ADD (dest, vx5, perm1) dest += stride; ADD (dest, vx6, perm0) dest += stride; ADD (dest, vx7, perm1) POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); }
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read potentially unaligned pixels into t1 and t2 */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_perm(pix1v[0], pix1v[1], perm1); t2 = vec_perm(pix2v[0], pix2v[1], perm2); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
static void test() { vector unsigned char expected = {4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19}; check (vec_all_eq(vec_lvsl(0, &sc[4]), expected), "sc"); check (vec_all_eq(vec_lvsl(0, &uc[4]), expected), "uc"); check (vec_all_eq(vec_lvsl(0, &ss[2]), expected), "ss"); check (vec_all_eq(vec_lvsl(0, &us[2]), expected), "us"); check (vec_all_eq(vec_lvsl(0, &si[1]), expected), "si"); check (vec_all_eq(vec_lvsl(0, &ui[1]), expected), "ui"); check (vec_all_eq(vec_lvsl(0, & f[1]), expected), "f"); }
/* Load a vector from an unaligned location in memory */ static inline vector unsigned char LoadUnaligned(const guchar *v) { if ((long)v & 0x0f) { vector unsigned char permuteVector = vec_lvsl(0, v); vector unsigned char low = vec_ld(0, v); vector unsigned char high = vec_ld(16, v); return vec_perm(low, high, permuteVector); } else return vec_ld(0, v); /* don't want overflow */ }
/**
 * v1[i] -= v2[i] for `order` elements, eight elements per iteration.
 *
 * v1 is read and written with aligned vector accesses. v2 may be unaligned:
 * it is read with vec_ld(0, v2) / vec_ld(15, v2) and realigned with
 * vec_perm, so the second load only touches the following 16-byte line when
 * the eight elements really extend into it.
 *
 * @param v1    minuend and destination (accessed with aligned loads/stores)
 * @param v2    subtrahend, any alignment
 * @param order number of elements; processed in steps of eight
 */
static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
    int i;
    register vec_s16_t vec;

    for(i = 0; i < order; i += 8){
        vec = vec_perm(vec_ld(0, v2), vec_ld(15, v2), vec_lvsl(0, v2));
        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}