static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    const vector float *win1a = (const vector float *) win1;
    const vector float *win2a = (const vector float *) win2;
    const vector float *bufa  = (const vector float *) buf;
    vector float *sum1a = (vector float *) sum1;
    vector float *sum2a = (vector float *) sum2;
    vector float av_uninit(v0), av_uninit(v4);
    vector float v1, v2, v3;

    len = len >> 2;

#define MULT(a, b)                 \
    {                              \
        v1 = vec_ld(a, win1a);     \
        v2 = vec_ld(b, win2a);     \
        v3 = vec_ld(a, bufa);      \
        v0 = vec_madd(v3, v1, v0); \
        v4 = vec_madd(v2, v3, v4); \
    }

    while (len--) {
        v0 = vec_xor(v0, v0);
        v4 = vec_xor(v4, v4);

        MULT(   0,   0);
        MULT( 256,  64);
        MULT( 512, 128);
        MULT( 768, 192);
        MULT(1024, 256);
        MULT(1280, 320);
        MULT(1536, 384);
        MULT(1792, 448);

        vec_st(v0, 0, sum1a);
        vec_st(v4, 0, sum2a);
        sum1a++;
        sum2a++;
        win1a++;
        win2a++;
        bufa++;
    }
}
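For reference, a scalar sketch of what each loop iteration above accumulates. The helper and its name are illustrative, not from the original source; it only restates the tap layout implied by the MULT byte offsets (win1/buf taps 64 floats apart, win2 taps 16 floats apart).

/* Scalar sketch; assumes the same data layout as the AltiVec loop above. */
static void apply_window_ref(const float *buf, const float *win1,
                             const float *win2, float *sum1, float *sum2,
                             int len)
{
    for (int i = 0; i < len; i++) {
        float s1 = 0.0f, s2 = 0.0f;
        for (int j = 0; j < 8; j++) {
            s1 += buf[i + 64*j] * win1[i + 64*j]; /* MULT offsets 0..1792 bytes */
            s2 += buf[i + 64*j] * win2[i + 16*j]; /* MULT offsets 0..448 bytes  */
        }
        sum1[i] = s1;
        sum2[i] = s2;
    }
}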
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block,
                                                       int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        dc16 = vec_sld(dc16, zero_s16v, 8);
    dcplus  = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
    dcplus  = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst + 0*stride);
        v1 = vec_ld(0, dst + 1*stride);
        v2 = vec_ld(0, dst + 2*stride);
        v3 = vec_ld(0, dst + 3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst + 0*stride);
        vec_st(v1, 0, dst + 1*stride);
        vec_st(v2, 0, dst + 2*stride);
        vec_st(v3, 0, dst + 3*stride);

        dst += 4*stride;
    }
}
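The dcplus/dcminus pair exists because the pixels are unsigned bytes: a signed DC offset is applied as a saturating add of max(dc, 0) followed by a saturating subtract of max(-dc, 0). A per-pixel scalar sketch of that trick (illustrative helper; it ignores that vec_packsu additionally clamps |dc| to 255):

/* Scalar sketch of the dcplus/dcminus saturation trick above. */
static inline uint8_t add_dc_ref(uint8_t pix, int dc)
{
    int plus  = dc > 0 ?  dc : 0;   /* dcplus  = vec_packsu(dc16, ...)  */
    int minus = dc < 0 ? -dc : 0;   /* dcminus = vec_packsu(-dc16, ...) */
    int v = pix + plus;             /* vec_adds: saturates at 255       */
    if (v > 255) v = 255;
    v -= minus;                     /* vec_subs: saturates at 0         */
    if (v < 0) v = 0;
    return (uint8_t)v;
}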
static void clear_block_altivec(int16_t *block)
{
    LOAD_ZERO;
    vec_st(zero_s16v,   0, block);
    vec_st(zero_s16v,  16, block);
    vec_st(zero_s16v,  32, block);
    vec_st(zero_s16v,  48, block);
    vec_st(zero_s16v,  64, block);
    vec_st(zero_s16v,  80, block);
    vec_st(zero_s16v,  96, block);
    vec_st(zero_s16v, 112, block);
}
static inline void avg_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1,
                                           const uint8_t *src2, int dst_stride,
                                           int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {
        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);

        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);

        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));

        edges = vec_perm(tmp2, tmp1, mask);
        align = vec_lvsr(0, dst);
        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1,  0, dst);

        dst += dst_stride;
    }
}
static void predict_16x16_h_altivec( uint8_t *src )
{
    for( int i = 0; i < 16; i++ )
    {
        vec_u8_t v = vec_ld(-1, src);
        vec_u8_t v_v = vec_splat(v, 15);
        vec_st(v_v, 0, src);

        src += FDEC_STRIDE;
    }
}
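The vec_ld(-1, src) / vec_splat(v, 15) pair picks up the pixel just left of the row: vec_ld truncates its address to a 16-byte boundary, so with a 16-byte-aligned src, byte 15 of that block is src[-1]. A scalar equivalent for illustration (assumes the same alignment):

#include <string.h>

/* Scalar sketch: fill each row of the 16x16 block with its left neighbour. */
static void predict_16x16_h_ref(uint8_t *src)
{
    for (int i = 0; i < 16; i++) {
        memset(src, src[-1], 16);
        src += FDEC_STRIDE;
    }
}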
void jsimd_h2v1_downsample_altivec (JDIMENSION image_width,
                                    int max_v_samp_factor,
                                    JDIMENSION v_samp_factor,
                                    JDIMENSION width_blocks,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY output_data)
{
  int outrow, outcol;
  JDIMENSION output_cols = width_blocks * DCTSIZE;
  JSAMPROW inptr, outptr;

  __vector unsigned char this0, next0, out;
  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;

  /* Constants */
  __vector unsigned short pw_bias = { __4X2(0, 1) },
    pw_one = { __8X(1) };
  __vector unsigned char even_odd_index =
    {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
    pb_zero = { __16X(0) };

  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    outptr = output_data[outrow];
    inptr = input_data[outrow];

    for (outcol = output_cols; outcol > 0;
         outcol -= 16, inptr += 32, outptr += 16) {

      this0 = vec_ld(0, inptr);
      this0 = vec_perm(this0, this0, even_odd_index);
      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
      outl = vec_add(this0e, this0o);
      outl = vec_add(outl, pw_bias);
      outl = vec_sr(outl, pw_one);

      if (outcol > 8) {
        next0 = vec_ld(16, inptr);
        next0 = vec_perm(next0, next0, even_odd_index);
        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
        outh = vec_add(next0e, next0o);
        outh = vec_add(outh, pw_bias);
        outh = vec_sr(outh, pw_one);
      } else
        outh = vec_splat_u16(0);

      out = vec_pack(outl, outh);
      vec_st(out, 0, outptr);
    }
  }
}
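A scalar sketch of the same h2v1 downsample, showing the alternating-bias rounding that pw_bias = {0,1,0,1,...} implements (helper name is illustrative):

/* Scalar sketch: average each horizontal pixel pair, bias 0,1,0,1,...
 * so rounding does not drift in one direction. */
static void h2v1_downsample_ref(const JSAMPLE *inptr, JSAMPLE *outptr,
                                int output_cols)
{
    for (int j = 0; j < output_cols; j++)
        outptr[j] = (inptr[2*j] + inptr[2*j + 1] + (j & 1)) >> 1;
}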
void x264_sub4x4_dct_altivec( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
{
    PREP_DIFF_8BYTEALIGNED;
    vec_s16_t dct0v, dct1v, dct2v, dct3v;
    vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v;

    vec_u8_t permHighv;

    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v );

    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
    VEC_TRANSPOSE_4( tmp0v, tmp1v, tmp2v, tmp3v, dct0v, dct1v, dct2v, dct3v );
    permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                              0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
    VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );

    vec_st(vec_perm(tmp0v, tmp1v, permHighv),  0, dct);
    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct);
}
/* next one assumes that ((line_size % 16) == 0) */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1,  pixelsv2,  perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
}
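The vec_lvsl/vec_perm pair above is the standard AltiVec misaligned-load idiom: two aligned loads bracket the address and a permute shifts the wanted 16 bytes into place. A minimal sketch of the idiom on its own (helper name is illustrative):

static inline vector unsigned char load16_unaligned(const uint8_t *p)
{
    vector unsigned char lo   = vec_ld( 0, p);  /* aligned block holding p       */
    vector unsigned char hi   = vec_ld(15, p);  /* next block, if p straddles    */
    vector unsigned char perm = vec_lvsl(0, p); /* {p&15, p&15 + 1, ...}         */
    return vec_perm(lo, hi, perm);              /* bytes p[0..15], any alignment */
}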
void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c,
                         const int N)
{
#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
  vector4double z0, z1, z2, z3, z4, z5, k;
  float *s, *r;
  float ALIGN32 _c;
  _c = c;

  __prefetch_by_load(S);
  __prefetch_by_load(R);

  k = vec_splats((double)_c);
  __alignx(16, s);
  __alignx(16, r);
  __alignx(16, S);
  __alignx(16, R);

#ifdef TM_USE_OMP
#pragma omp for
#else
#pragma unroll(2)
#endif
  for(int i = 0; i < N; i++) {
    s = (float*)((spinor32 *) S + i);
    r = (float*)((spinor32 *) R + i);
    __prefetch_by_load(S + i + 1);
    __prefetch_by_stream(1, R + i + 1);
    x0 = vec_ld(0, r);
    x1 = vec_ld(0, r+4);
    x2 = vec_ld(0, r+8);
    x3 = vec_ld(0, r+12);
    x4 = vec_ld(0, r+16);
    x5 = vec_ld(0, r+20);
    y0 = vec_ld(0, s);
    y1 = vec_ld(0, s+4);
    y2 = vec_ld(0, s+8);
    y3 = vec_ld(0, s+12);
    y4 = vec_ld(0, s+16);
    y5 = vec_ld(0, s+20);

    z0 = vec_madd(k, y0, x0);
    z1 = vec_madd(k, y1, x1);
    z2 = vec_madd(k, y2, x2);
    z3 = vec_madd(k, y3, x3);
    z4 = vec_madd(k, y4, x4);
    z5 = vec_madd(k, y5, x5);

    vec_st(z0, 0, r);
    vec_st(z1, 0, r+4);
    vec_st(z2, 0, r+8);
    vec_st(z3, 0, r+12);
    vec_st(z4, 0, r+16);
    vec_st(z5, 0, r+20);
  }

#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
  return;
}
static void sub_int16_altivec(int16_t *v1, int16_t *v2, int order)
{
    int i;
    register vec_s16_t vec, *pv;

    for (i = 0; i < order; i += 8) {
        pv  = (vec_s16_t*)v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}
void jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
                             DCTELEM * workspace)
{
  JSAMPROW elemptr;

  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  __vector unsigned char pb_zero = { __16X(0) };

  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

  out0 = (__vector short)VEC_UNPACKHU(in0);
  out1 = (__vector short)VEC_UNPACKHU(in1);
  out2 = (__vector short)VEC_UNPACKHU(in2);
  out3 = (__vector short)VEC_UNPACKHU(in3);
  out4 = (__vector short)VEC_UNPACKHU(in4);
  out5 = (__vector short)VEC_UNPACKHU(in5);
  out6 = (__vector short)VEC_UNPACKHU(in6);
  out7 = (__vector short)VEC_UNPACKHU(in7);

  out0 = vec_sub(out0, pw_centerjsamp);
  out1 = vec_sub(out1, pw_centerjsamp);
  out2 = vec_sub(out2, pw_centerjsamp);
  out3 = vec_sub(out3, pw_centerjsamp);
  out4 = vec_sub(out4, pw_centerjsamp);
  out5 = vec_sub(out5, pw_centerjsamp);
  out6 = vec_sub(out6, pw_centerjsamp);
  out7 = vec_sub(out7, pw_centerjsamp);

  vec_st(out0,   0, workspace);
  vec_st(out1,  16, workspace);
  vec_st(out2,  32, workspace);
  vec_st(out3,  48, workspace);
  vec_st(out4,  64, workspace);
  vec_st(out5,  80, workspace);
  vec_st(out6,  96, workspace);
  vec_st(out7, 112, workspace);
}
static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride,
                                     uint8_t *src, ptrdiff_t sstride,
                                     int h, int mx, int my)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;
    register vector unsigned char perm = vec_lvsl(0, src);
    int i;
    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;

    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, src);
        pixelsv2  = vec_ld(15, src);
        pixelsv1B = vec_ld(sstride, src);
        pixelsv2B = vec_ld(15 + sstride, src);
        pixelsv1C = vec_ld(sstride2, src);
        pixelsv2C = vec_ld(15 + sstride2, src);
        pixelsv1D = vec_ld(sstride3, src);
        pixelsv2D = vec_ld(15 + sstride3, src);
        vec_st(vec_perm(pixelsv1,  pixelsv2,  perm),
               0, (unsigned char*)dst);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               dstride, (unsigned char*)dst);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               dstride2, (unsigned char*)dst);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               dstride3, (unsigned char*)dst);
        src += sstride4;
        dst += dstride4;
    }
}
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src,
                                               float mul, int len)
{
    union {
        vector float v;
        float s[4];
    } mul_u;
    int i;
    vector float src1, src2, dst1, dst2, mul_v, zero;

    zero = (vector float)vec_splat_u32(0);
    mul_u.s[0] = mul;
    mul_v = vec_splat(mul_u.v, 0);

    for (i = 0; i < len; i += 8) {
        src1 = vec_ctf(vec_ld(0,  src+i), 0);
        src2 = vec_ctf(vec_ld(16, src+i), 0);
        dst1 = vec_madd(src1, mul_v, zero);
        dst2 = vec_madd(src2, mul_v, zero);
        vec_st(dst1,  0, dst+i);
        vec_st(dst2, 16, dst+i);
    }
}
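The scalar equivalent is a single multiply per element; the AltiVec loop additionally assumes len is a multiple of 8 and that src/dst are 16-byte aligned. For reference (helper name is illustrative):

/* Scalar reference for the conversion above. */
static void int32_to_float_fmul_scalar_ref(float *dst, const int *src,
                                           float mul, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}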
static void predict_16x16_v_altivec( uint8_t *src )
{
    vec_u32_u v;
    v.s[0] = *(uint32_t*)&src[ 0-FDEC_STRIDE];
    v.s[1] = *(uint32_t*)&src[ 4-FDEC_STRIDE];
    v.s[2] = *(uint32_t*)&src[ 8-FDEC_STRIDE];
    v.s[3] = *(uint32_t*)&src[12-FDEC_STRIDE];

    for( int i = 0; i < 16; i++ )
    {
        vec_st(v.v, 0, (uint32_t*)src);
        src += FDEC_STRIDE;
    }
}
static void audio_convert_s16_to_float_altivec(float *out, const int16_t *in,
                                               size_t samples)
{
    // Unaligned loads/stores are a bit expensive, so we optimize for the
    // aligned path (very likely).
    if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0)
    {
        size_t i;
        for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
        {
            vector signed short input = vec_ld(0, in);
            vector signed int hi = vec_unpackh(input);
            vector signed int lo = vec_unpackl(input);
            vector float out_hi = vec_ctf(hi, 15);
            vector float out_lo = vec_ctf(lo, 15);

            vec_st(out_hi,  0, out);
            vec_st(out_lo, 16, out);
        }

        audio_convert_s16_to_float_C(out, in, samples - i);
    }
    else
        audio_convert_s16_to_float_C(out, in, samples);
}
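vec_ctf(x, 15) converts to float and scales by 2^-15, so the loop maps full-scale int16 to roughly [-1.0, 1.0). A scalar reference (helper name is illustrative):

static void audio_convert_s16_to_float_ref(float *out, const int16_t *in,
                                           size_t samples)
{
    for (size_t i = 0; i < samples; i++)
        out[i] = (float)in[i] / 32768.0f;  /* same scaling as vec_ctf(x, 15) */
}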
static void predict_16x16_p_altivec( uint8_t *src )
{
    int16_t a, b, c, i;
    int H = 0;
    int V = 0;
    int16_t i00;

    for( i = 1; i <= 8; i++ )
    {
        H += i * ( src[7+i - FDEC_STRIDE ]   - src[7-i - FDEC_STRIDE ] );
        V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );
    }

    a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
    b = ( 5 * H + 32 ) >> 6;
    c = ( 5 * V + 32 ) >> 6;
    i00 = a - b * 7 - c * 7 + 16;

    vect_sshort_u i00_u, b_u, c_u;
    i00_u.s[0] = i00;
    b_u.s[0]   = b;
    c_u.s[0]   = c;

    vec_u16_t val5_v = vec_splat_u16(5);
    vec_s16_t i00_v, b_v, c_v;
    i00_v = vec_splat(i00_u.v, 0);
    b_v   = vec_splat(b_u.v, 0);
    c_v   = vec_splat(c_u.v, 0);

    vec_s16_t induc_v = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
    vec_s16_t b8_v = vec_sl(b_v, vec_splat_u16(3));

    vec_s32_t mule_b_v = vec_mule(induc_v, b_v);
    vec_s32_t mulo_b_v = vec_mulo(induc_v, b_v);
    vec_s16_t mul_b_induc0_v = vec_pack(vec_mergeh(mule_b_v, mulo_b_v),
                                        vec_mergel(mule_b_v, mulo_b_v));
    vec_s16_t add_i0_b_0v = vec_adds(i00_v, mul_b_induc0_v);
    vec_s16_t add_i0_b_8v = vec_adds(b8_v, add_i0_b_0v);

    int y;
    for( y = 0; y < 16; y++ )
    {
        vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
        vec_s16_t shift_8_v = vec_sra(add_i0_b_8v, val5_v);
        vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_8_v);
        vec_st( com_sat_v, 0, &src[0]);
        src += FDEC_STRIDE;
        i00 += c;
        add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
        add_i0_b_8v = vec_adds(add_i0_b_8v, c_v);
    }
}
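A scalar sketch of the plane prediction the vector code implements: each pixel is clip((a + b*(x-7) + c*(y-7) + 16) >> 5), evaluated incrementally per row. clip_uint8() is a hypothetical clamp-to-[0,255] helper, and the sketch assumes the same a/b/c as computed above:

static void predict_16x16_p_ref(uint8_t *src, int a, int b, int c)
{
    int i00 = a - 7*b - 7*c + 16;
    for (int y = 0; y < 16; y++) {
        int pix = i00;
        for (int x = 0; x < 16; x++) {
            src[x] = clip_uint8(pix >> 5);  /* hypothetical clamp to [0,255] */
            pix += b;
        }
        src += FDEC_STRIDE;
        i00 += c;
    }
}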
static void vector_fmul_add_altivec(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int len)
{
    int i;
    vector float d, s0, s1, s2, t0, t1, edges;
    vector unsigned char align = vec_lvsr(0, dst),
                         mask  = vec_lvsl(0, dst);

    for (i = 0; i < len - 3; i += 4) {
        t0 = vec_ld(0, dst + i);
        t1 = vec_ld(15, dst + i);
        s0 = vec_ld(0, src0 + i);
        s1 = vec_ld(0, src1 + i);
        s2 = vec_ld(0, src2 + i);
        edges = vec_perm(t1, t0, mask);
        d = vec_madd(s0, s1, s2);
        t1 = vec_perm(d, edges, align);
        t0 = vec_perm(edges, d, align);
        vec_st(t1, 15, dst + i);
        vec_st(t0, 0, dst + i);
    }
}
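The edge-merge sequence above is the matching misaligned-store idiom: read the two aligned blocks the destination straddles, rotate the new data into place with vec_lvsr, and write both blocks back with the boundary bytes preserved. Isolated as a sketch (helper name is illustrative):

static inline void store16_unaligned(vector float d, float *dst)
{
    vector unsigned char align = vec_lvsr(0, dst);
    vector unsigned char mask  = vec_lvsl(0, dst);
    vector float t0 = vec_ld( 0, dst);
    vector float t1 = vec_ld(15, dst);
    vector float edges = vec_perm(t1, t0, mask);  /* bytes that must survive */
    vec_st(vec_perm(d, edges, align), 15, dst);
    vec_st(vec_perm(edges, d, align),  0, dst);
}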
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;

    if (((long)dst) & 15) //FIXME
        for (i = 0; i < len - 7; i += 8) {
            d0 = vec_ld(0, dst+i);
            d  = float_to_int16_one_altivec(src+i);
            d1 = vec_ld(15, dst+i);
            d1 = vec_perm(d1, d0, vec_lvsl(0, dst+i));
            align = vec_lvsr(0, dst+i);
            d0 = vec_perm(d1, d, align);
            d1 = vec_perm(d, d1, align);
            vec_st(d0,  0, dst+i);
            vec_st(d1, 15, dst+i);
        }
    else
        for (i = 0; i < len - 7; i += 8) {
            d = float_to_int16_one_altivec(src+i);
            vec_st(d, 0, dst+i);
        }
}
static void vector_fmul_window_altivec(float *dst, const float *src0,
                                       const float *src1, const float *win,
                                       float add_bias, int len)
{
    union {
        vector float v;
        float s[4];
    } vadd;
    vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i, j;

    dst  += len;
    win  += len;
    src0 += len;

    vadd.s[0] = add_bias;
    vadd_bias = vec_splat(vadd.v, 0);
    zero = (vector float)vec_splat_u32(0);

    for (i = -len*4, j = len*4 - 16; i < 0; i += 16, j -= 16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, vadd_bias);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, vadd_bias);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}
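A scalar restatement of the overlap-add windowing performed by the vector loop, consistent with its arithmetic (dst[i] = s0*wj - s1*wi and dst[j] = s0*wi + s1*wj, plus the bias); the helper is illustrative:

static void vector_fmul_window_ref(float *dst, const float *src0,
                                   const float *src1, const float *win,
                                   float add_bias, int len)
{
    dst  += len;
    win  += len;
    src0 += len;
    for (int i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i], s1 = src1[j];
        float wi = win[i],  wj = win[j];
        dst[i] = s0 * wj - s1 * wi + add_bias;
        dst[j] = s0 * wi + s1 * wj + add_bias;
    }
}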
float expf(float x) {
#else
float vexpf(float x) {
#endif
    vector float vexpa, va; //, vx, vn, va, vb,
                            //  v0, vlog2, vln2;
    register float exp, a;  //, b, b2, b3, b4; //, b6, b8, b10, R0, R;
    float __attribute__((aligned(16))) xa[4];

    xa[0] = x;

    /*
    // set up a few constants
    vlog2 = vec_ld(0, &C_EXPF[0]);
    v0    = (vector float) vec_splat_u32(0);
    vln2  = vec_splat(vlog2, 1);
    vlog2 = vec_splat(vlog2, 0);

    // Load x into a vector float
    vx = vec_ld(0, xa);
    vx = vec_splat(vx, 0);

    // Split x = n*log2e + b
    vn = vec_madd(vx, vlog2e, v0);
    vn = vec_floor(vn);
    */

    xa[0] = truncf(x * M_LOG2E);
    va = vec_ld(0, xa);
    vexpa = vec_expte(va);

    a = xa[0] * M_LN2;

    vec_st(vexpa, 0, xa);

    /*
    b   = x - a;
    b2  = b*b;
    b3  = b2*b;
    b4  = b2*b2;
    b6  = b4*b2;
    b8  = b6*b2;
    b10 = b8*b2;
    R0 = 0.1666666666666666019037   * b2
       - 0.00277777777770155933842  * b4
       + 6.61375632143793436117e-05 * b6
       - 1.65339022054652515390e-06 * b8
       + 4.13813679705723846039e-08 * b10;
    R = b - R0;
    //exp = 1.0 + 2.0*b/(2.0 - R);
    exp = (1680.0 + 840*b + 180*b2 + 20*b3 + b4)
        / (1680  - 840*b + 180*b2 - 20*b3 + b4);
    */

    exp = xa[0];

    return exp;
}
void ff_vp3_idct_altivec(DCTELEM block[64])
{
    IDCT_START

    IDCT_1D(NOP, NOP)
    TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
    IDCT_1D(ADD8, SHIFT4)

    vec_st(b0, 0x00, block);
    vec_st(b1, 0x10, block);
    vec_st(b2, 0x20, block);
    vec_st(b3, 0x30, block);
    vec_st(b4, 0x40, block);
    vec_st(b5, 0x50, block);
    vec_st(b6, 0x60, block);
    vec_st(b7, 0x70, block);
}
/* *************************************************************************
 * NAME: printVecFloats
 *
 * *************************************************************************
 */
void printVecFloats(char *label, vector float outFloats, int arraySize)
{
  /* vec_st always writes a full 16 bytes, so the scratch buffer must hold
     at least four floats regardless of arraySize (callers should pass
     arraySize <= 4). */
  float printfloat[4] __attribute__ ((aligned (16)));
  int i;

  vec_st(outFloats, 0, printfloat);

  fprintf(stderr, "%s --> float:{", label);
  for (i = 0; i < arraySize; i++)
  {
    fprintf(stderr, "%f ", printfloat[i]);
  }
  fprintf(stderr, "}\n\n");
} /* printVecFloats */
/* *************************************************************************
 * NAME: printVecShorts
 *
 * *************************************************************************
 */
void printVecShorts(char *label, vector signed short outShorts, int arraySize)
{
  /* As above: vec_st writes 16 bytes, so the buffer must hold at least
     eight shorts (callers should pass arraySize <= 8). */
  signed short printshort[8] __attribute__ ((aligned (16)));
  int i;

  vec_st(outShorts, 0, printshort);

  fprintf(stderr, "%s --> short:{", label);
  for (i = 0; i < arraySize; i++)
  {
    fprintf(stderr, "%d ", printshort[i]);
  }
  fprintf(stderr, "}\n\n");
} /* printVecShorts */
void CDSPToolsOSX::Add32(tfloat32* pDest, const tfloat32* pSrc)
{
#ifdef _Mac_PowerPC
	vector float vDest = vec_ldl(0, pDest);
	vector float vSrc  = vec_ldl(0, pSrc);
	vDest = vec_add(vDest, vSrc);
	vec_st(vDest, 0, pDest);

	vDest = vec_ldl(4 * 4, pDest);
	vSrc  = vec_ldl(4 * 4, pSrc);
	vDest = vec_add(vDest, vSrc);
	vec_st(vDest, 4 * 4, pDest);

	vDest = vec_ldl(8 * 4, pDest);
	vSrc  = vec_ldl(8 * 4, pSrc);
	vDest = vec_add(vDest, vSrc);
	vec_st(vDest, 8 * 4, pDest);

	vDest = vec_ldl(12 * 4, pDest);
	vSrc  = vec_ldl(12 * 4, pSrc);
	vDest = vec_add(vDest, vSrc);
	vec_st(vDest, 12 * 4, pDest);

	vDest = vec_ldl(16 * 4, pDest);
	vSrc  = vec_ldl(16 * 4, pSrc);
	vDest = vec_add(vDest, vSrc);
	vec_st(vDest, 16 * 4, pDest);

	vDest = vec_ldl(20 * 4, pDest);
	vSrc  = vec_ldl(20 * 4, pSrc);
	vDest = vec_add(vDest, vSrc);
	vec_st(vDest, 20 * 4, pDest);

	vDest = vec_ldl(24 * 4, pDest);
	vSrc  = vec_ldl(24 * 4, pSrc);
	vDest = vec_add(vDest, vSrc);
	vec_st(vDest, 24 * 4, pDest);

	vDest = vec_ldl(28 * 4, pDest);
	vSrc  = vec_ldl(28 * 4, pSrc);
	vDest = vec_add(vDest, vSrc);
	vec_st(vDest, 28 * 4, pDest);
#else // _Mac_PowerPC
	CDSPTools::Add32(pDest, pSrc);
#endif // _Mac_PowerPC
}
static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w)
{
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed). */
    for (i = 0; i + 15 < w; i += 16) {
        vdst = vec_ld(i, (unsigned char *) dst);
        vsrc = vec_ld(i, (unsigned char *) src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char *) dst);
    }

    /* If w is not a multiple of 16. */
    for (; i < w; i++)
        dst[i] = src[i];
}
static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
{
    LOAD_H_SUBPEL_FILTER(mx-1);
    vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 a, b, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;

    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer  = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
    vec_u16 c7  = vec_splat_u16(7);

    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    permh0 = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8 = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0 = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8 = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);

    while (h --> 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        }
        src += src_stride;
        dst += dst_stride;
    }
}
void x264_sub8x8_dct8_altivec( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
{
    vec_u16_t onev = vec_splat_u16(1);
    vec_u16_t twov = vec_add( onev, onev );

    PREP_DIFF_8BYTEALIGNED;

    vec_s16_t dct0v, dct1v, dct2v, dct3v,
              dct4v, dct5v, dct6v, dct7v;

    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );

    DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v );

    vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
              dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v;

    VEC_TRANSPOSE_8( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v,
                     dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
                     dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );

    DCT8_1D_ALTIVEC( dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
                     dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );

    vec_st( dct_tr0v,   0, dct );
    vec_st( dct_tr1v,  16, dct );
    vec_st( dct_tr2v,  32, dct );
    vec_st( dct_tr3v,  48, dct );
    vec_st( dct_tr4v,  64, dct );
    vec_st( dct_tr5v,  80, dct );
    vec_st( dct_tr6v,  96, dct );
    vec_st( dct_tr7v, 112, dct );
}
void b()
{
  z = vec_add (x, y);

  /* Make sure the predicates accept correct argument types.  */
  int1 = vec_all_in (f, g);
  int1 = vec_all_ge (f, g);
  int1 = vec_all_eq (c, d);
  int1 = vec_all_ne (s, t);
  int1 = vec_any_eq (i, j);
  int1 = vec_any_ge (f, g);
  int1 = vec_all_ngt (f, g);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);

  vec_mtvscr (i);
  vec_dssall ();
  s = (vector signed short) vec_mfvscr ();
  vec_dss (3);

  vec_dst (pi, int1 + int2, 3);
  vec_dstst (pi, int1 + int2, 3);
  vec_dststt (pi, int1 + int2, 3);
  vec_dstt (pi, int1 + int2, 3);

  uc = (vector unsigned char) vec_lvsl (int1 + 69, (signed int *) pi);
  uc = (vector unsigned char) vec_lvsr (int1 + 69, (signed int *) pi);

  c = vec_lde (int1, (signed char *) pi);
  s = vec_lde (int1, (signed short *) pi);
  i = vec_lde (int1, (signed int *) pi);
  i = vec_ldl (int1, pi);
  i = vec_ld (int1, pi);

  vec_st (i, int2, pi);
  vec_ste (c, int2, (signed char *) pi);
  vec_ste (s, int2, (signed short *) pi);
  vec_ste (i, int2, (signed int *) pi);
  vec_stl (i, int2, pi);
}
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        blockv   = vec_ld( 0, block);
        pixelsv  = vec_perm(pixelsv1, pixelsv2, perm);
        blockv   = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
}
void foo( float scalar)
{
    unsigned long width;
    unsigned long x;
    vector float vColor;
    vector unsigned int selectMask;

    /* Splat the (possibly unaligned) scalar into the vector: the two
       loads bracket the float and vec_lvsl shifts it into lane 0. */
    vColor = vec_perm( vec_ld( 0, &scalar), vec_ld( 3, &scalar),
                       vec_lvsl( 0, &scalar) );

    float *destRow;
    vector float store, load0;

    for( ; x < width; x++)
    {
        load0 = vec_sel( vColor, load0, selectMask );
        vec_st( store, 0, destRow );
        store = load0;
    }
}