static void FUNC(ff_hevc_idct_4x4, BIT_DEPTH)(int16_t *coeffs, int col_limit) { const int shift = 7; const int shift2 = 20 - BIT_DEPTH; vec_s16 src_01, src_23; vec_s32 res[4]; vec_s16 res_packed[2]; src_01 = vec_ld(0, coeffs); src_23 = vec_ld(16, coeffs); transform4x4(src_01, src_23, res, shift, coeffs); scale(res, res_packed, shift); /* transpose */ src_01 = vec_perm(res_packed[0], res_packed[1], mask[0]); src_23 = vec_perm(res_packed[0], res_packed[1], mask[1]); transform4x4(src_01, src_23, res, shift2, coeffs); scale(res, res_packed, shift2); /* transpose */ src_01 = vec_perm(res_packed[0], res_packed[1], mask[0]); src_23 = vec_perm(res_packed[0], res_packed[1], mask[1]); vec_st(src_01, 0, coeffs); vec_st(src_23, 16, coeffs); }
/* next one assumes that ((line_size % 8) == 0) */ static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) { register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; int i; for (i = 0; i < h; i++) { /* block is 8 bytes-aligned, so we're either in the left block (16 bytes-aligned) or in the right block (not) */ int rightside = ((unsigned long)block & 0x0000000F); blockv = vec_ld(0, block); pixelsv1 = vec_ld( 0, pixels); pixelsv2 = vec_ld(16, pixels); pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); if (rightside) { pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); } else { pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); } blockv = vec_avg(blockv, pixelsv); vec_st(blockv, 0, block); pixels += line_size; block += line_size; } }
static inline void do_recursion(w128_t *r, w128_t *a, w128_t * b, w128_t * lung) { const vector unsigned char sl1 = ALTI_SL1; const vector unsigned char sl1_perm = ALTI_SL1_PERM; const vector unsigned int sl1_msk = ALTI_SL1_MSK; const vector unsigned char sr1 = ALTI_SR; const vector unsigned char sr1_perm = ALTI_SR_PERM; const vector unsigned int sr1_msk = ALTI_SR_MSK; const vector unsigned char perm = ALTI_PERM; const vector unsigned int msk1 = ALTI_MSK; vector unsigned int z = a->s; vector unsigned int w = lung->s; vector unsigned int x = vec_perm(w, (vector unsigned int)perm, perm); vector unsigned int y = vec_perm(z, (vector unsigned int)sl1_perm, sl1_perm); y = vec_sll(y, sl1); y = vec_and(y, sl1_msk); w = vec_xor(x, b->s); w = vec_xor(w, y); x = vec_perm(w, (vector unsigned int)sr1_perm, sr1_perm); x = vec_srl(x, sr1); x = vec_and(x, sr1_msk); y = vec_and(w, msk1); z = vec_xor(z, y); r->s = vec_xor(z, x); lung->s = w; }
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul) { LOAD_ZERO; vec_s16 *pv1 = (vec_s16*)v1; vec_s16 *pv2 = (vec_s16*)v2; vec_s16 *pv3 = (vec_s16*)v3; register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul}; register vec_s16 t0, t1, i0, i1; register vec_s16 i2 = pv2[0], i3 = pv3[0]; register vec_s32 res = zero_s32v; register vec_u8 align = vec_lvsl(0, v2); int32_t ires; order >>= 4; do { t0 = vec_perm(i2, pv2[1], align); i2 = pv2[2]; t1 = vec_perm(pv2[1], i2, align); i0 = pv1[0]; i1 = pv1[1]; res = vec_msum(t0, i0, res); res = vec_msum(t1, i1, res); t0 = vec_perm(i3, pv3[1], align); i3 = pv3[2]; t1 = vec_perm(pv3[1], i3, align); pv1[0] = vec_mladd(t0, muls, i0); pv1[1] = vec_mladd(t1, muls, i1); pv1 += 2; pv2 += 2; pv3 += 2; } while(--order); res = vec_splat(vec_sums(res, zero_s32v), 3); vec_ste(res, 0, &ires); return ires; }
/* Store a vector to an unaligned location in memory */ static inline void StoreUnaligned (vector unsigned char v, const guchar *where) { if ((unsigned long)where & 0x0f) { /* Load the surrounding area */ vector unsigned char low = vec_ld(0, where); vector unsigned char high = vec_ld(16, where); /* Prepare the constants that we need */ vector unsigned char permuteVector = vec_lvsr(0, where); vector signed char oxFF = vec_splat_s8(-1); vector signed char ox00 = vec_splat_s8(0); /* Make a mask for which parts of the vectors to swap out */ vector unsigned char mask = (vector unsigned char)vec_perm(ox00, oxFF, permuteVector); v = vec_perm(v, v, permuteVector); /* Insert our data into the low and high vectors */ low = vec_sel(low, v, mask); high = vec_sel(v, high, mask); /* Store the two aligned result vectors */ vec_st(low, 0, CONST_BUFFER(where)); vec_st(high, 16, CONST_BUFFER(where)); } else { /* prevent overflow */ vec_st(v, 0, CONST_BUFFER(where)); } }
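/* Hedged companion sketch (not GIMP's actual LoadUnaligned): the classic AltiVec idiom for reading 16 bytes from a possibly unaligned address, shown to mirror the StoreUnaligned helper above. The helper name is made up for illustration; it assumes <altivec.h> and a big-endian target where the vec_lvsl permute map applies. */
static inline vector unsigned char load_unaligned_sketch(const unsigned char *where)
{
  vector unsigned char low  = vec_ld( 0, where);   /* aligned quadword covering the first byte */
  vector unsigned char high = vec_ld(15, where);   /* aligned quadword covering the last byte  */
  vector unsigned char perm = vec_lvsl(0, where);  /* left-shift permute map for this address  */
  return vec_perm(low, high, perm);                /* splice out the 16 wanted bytes           */
}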
static void float_to_int16_altivec(int16_t *dst, const float *src, long len) { int i; vector signed short d0, d1, d; vector unsigned char align; if(((long)dst) & 15) /* FIXME */ for(i = 0; i < len - 7; i += 8) { d0 = vec_ld(0, dst + i); d = float_to_int16_one_altivec(src + i); d1 = vec_ld(15, dst + i); d1 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); align = vec_lvsr(0, dst + i); d0 = vec_perm(d1, d, align); d1 = vec_perm(d, d1, align); vec_st(d0, 0, dst + i); vec_st(d1, 15, dst + i); } else for(i = 0; i < len - 7; i += 8) { d = float_to_int16_one_altivec(src + i); vec_st(d, 0, dst + i); } }
void float_to_int16_altivec(int16_t *dst, const float *src, int len) { int i; vector float s0, s1; vector signed int t0, t1; vector signed short d0, d1, d; vector unsigned char align; if(((long)dst)&15) /* FIXME */ for(i=0; i<len-7; i+=8) { s0 = vec_ld(0, src+i); s1 = vec_ld(16, src+i); t0 = vec_cts(s0, 0); d0 = vec_ld(0, dst+i); t1 = vec_cts(s1, 0); d1 = vec_ld(15, dst+i); d = vec_packs(t0,t1); d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); align = vec_lvsr(0, dst+i); d0 = vec_perm(d1, d, align); d1 = vec_perm(d, d1, align); vec_st(d0, 0, dst+i); vec_st(d1,15, dst+i); } else for(i=0; i<len-7; i+=8) { s0 = vec_ld(0, src+i); s1 = vec_ld(16, src+i); t0 = vec_cts(s0, 0); t1 = vec_cts(s1, 0); d = vec_packs(t0,t1); vec_st(d, 0, dst+i); } }
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len) { vector float zero, t0, t1, s0, s1, wi, wj; const vector unsigned char reverse = vcprm(3,2,1,0); int i,j; dst += len; win += len; src0+= len; zero = (vector float)vec_splat_u32(0); for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) { s0 = vec_ld(i, src0); s1 = vec_ld(j, src1); wi = vec_ld(i, win); wj = vec_ld(j, win); s1 = vec_perm(s1, s1, reverse); wj = vec_perm(wj, wj, reverse); t0 = vec_madd(s0, wj, zero); t0 = vec_nmsub(s1, wi, t0); t1 = vec_madd(s0, wi, zero); t1 = vec_madd(s1, wj, t1); t1 = vec_perm(t1, t1, reverse); vec_st(t0, i, dst); vec_st(t1, j, dst); } }
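/* Plain-C reference for what vector_fmul_window_altivec computes, written out per sample as an aid to reading the vector code; a sketch, not part of the original source. dst[i] gets src0[i]*win[j] - src1[j]*win[i], and dst[j] the mirrored sum, with j = -i - 1. */
static void vector_fmul_window_scalar(float *dst, const float *src0, const float *src1, const float *win, int len)
{
    int i, j;
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0 * wj - s1 * wi;   /* matches t0 = madd(s0,wj) then nmsub(s1,wi) */
        dst[j] = s0 * wi + s1 * wj;   /* matches t1, stored reversed on the j side  */
    }
}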
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix3v, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; uint8_t *pix3 = pix2 + line_size; s = 0; sad = (vector unsigned int)vec_splat_u32(0); /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one iteration becomes pix2 in the next iteration. We can use this fact to avoid a potentially expensive unaligned read, each time around the loop. Read unaligned pixels into our vectors. The vectors are as follows: pix2v: pix2[0]-pix2[15] Split the pixel vectors into shorts */ tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix3v: pix3[0]-pix3[15] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix3[0]; pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix3v); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2v = pix3v; pix3 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
void gimp_composite_multiply_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d,alpha_a,alpha_b,alpha; vector unsigned short al,ah; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); al=vec_mule(a,b); al=vec_add(al,ox0080); ah=vec_mulo(a,b); ah=vec_add(ah,ox0080); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } /* process last pixels */ length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); al=vec_mule(a,b); al=vec_add(al,ox0080); ah=vec_mulo(a,b); ah=vec_add(ah,ox0080); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnalignedLess(d, D, length); }
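/* Hedged scalar sketch of the per-channel arithmetic in the multiply composite above: a*b/255 with rounding, computed as t = a*b + 128; (t + (t >> 8)) >> 8. The vector code does the same steps with vec_mule/vec_mulo, vec_add and vec_sr, then keeps the high byte of each 16-bit product via vec_perm. The helper name is illustrative only. */
static inline unsigned char mul8_div255_sketch(unsigned char a, unsigned char b)
{
  unsigned int t = (unsigned int)a * b + 128;   /* product plus rounding bias */
  return (unsigned char)((t + (t >> 8)) >> 8);  /* rounded a*b/255            */
}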
SIMD_INLINE void DeinterleavedUv(const Loader<align> & uv, Storer<align> & u, Storer<align> & v) { v128_u8 _uv0 = Load<align, first>(uv); v128_u8 _uv1 = Load<align, false>(uv); Store<align, first>(u, vec_perm(_uv0, _uv1, K8_PERM_U)); Store<align, first>(v, vec_perm(_uv0, _uv1, K8_PERM_V)); }
SIMD_INLINE void InterleavedUv(const Loader<align> & u, const Loader<align> & v, Storer<align> & uv) { v128_u8 _u = Load<align, first>(u); v128_u8 _v = Load<align, first>(v); Store<align, first>(uv, vec_perm(_u, _v, K8_PERM_UV0)); Store<align, false>(uv, vec_perm(_u, _v, K8_PERM_UV1)); }
void abcd2cbad_double( ILdouble *tdata, ILuint length, ILdouble *tnewdata ) { register ILubyte *data = (ILubyte*)tdata; register ILubyte *newdata = (ILubyte*)tnewdata; const vector unsigned char p = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F); register vector unsigned char d0,d1,d2,d3,t0,t1,t2,t3; length = eround16(length); if( length >= 4 ) { length -= 4; d3 = vec_ld(48,data); d2 = vec_ld(32,data); d1 = vec_ld(16,data); d0 = vec_ld(0,data); while( length >= 4 ) { t0 = vec_perm(d0,d1,p); t1 = vec_perm(d1,d0,p); t2 = vec_perm(d2,d3,p); t3 = vec_perm(d3,d2,p); vec_st(t0,0,newdata); vec_st(t1,16,newdata); vec_st(t2,32,newdata); vec_st(t3,48,newdata); length -= 4; data += 16*4; newdata += 16*4; d3 = vec_ld(48,data); d2 = vec_ld(32,data); d1 = vec_ld(16,data); d0 = vec_ld(0,data); } t0 = vec_perm(d0,d1,p); t1 = vec_perm(d1,d0,p); t2 = vec_perm(d2,d3,p); t3 = vec_perm(d3,d2,p); vec_st(t0,0,newdata); vec_st(t1,16,newdata); vec_st(t2,32,newdata); vec_st(t3,48,newdata); } if( length == 2 ) { d0 = vec_ld(0,data); d1 = vec_ld(16,data); t0 = vec_perm(d0,d1,p); t1 = vec_perm(d1,d0,p); vec_st(t0,0,newdata); vec_st(t1,16,newdata); } }
static void float_to_int16_interleave_altivec(int16_t *dst, const float **src, long len, int channels) { int i; vector signed short d0, d1, d2, c0, c1, t0, t1; vector unsigned char align; if(channels == 1) float_to_int16_altivec(dst, src[0], len); else if (channels == 2) { if(((long)dst) & 15) for(i = 0; i < len - 7; i += 8) { d0 = vec_ld(0, dst + i); t0 = float_to_int16_one_altivec(src[0] + i); d1 = vec_ld(31, dst + i); t1 = float_to_int16_one_altivec(src[1] + i); c0 = vec_mergeh(t0, t1); c1 = vec_mergel(t0, t1); d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); align = vec_lvsr(0, dst + i); d0 = vec_perm(d2, c0, align); d1 = vec_perm(c0, c1, align); vec_st(d0, 0, dst + i); d0 = vec_perm(c1, d2, align); vec_st(d1, 15, dst + i); vec_st(d0, 31, dst + i); dst += 8; } else for(i = 0; i < len - 7; i += 8) { t0 = float_to_int16_one_altivec(src[0] + i); t1 = float_to_int16_one_altivec(src[1] + i); d0 = vec_mergeh(t0, t1); d1 = vec_mergel(t0, t1); vec_st(d0, 0, dst + i); vec_st(d1, 16, dst + i); dst += 8; } } else { DECLARE_ALIGNED(16, int16_t, tmp)[len]; int c, j; for (c = 0; c < channels; c++) { float_to_int16_altivec(tmp, src[c], len); for (i = 0, j = c; i < len; i++, j += channels) { dst[j] = tmp[i]; } } } }
void jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, JDIMENSION width_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data) { int outrow, outcol; JDIMENSION output_cols = width_blocks * DCTSIZE; JSAMPROW inptr, outptr; __vector unsigned char this0, next0, out; __vector unsigned short this0e, this0o, next0e, next0o, outl, outh; /* Constants */ __vector unsigned short pw_bias = { __4X2(0, 1) }, pw_one = { __8X(1) }; __vector unsigned char even_odd_index = {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15}, pb_zero = { __16X(0) }; expand_right_edge(input_data, max_v_samp_factor, image_width, output_cols * 2); for (outrow = 0; outrow < v_samp_factor; outrow++) { outptr = output_data[outrow]; inptr = input_data[outrow]; for (outcol = output_cols; outcol > 0; outcol -= 16, inptr += 32, outptr += 16) { this0 = vec_ld(0, inptr); this0 = vec_perm(this0, this0, even_odd_index); this0e = (__vector unsigned short)VEC_UNPACKHU(this0); this0o = (__vector unsigned short)VEC_UNPACKLU(this0); outl = vec_add(this0e, this0o); outl = vec_add(outl, pw_bias); outl = vec_sr(outl, pw_one); if (outcol > 8) { next0 = vec_ld(16, inptr); next0 = vec_perm(next0, next0, even_odd_index); next0e = (__vector unsigned short)VEC_UNPACKHU(next0); next0o = (__vector unsigned short)VEC_UNPACKLU(next0); outh = vec_add(next0e, next0o); outh = vec_add(outh, pw_bias); outh = vec_sr(outh, pw_one); } else outh = vec_splat_u16(0); out = vec_pack(outl, outh); vec_st(out, 0, outptr); } } }
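/* Hedged scalar reference for jsimd_h2v1_downsample_altivec above: each output sample is the average of one horizontal pixel pair with the alternating 0,1 bias carried by pw_bias, i.e. out[i] = (in[2i] + in[2i+1] + (i & 1)) >> 1. Sketch only; the real routine also pads the right edge and walks whole rows. */
static void h2v1_downsample_pair_scalar(const unsigned char *in, unsigned char *out, int output_cols)
{
  int i;
  for (i = 0; i < output_cols; i++)
    out[i] = (unsigned char)((in[2 * i] + in[2 * i + 1] + (i & 1)) >> 1);
}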
void v_store_interleave_f32(float *ptr, vector float a, vector float b, vector float c) { vector float hbc = vec_mergeh(b, c); static const vector unsigned char ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7}; vec_xst(vec_perm(a, hbc, ahbc), 0, ptr); vector float lab = vec_mergel(a, b); vec_xst(vec_sld(lab, hbc, 8), 16, ptr); static const vector unsigned char clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15}; vec_xst(vec_perm(c, lab, clab), 32, ptr); }
/* Store a float vector to a potentially unaligned address */ void store_unaligned(float *target, vector float src) { vector float msq, lsq, edges; vector unsigned char edgeAlign, align; msq = vec_ld(0, target); /* most significant quadword */ lsq = vec_ld(15, target); /* least significant quadword */ edgeAlign = vec_lvsl(0, target); /* permute map to extract edges */ edges = vec_perm(lsq, msq, edgeAlign); /* extract the edges */ align = vec_lvsr(0, target); /* permute map to misalign data */ msq = vec_perm(edges, src, align); /* misalign the data (msq) */ lsq = vec_perm(src, edges, align); /* misalign the data (lsq) */ vec_st(lsq, 15, target); /* Store the lsq part first */ vec_st(msq, 0, target); /* Store the msq part */ }
SIMD_INLINE void Bgr48pToBgra32(const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, const v128_u8 & alpha, Storer<align> & bgra) { const v128_u8 _blue = Load<align>(blue + offset); const v128_u8 _green = Load<align>(green + offset); const v128_u8 _red = Load<align>(red + offset); v128_u16 bg = (v128_u16)vec_perm(_blue, _green, K8_PERM_48); v128_u16 ra = (v128_u16)vec_perm(_red, alpha, K8_PERM_48); Store<align, first>(bgra, (v128_u8)UnpackLoU16(ra, bg)); Store<align, false>(bgra, (v128_u8)UnpackHiU16(ra, bg)); }
static void unaligned_store(vector signed short value, int off, int16_t *dst) { register vector unsigned char align = vec_lvsr(0, dst), mask = vec_lvsl(0, dst); register vector signed short t0,t1, edges; t0 = vec_ld(0+off, dst); t1 = vec_ld(15+off, dst); edges = vec_perm(t1 ,t0, mask); t1 = vec_perm(value, edges, align); t0 = vec_perm(edges, value, align); vec_st(t1, 15+off, dst); vec_st(t0, 0+off, dst); }
void v_load_deinterleave_f32(float *ptr, vector float* a, vector float* b, vector float* c) { vector float v1 = vec_xl( 0, ptr); vector float v2 = vec_xl(16, ptr); vector float v3 = vec_xl(32, ptr); static const vector unsigned char flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31}; *a = vec_perm(v1, vec_sld(v3, v2, 8), flp); static const vector unsigned char flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19}; *b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); *c = vec_perm(vec_sld(v2, v1, 8), v3, flp); }
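/* Hedged usage sketch for the two helpers above: interleave three float lanes into a 12-float buffer and read them back. Assumes a little-endian VSX target (vec_xl/vec_xst) and that v_store_interleave_f32 and v_load_deinterleave_f32 are in scope; main() and the values are illustrative only. */
#include <altivec.h>
#include <stdio.h>
int main(void)
{
    vector float a = { 0.f,  1.f,  2.f,  3.f};
    vector float b = {10.f, 11.f, 12.f, 13.f};
    vector float c = {20.f, 21.f, 22.f, 23.f};
    float buf[12] __attribute__((aligned(16)));

    v_store_interleave_f32(buf, a, b, c);      /* buf = a0 b0 c0 a1 b1 c1 ... */
    v_load_deinterleave_f32(buf, &a, &b, &c);  /* recover the three planes    */

    printf("%.0f %.0f %.0f\n", buf[0], buf[1], buf[2]);  /* prints: 0 10 20   */
    return 0;
}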
/** * Sum of Squared Errors for a 8x8 block. * AltiVec-enhanced. * It's the pix_abs8x8_altivec code above w/ squaring added. */ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sum; vector signed int sumsqr; sum = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Since we want to use unsigned chars, we can take advantage of the fact that abs(a-b)^2 = (a-b)^2. */ /* Calculate abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Square the values and add them to our sum */ sum = vec_msum(t5, t5, sum); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); sumsqr = vec_splat(sumsqr, 3); vec_ste(sumsqr, 0, &s); return s; }
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size) { vec_s16 dc16; vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner; vec_s32 v_dc32; LOAD_ZERO; DECLARE_ALIGNED(16, int, dc); int i; dc = (block[0] + 32) >> 6; block[0] = 0; v_dc32 = vec_lde(0, &dc); dc16 = VEC_SPLAT16((vec_s16)v_dc32, 1); if (size == 4) dc16 = VEC_SLD16(dc16, zero_s16v, 8); dcplus = vec_packsu(dc16, zero_s16v); dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v); aligner = vec_lvsr(0, dst); #if !HAVE_BIGENDIAN aligner = vec_perm(aligner, zero_u8v, vcswapc()); #endif dcplus = vec_perm(dcplus, dcplus, aligner); dcminus = vec_perm(dcminus, dcminus, aligner); for (i = 0; i < size; i += 4) { v0 = vec_ld(0, dst+0*stride); v1 = vec_ld(0, dst+1*stride); v2 = vec_ld(0, dst+2*stride); v3 = vec_ld(0, dst+3*stride); v0 = vec_adds(v0, dcplus); v1 = vec_adds(v1, dcplus); v2 = vec_adds(v2, dcplus); v3 = vec_adds(v3, dcplus); v0 = vec_subs(v0, dcminus); v1 = vec_subs(v1, dcminus); v2 = vec_subs(v2, dcminus); v3 = vec_subs(v3, dcminus); vec_st(v0, 0, dst+0*stride); vec_st(v1, 0, dst+1*stride); vec_st(v2, 0, dst+2*stride); vec_st(v3, 0, dst+3*stride); dst += 4*stride; } }
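/* Hedged scalar sketch of the dcplus/dcminus trick used above: a signed DC is applied to unsigned 8-bit pixels using only unsigned saturating ops, by splitting it into a non-negative "plus" part (vec_packsu of dc16) and a "minus" part (vec_packsu of -dc16). The helper name is illustrative only. */
static inline unsigned char add_dc_saturating_sketch(unsigned char pix, int dc)
{
    int plus  = dc > 0 ?  dc : 0;                 /* dcplus:  packsu clamps negatives to 0 */
    int minus = dc < 0 ? -dc : 0;                 /* dcminus: packsu of the negated DC     */
    int t = pix + (plus > 255 ? 255 : plus);      /* vec_adds: unsigned saturating add     */
    if (t > 255) t = 255;
    t -= (minus > 255 ? 255 : minus);             /* vec_subs: unsigned saturating sub     */
    return (unsigned char)(t < 0 ? 0 : t);
}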
static inline void abcd2cbad_internal( register const vector unsigned char p, unsigned char *data, unsigned int length, unsigned char *newdata ) { register vector unsigned char d0,d1,d2,z; z = vec_splat_u8(0); length = eround16(length); if( length >= 3 ) { length -= 3; d2 = vec_ld(32,data); d1 = vec_ld(16,data); d0 = vec_ld(0,data); while( length >= 3 ) { d0 = vec_perm(d0,z,p); d1 = vec_perm(d1,z,p); d2 = vec_perm(d2,z,p); vec_st(d0,0,newdata); vec_st(d1,16,newdata); vec_st(d2,32,newdata); length -= 3; data += 16*3; newdata += 16*3; d2 = vec_ld(32,data); d1 = vec_ld(16,data); d0 = vec_ld(0,data); } d0 = vec_perm(d0,z,p); d1 = vec_perm(d1,z,p); d2 = vec_perm(d2,z,p); vec_st(d0,0,newdata); vec_st(d1,16,newdata); vec_st(d2,32,newdata); } if( length == 2 ) { d0 = vec_ld(0,data); d1 = vec_ld(16,data); d0 = vec_perm(d0,z,p); d1 = vec_perm(d1,z,p); vec_st(d0,0,newdata); vec_st(d1,16,newdata); } else if( length == 1 ) { d0 = vec_ld(0,data); d0 = vec_perm(d0,z,p); vec_st(d0,0,newdata); } }
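/* Hedged sketch of the kind of permute constant abcd2cbad_internal expects for 8-bit channels: within every 4-byte pixel, bytes 0 and 2 (A and C) swap while B and D stay put. The constant name and brace initializer are illustrative; the DevIL-style code above uses the older parenthesised vector literals. */
static const vector unsigned char abcd2cbad_perm_u8 =
    {  2,  1,  0,  3,   6,  5,  4,  7,
      10,  9,  8, 11,  14, 13, 12, 15 };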
SIMD_INLINE void BgrToGray(const Loader<align> & bgr, Storer<align> & gray) { v128_u8 _bgr[3]; _bgr[0] = Load<align, first>(bgr); _bgr[1] = Load<align, false>(bgr); _bgr[2] = Load<align, false>(bgr); const v128_u16 lo = vec_packsu( BgraToGray32(vec_perm(_bgr[0], _bgr[1], K8_PERM_0)), BgraToGray32(vec_perm(_bgr[0], _bgr[1], K8_PERM_1))); const v128_u16 hi = vec_packsu( BgraToGray32(vec_perm(_bgr[1], _bgr[2], K8_PERM_2)), BgraToGray32(vec_perm(_bgr[1], _bgr[2], K8_PERM_3))); Store<align, first>(gray, vec_packsu(lo, hi)); }
static int pix_norm1_altivec(uint8_t *pix, int line_size) { int i, s = 0; const vector unsigned int zero = (const vector unsigned int) vec_splat_u32(0); vector unsigned char perm = vec_lvsl(0, pix); vector unsigned int sv = (vector unsigned int) vec_splat_u32(0); vector signed int sum; for (i = 0; i < 16; i++) { /* Read the potentially unaligned pixels. */ vector unsigned char pixl = vec_ld(0, pix); vector unsigned char pixr = vec_ld(15, pix); vector unsigned char pixv = vec_perm(pixl, pixr, perm); /* Square the values, and add them to our sum. */ sv = vec_msum(pixv, pixv, sv); pix += line_size; } /* Sum up the four partial sums, and put the result into s. */ sum = vec_sums((vector signed int) sv, (vector signed int) zero); sum = vec_splat(sum, 3); vec_ste(sum, 0, &s); return s; }
static int32_t scalarproduct_int16_altivec(const int16_t * v1, const int16_t * v2, int order, const int shift) { int i; LOAD_ZERO; register vec_s16 vec1, *pv; register vec_s32 res = vec_splat_s32(0), t; register vec_u32 shifts; int32_t ires; shifts = zero_u32v; if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1))); if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08)); if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04)); if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02)); if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01)); for(i = 0; i < order; i += 8){ pv = (vec_s16*)v1; vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1)); t = vec_msum(vec1, vec_ld(0, v2), zero_s32v); t = vec_sr(t, shifts); res = vec_sums(t, res); v1 += 8; v2 += 8; } res = vec_splat(res, 3); vec_ste(res, 0, &ires); return ires; }
static void _twin_fbdev_vec_put_span (twin_coord_t left, twin_coord_t top, twin_coord_t right, twin_argb32_t *pixels, void *closure) { twin_fbdev_t *tf = closure; twin_coord_t width = right - left; unsigned int *dest; vector unsigned char edgeperm; vector unsigned char src0v, src1v, srcv; if (!tf->active || tf->fb_base == MAP_FAILED) return; dest = (unsigned int *)(tf->fb_ptr + top * tf->fb_fix.line_length); dest += left; while((((unsigned long)dest) & 0xf) && width--) *(dest++) = *(pixels++); edgeperm = vec_lvsl (0, pixels); src0v = vec_ld (0, pixels); while(width >= 4) { src1v = vec_ld (16, pixels); srcv = vec_perm (src0v, src1v, edgeperm); vec_st ((vector unsigned int)srcv, 0, dest); src0v = src1v; dest += 4; pixels += 4; width -= 4; } while(width--) *(dest++) = *(pixels++); }
int pix_norm1_altivec(uint8_t *pix, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char *tv; vector unsigned char pixv; vector unsigned int sv; vector signed int sum; sv = (vector unsigned int)vec_splat_u32(0); s = 0; for (i = 0; i < 16; i++) { /* Read in the potentially unaligned pixels */ tv = (vector unsigned char *) pix; pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); /* Square the values, and add them to our sum */ sv = vec_msum(pixv, pixv, sv); pix += line_size; } /* Sum up the four partial sums, and put the result into s */ sum = vec_sums((vector signed int) sv, (vector signed int) zero); sum = vec_splat(sum, 3); vec_ste(sum, 0, &s); return s; }
void x264_sub8x8_dct_dc_altivec( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ) { vec_s16_t diff[2]; vec_s32_t sum[2]; vec_s32_t zero32 = vec_splat_s32(0); vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }; pix_diff( &pix1[0], &pix2[0], diff, 0 ); pix_diff( &pix1[4*FENC_STRIDE], &pix2[4*FDEC_STRIDE], diff, 1 ); sum[0] = vec_sum4s( diff[0], zero32 ); sum[1] = vec_sum4s( diff[1], zero32 ); diff[0] = vec_packs( sum[0], sum[1] ); sum[0] = vec_sum4s( diff[0], zero32 ); diff[0] = vec_packs( sum[0], zero32 ); diff[1] = vec_vsx_ld( 0, dct ); diff[0] = vec_perm( diff[0], diff[1], mask ); vec_vsx_st( diff[0], 0, dct ); /* 2x2 DC transform */ int d0 = dct[0] + dct[1]; int d1 = dct[2] + dct[3]; int d2 = dct[0] - dct[1]; int d3 = dct[2] - dct[3]; dct[0] = d0 + d1; dct[1] = d0 - d1; dct[2] = d2 + d3; dct[3] = d2 - d3; }
static int pix_sum_altivec(uint8_t * pix, int line_size) { const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm = vec_lvsl(0, pix); vector unsigned char t1; vector unsigned int sad; vector signed int sumdiffs; int i; int s; sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < 16; i++) { /* Read the potentially unaligned 16 pixels into t1 */ vector unsigned char pixl = vec_ld( 0, pix); vector unsigned char pixr = vec_ld(15, pix); t1 = vec_perm(pixl, pixr, perm); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t1, sad); pix += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
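/* Hedged sketch of the horizontal-reduction idiom shared by pix_sum_altivec and the pix_norm/sse routines above: vec_sums folds the four partial sums into element 3, vec_splat broadcasts it, and vec_ste stores that single lane to an aligned scalar. The helper name is illustrative only; assumes <altivec.h>. */
static inline int hsum_s32_sketch(vector signed int partial)
{
    int s __attribute__((aligned(16)));
    const vector signed int zero = vec_splat_s32(0);
    vector signed int sums = vec_sums(partial, zero);  /* total ends up in element 3 */
    sums = vec_splat(sums, 3);                         /* broadcast element 3        */
    vec_ste(sums, 0, &s);                              /* store one word back to s   */
    return s;
}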