static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW, int big_endian, int output_bits) { const int dst_u = -(uintptr_t)dest & 7; const int shift = 3; const int add = (1 << (shift - 1)); const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0); const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift); vector uint32_t v, v2; vector uint16_t vd; int i; yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0); for (i = dst_u; i < dstW - 7; i += 8) { v = vec_vsx_ld(0, (const uint32_t *) &src[i]); v = vec_add(v, vadd); v = vec_sr(v, vshift); v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]); v2 = vec_add(v2, vadd); v2 = vec_sr(v2, vshift); vd = vec_packsu(v, v2); vd = vec_rl(vd, vswap); vec_st(vd, 0, &dest[i]); } yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i); }
void imageFilterMean_Altivec(unsigned char *src1, unsigned char *src2, unsigned char *dst, int length) { int n = length; // Compute first few values so we're on a 16-byte boundary in dst while( (((long)dst & 0xF) > 0) && (n > 0) ) { MEAN_PIXEL(); --n; ++dst; ++src1; ++src2; } // Do bulk of processing using Altivec (find the mean of 16 8-bit unsigned integers, with saturation) vector unsigned char rshft = vec_splat_u8(0x1); while(n >= 16) { vector unsigned char s1 = vec_ld(0,src1); s1 = vec_sr(s1, rshft); // shift right 1 vector unsigned char s2 = vec_ld(0,src2); s2 = vec_sr(s2, rshft); // shift right 1 vector unsigned char r = vec_adds(s1, s2); vec_st(r,0,dst); n -= 16; src1 += 16; src2 += 16; dst += 16; } // If any bytes are left over, deal with them individually ++n; BASIC_MEAN(); }
void gimp_composite_multiply_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d,alpha_a,alpha_b,alpha; vector unsigned short al,ah; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); al=vec_mule(a,b); al=vec_add(al,ox0080); ah=vec_mulo(a,b); ah=vec_add(ah,ox0080); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } /* process last pixels */ length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); al=vec_mule(a,b); al=vec_add(al,ox0080); ah=vec_mulo(a,b); ah=vec_add(ah,ox0080); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnalignedLess(d, D, length); }
void jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, JDIMENSION width_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data) { int outrow, outcol; JDIMENSION output_cols = width_blocks * DCTSIZE; JSAMPROW inptr, outptr; __vector unsigned char this0, next0, out; __vector unsigned short this0e, this0o, next0e, next0o, outl, outh; /* Constants */ __vector unsigned short pw_bias = { __4X2(0, 1) }, pw_one = { __8X(1) }; __vector unsigned char even_odd_index = {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15}, pb_zero = { __16X(0) }; expand_right_edge(input_data, max_v_samp_factor, image_width, output_cols * 2); for (outrow = 0; outrow < v_samp_factor; outrow++) { outptr = output_data[outrow]; inptr = input_data[outrow]; for (outcol = output_cols; outcol > 0; outcol -= 16, inptr += 32, outptr += 16) { this0 = vec_ld(0, inptr); this0 = vec_perm(this0, this0, even_odd_index); this0e = (__vector unsigned short)VEC_UNPACKHU(this0); this0o = (__vector unsigned short)VEC_UNPACKLU(this0); outl = vec_add(this0e, this0o); outl = vec_add(outl, pw_bias); outl = vec_sr(outl, pw_one); if (outcol > 8) { next0 = vec_ld(16, inptr); next0 = vec_perm(next0, next0, even_odd_index); next0e = (__vector unsigned short)VEC_UNPACKHU(next0); next0o = (__vector unsigned short)VEC_UNPACKLU(next0); outh = vec_add(next0e, next0o); outh = vec_add(outh, pw_bias); outh = vec_sr(outh, pw_one); } else outh = vec_splat_u16(0); out = vec_pack(outl, outh); vec_st(out, 0, outptr); } } }
static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, int big_endian, int output_bits) { const int dst_u = -(uintptr_t)dest & 7; const int shift = 15 - output_bits; const int add = (1 << (shift - 1)); const int clip = (1 << output_bits) - 1; const vector uint16_t vadd = (vector uint16_t) {add, add, add, add, add, add, add, add}; const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0); const vector uint16_t vshift = (vector uint16_t) vec_splat_u16(shift); const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip}; vector uint16_t v; int i; yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0); for (i = dst_u; i < dstW - 7; i += 8) { v = vec_vsx_ld(0, (const uint16_t *) &src[i]); v = vec_add(v, vadd); v = vec_sr(v, vshift); v = vec_min(v, vlargest); v = vec_rl(v, vswap); vec_st(v, 0, &dest[i]); } yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); }
static int32_t scalarproduct_int16_altivec(const int16_t * v1, const int16_t * v2, int order, const int shift) { int i; LOAD_ZERO; register vec_s16 vec1, *pv; register vec_s32 res = vec_splat_s32(0), t; register vec_u32 shifts; int32_t ires; shifts = zero_u32v; if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1))); if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08)); if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04)); if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02)); if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01)); for(i = 0; i < order; i += 8){ pv = (vec_s16*)v1; vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1)); t = vec_msum(vec1, vec_ld(0, v2), zero_s32v); t = vec_sr(t, shifts); res = vec_sums(t, res); v1 += 8; v2 += 8; } res = vec_splat(res, 3); vec_ste(res, 0, &ires); return ires; }
SIMD_INLINE v128_u16 Interpolate(v128_u16 s[2][2], v128_u16 k[2][2]) { v128_u16 r = vec_mladd(s[0][0], k[0][0], K16_BILINEAR_ROUND_TERM); r = vec_mladd(s[0][1], k[0][1], r); r = vec_mladd(s[1][0], k[1][0], r); r = vec_mladd(s[1][1], k[1][1], r); return vec_sr(r, K16_BILINEAR_SHIFT); }
SIMD_INLINE v128_u32 BgraToGray32(v128_u8 bgra) { const v128_u16 _b_r = vec_mule(bgra, K8_01); const v128_u16 _g_a = vec_mulo(bgra, K8_01); const v128_u32 weightedSum = vec_add(vec_mule(_g_a, K16_GREEN_0000), vec_add(vec_mule(_b_r, K16_BLUE_RED), vec_mulo(_b_r, K16_BLUE_RED))); return vec_sr(vec_add(weightedSum, K32_ROUND_TERM), K32_SHIFT); }
uint32_t quant_h263_inter_altivec_c(int16_t *coeff, int16_t *data, const uint32_t quant, const uint16_t *mpeg_quant_matrices) { vector unsigned char zerovec; vector unsigned short mult; vector unsigned short quant_m_2; vector unsigned short quant_d_2; vector unsigned short sum_short; vector signed short acLevel; vector unsigned int even; vector unsigned int odd; vector bool short m2_mask; vector bool short zero_mask; uint32_t result; #ifdef DEBUG if(((unsigned)coeff) & 0x15) fprintf(stderr, "quant_h263_inter_altivec_c:incorrect align, coeff: %lx\n", (long)coeff); #endif /* initialisation stuff */ zerovec = vec_splat_u8(0); *((unsigned short*)&mult) = (unsigned short)multipliers[quant]; mult = vec_splat(mult, 0); *((unsigned short*)&quant_m_2) = (unsigned short)quant; quant_m_2 = vec_splat(quant_m_2, 0); quant_m_2 = vec_sl(quant_m_2, vec_splat_u16(1)); *((unsigned short*)&quant_d_2) = (unsigned short)quant; quant_d_2 = vec_splat(quant_d_2, 0); quant_d_2 = vec_sr(quant_d_2, vec_splat_u16(1)); sum_short = (vector unsigned short)zerovec; /* Quantize */ QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); QUANT_H263_INTER_ALTIVEC(); /* Calculate the return value */ even = (vector unsigned int)vec_sum4s((vector signed short)sum_short, (vector signed int)zerovec); even = (vector unsigned int)vec_sums((vector signed int)even, (vector signed int)zerovec); even = vec_splat(even, 3); vec_ste(even, 0, &result); return result; }
static force_inline vector unsigned int pix_multiply (vector unsigned int p, vector unsigned int a) { vector unsigned short hi, lo, mod; /* unpack to short */ hi = (vector unsigned short) vec_mergeh ((vector unsigned char)AVV (0), (vector unsigned char)p); mod = (vector unsigned short) vec_mergeh ((vector unsigned char)AVV (0), (vector unsigned char)a); hi = vec_mladd (hi, mod, (vector unsigned short) AVV (0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080)); hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8))); hi = vec_sr (hi, vec_splat_u16 (8)); /* unpack to short */ lo = (vector unsigned short) vec_mergel ((vector unsigned char)AVV (0), (vector unsigned char)p); mod = (vector unsigned short) vec_mergel ((vector unsigned char)AVV (0), (vector unsigned char)a); lo = vec_mladd (lo, mod, (vector unsigned short) AVV (0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080)); lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8))); lo = vec_sr (lo, vec_splat_u16 (8)); return (vector unsigned int)vec_packsu (hi, lo); }
/* this code assume that stride % 16 == 0 */ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { signed int ABCD[4] __attribute__((aligned(16))) = {((8 - x) * (8 - y)), ((x) * (8 - y)), ((8 - x) * (y)), ((x) * (y))}; register int i; vector unsigned char fperm; const vector signed int vABCD = vec_ld(0, ABCD); const vector signed short vA = vec_splat((vector signed short)vABCD, 1); const vector signed short vB = vec_splat((vector signed short)vABCD, 3); const vector signed short vC = vec_splat((vector signed short)vABCD, 5); const vector signed short vD = vec_splat((vector signed short)vABCD, 7); const vector signed int vzero = vec_splat_s32(0); const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); const vector unsigned short v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; vector unsigned char vsrc0uc, vsrc1uc; vector signed short vsrc0ssH, vsrc1ssH; vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; vector signed short vsrc2ssH, vsrc3ssH, psum; vector unsigned char vdst, ppsum, fsum; if (((unsigned long)dst) % 16 == 0) { fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); } else { fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); } vsrcAuc = vec_ld(0, src); if (loadSecond) vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) vsrc1uc = vsrcBuc; else vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc); vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc); if (!loadSecond) {// -> !reallyBadAlign for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v28ss, psum); psum = vec_sra(psum, v6us); vdst = vec_ld(0, dst); ppsum = (vector unsigned char)vec_packsu(psum, psum); fsum = vec_perm(vdst, ppsum, fperm); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } else { vector unsigned char vsrcDuc; for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrcDuc = vec_ld(stride + 16, src); vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); if (reallyBadAlign) vsrc3uc = vsrcDuc; else vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v28ss, psum); psum = vec_sr(psum, v6us); vdst = vec_ld(0, dst); ppsum = (vector unsigned char)vec_pack(psum, psum); fsum = vec_perm(vdst, ppsum, fperm); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } }
static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; int s; uint8_t *pix3 = pix2 + line_size; const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); vector unsigned char avgv, t5; vector unsigned char perm1 = vec_lvsl(0, pix2); vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); vector unsigned char pix2l, pix2r; vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; vector unsigned short avghv, avglv; vector unsigned short t1, t2, t3, t4; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); s = 0; /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one iteration becomes pix2 in the next iteration. We can use this fact to avoid a potentially expensive unaligned read, as well as some splitting, and vector addition each time around the loop. Read unaligned pixels into our vectors. The vectors are as follows: pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] Split the pixel vectors into shorts */ pix2l = vec_ld( 0, pix2); pix2r = vec_ld(16, pix2); pix2v = vec_perm(pix2l, pix2r, perm1); pix2iv = vec_perm(pix2l, pix2r, perm2); pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); t1 = vec_add(pix2hv, pix2ihv); t2 = vec_add(pix2lv, pix2ilv); for (i = 0; i < h; i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] */ pix1v = vec_ld(0, pix1); pix2l = vec_ld( 0, pix3); pix2r = vec_ld(16, pix3); pix3v = vec_perm(pix2l, pix2r, perm1); pix3iv = vec_perm(pix2l, pix2r, perm2); /* Note that AltiVec does have vec_avg, but this works on vector pairs and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. Instead, we have to split the pixel vectors into vectors of shorts, and do the averaging by hand. */ /* Split the pixel vectors into shorts */ pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); /* Do the averaging on them */ t3 = vec_add(pix3hv, pix3ihv); t4 = vec_add(pix3lv, pix3ilv); avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); /* Pack the shorts back into a result */ avgv = vec_pack(avghv, avglv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix3 += line_size; /* Transfer the calculated values for pix3 into pix2 */ t1 = t3; t2 = t4; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
SIMD_INLINE v128_u16 DivideBy16(v128_u16 value) { return vec_sr(vec_add(value, K16_0008), K16_0004); }
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) { POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) = {rounder, rounder, rounder, rounder, rounder, rounder, rounder, rounder}; const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) = { (16-x16)*(16-y16), /* A */ ( x16)*(16-y16), /* B */ (16-x16)*( y16), /* C */ ( x16)*( y16), /* D */ 0, 0, 0, 0 /* padding */ }; register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); register const_vector unsigned short vcsr8 = (const_vector unsigned short)vec_splat_u16(8); register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD; int i; unsigned long dst_odd = (unsigned long)dst & 0x0000000F; unsigned long src_really_odd = (unsigned long)src & 0x0000000F; POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); tempA = vec_ld(0, (unsigned short*)ABCD); Av = vec_splat(tempA, 0); Bv = vec_splat(tempA, 1); Cv = vec_splat(tempA, 2); Dv = vec_splat(tempA, 3); rounderV = vec_ld(0, (unsigned short*)rounder_a); // we'll be able to pick-up our 9 char elements // at src from those 32 bytes // we load the first batch here, as inside the loop // we can re-use 'src+stride' from one iteration // as the 'src' of the next. src_0 = vec_ld(0, src); src_1 = vec_ld(16, src); srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); if (src_really_odd != 0x0000000F) { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); } else { srcvB = src_1; } srcvA = vec_mergeh(vczero, srcvA); srcvB = vec_mergeh(vczero, srcvB); for(i=0; i<h; i++) { dst_odd = (unsigned long)dst & 0x0000000F; src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; dstv = vec_ld(0, dst); // we we'll be able to pick-up our 9 char elements // at src + stride from those 32 bytes // then reuse the resulting 2 vectors srvcC and srcvD // as the next srcvA and srcvB src_0 = vec_ld(stride + 0, src); src_1 = vec_ld(stride + 16, src); srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); if (src_really_odd != 0x0000000F) { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); } else { srcvD = src_1; } srcvC = vec_mergeh(vczero, srcvC); srcvD = vec_mergeh(vczero, srcvD); // OK, now we (finally) do the math :-) // those four instructions replaces 32 int muls & 32 int adds. // isn't AltiVec nice ? tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); srcvA = srcvC; srcvB = srcvD; tempD = vec_sr(tempD, vcsr8); dstv2 = vec_pack(tempD, (vector unsigned short)vczero); if (dst_odd) { dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); } else { dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); } vec_st(dstv2, 0, dst); dst += stride; src += stride; } POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); }
/* this code assume that stride % 16 == 0 */ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); signed int ABCD[4] __attribute__((aligned(16))); register int i; ABCD[0] = ((8 - x) * (8 - y)); ABCD[1] = ((x) * (8 - y)); ABCD[2] = ((8 - x) * (y)); ABCD[3] = ((x) * (y)); const vector signed int vABCD = vec_ld(0, ABCD); const vector signed short vA = vec_splat((vector signed short)vABCD, 1); const vector signed short vB = vec_splat((vector signed short)vABCD, 3); const vector signed short vC = vec_splat((vector signed short)vABCD, 5); const vector signed short vD = vec_splat((vector signed short)vABCD, 7); const vector signed int vzero = vec_splat_s32(0); const vector signed short v32ss = (const vector signed short)AVV(32); const vector unsigned short v6us = vec_splat_u16(6); vector unsigned char fperm; if (((unsigned long)dst) % 16 == 0) { fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); } else { fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); } register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vector unsigned char vsrcAuc; vector unsigned char vsrcBuc; vector unsigned char vsrcperm0; vector unsigned char vsrcperm1; vsrcAuc = vec_ld(0, src); if (loadSecond) vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vector unsigned char vsrc0uc; vector unsigned char vsrc1uc; vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) vsrc1uc = vsrcBuc; else vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vector signed short vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc); vector signed short vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc); if (!loadSecond) {// -> !reallyBadAlign for (i = 0 ; i < h ; i++) { vector unsigned char vsrcCuc; vsrcCuc = vec_ld(stride + 0, src); vector unsigned char vsrc2uc; vector unsigned char vsrc3uc; vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); vector signed short psum; psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v32ss, psum); psum = vec_sra(psum, v6us); vector unsigned char vdst = vec_ld(0, dst); vector unsigned char ppsum = (vector unsigned char)vec_packsu(psum, psum); vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, vfdst, vdst); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } else { for (i = 0 ; i < h ; i++) { vector unsigned char vsrcCuc; vector unsigned char vsrcDuc; vsrcCuc = vec_ld(stride + 0, src); vsrcDuc = vec_ld(stride + 16, src); vector unsigned char vsrc2uc; vector unsigned char vsrc3uc; vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); if (reallyBadAlign) vsrc3uc = vsrcDuc; else vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); vector signed short psum; psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v32ss, psum); psum = vec_sr(psum, v6us); vector unsigned char vdst = vec_ld(0, dst); vector unsigned char ppsum = (vector unsigned char)vec_pack(psum, psum); vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, vfdst, vdst); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); }
/* this code assume that stride % 16 == 0 */ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); DECLARE_ALIGNED_16(signed int, ABCD[4]) = {((8 - x) * (8 - y)), ((x) * (8 - y)), ((8 - x) * (y)), ((x) * (y))}; register int i; vec_u8_t fperm; const vec_s32_t vABCD = vec_ld(0, ABCD); const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); LOAD_ZERO; const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); const vec_u16_t v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; vec_u8_t vsrc0uc, vsrc1uc; vec_s16_t vsrc0ssH, vsrc1ssH; vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; vec_s16_t vsrc2ssH, vsrc3ssH, psum; vec_u8_t vdst, ppsum, vfdst, fsum; POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); if (((unsigned long)dst) % 16 == 0) { fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); } else { fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); } vsrcAuc = vec_ld(0, src); if (loadSecond) vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) vsrc1uc = vsrcBuc; else vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc); vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc); if (!loadSecond) {// -> !reallyBadAlign for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc); vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v32ss, psum); psum = vec_sra(psum, v6us); vdst = vec_ld(0, dst); ppsum = (vec_u8_t)vec_packsu(psum, psum); vfdst = vec_perm(vdst, ppsum, fperm); OP_U8_ALTIVEC(fsum, vfdst, vdst); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } else { vec_u8_t vsrcDuc; for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrcDuc = vec_ld(stride + 16, src); vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); if (reallyBadAlign) vsrc3uc = vsrcDuc; else vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc); vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v32ss, psum); psum = vec_sr(psum, v6us); vdst = vec_ld(0, dst); ppsum = (vec_u8_t)vec_pack(psum, psum); vfdst = vec_perm(vdst, ppsum, fperm); OP_U8_ALTIVEC(fsum, vfdst, vdst); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); }
SIMD_INLINE v128_s16 Average(const v128_u8 & s0, const v128_u8 & s1) { return (v128_s16)vec_sr(vec_add(vec_add( vec_add(vec_mule(s0, K8_01), vec_mulo(s0, K8_01)), vec_add(vec_mule(s1, K8_01), vec_mulo(s1, K8_01))), K16_0002), K16_0002); }
SIMD_INLINE v128_s16 Average(const v128_u8 & s) { return (v128_s16)vec_sr(vec_add(vec_add(vec_mule(s, K8_01), vec_mulo(s, K8_01)), K16_0001), K16_0001); }
void gimp_composite_blend_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; guchar blend = ctx->blend.blend; union { vector unsigned char v; unsigned char u8[16]; } vblend; vector unsigned char vblendc; vector unsigned char a,b,d; vector unsigned short al,ah,bl,bh,one=vec_splat_u16(1); guchar tmp; for (tmp=0; tmp<16; tmp++ ) vblend.u8[tmp]=blend; vblendc=vec_nor(vblend.v,vblend.v); while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); /* dest[b] = (src1[b] * blend2 + src2[b] * blend) / 255; * to divide by 255 we use ((n+1)+(n+1)>>8)>>8 * It works for all value but 0xffff * happily blending formula can't give this value */ al=vec_mule(a,vblendc); ah=vec_mulo(a,vblendc); bl=vec_mule(b,vblend.v); bh=vec_mulo(b,vblend.v); al=vec_add(al,bl); al=vec_add(al,one); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,bh); ah=vec_add(ah,one); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } /* process last pixels */ length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); al=vec_mule(a,vblendc); ah=vec_mulo(a,vblendc); bl=vec_mule(b,vblend.v); bh=vec_mulo(b,vblend.v); al=vec_add(al,bl); al=vec_add(al,one); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,bh); ah=vec_add(ah,one); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); StoreUnalignedLess(d, D, length); }
vector unsigned short testsr_unsigned (vector unsigned short x, vector unsigned short y) { return vec_sr (x, y); }
// CHECK-LABEL: define void @test1 void test1() { /* vec_cmpeq */ res_vbll = vec_cmpeq(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd // CHECK-LE: @llvm.ppc.altivec.vcmpequd // CHECK-PPC: error: call to 'vec_cmpeq' is ambiguous res_vbll = vec_cmpeq(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd // CHECK-LE: @llvm.ppc.altivec.vcmpequd // CHECK-PPC: error: call to 'vec_cmpeq' is ambiguous /* vec_cmpgt */ res_vbll = vec_cmpgt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd // CHECK-PPC: error: call to 'vec_cmpgt' is ambiguous res_vbll = vec_cmpgt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud // CHECK-LE: @llvm.ppc.altivec.vcmpgtud // CHECK-PPC: error: call to 'vec_cmpgt' is ambiguous /* ----------------------- predicates --------------------------- */ /* vec_all_eq */ res_i = vec_all_eq(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous /* vec_all_ne */ res_i = vec_all_ne(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous /* vec_any_eq */ res_i = vec_any_eq(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous /* vec_any_ne */ res_i = vec_any_ne(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous /* vec_all_ge */ res_i = vec_all_ge(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous /* vec_all_gt */ res_i = vec_all_gt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous /* vec_all_le */ res_i = vec_all_le(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous /* vec_all_lt */ res_i = vec_all_lt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous /* vec_any_ge */ res_i = vec_any_ge(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous /* vec_any_gt */ res_i = vec_any_gt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous /* vec_any_le */ res_i = vec_any_le(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous /* vec_any_lt */ res_i = vec_any_lt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous /* vec_max */ res_vsll = vec_max(vsll, vsll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vsll = vec_max(vbll, vsll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vsll = vec_max(vsll, vbll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vull = vec_max(vull, vull); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vull = vec_max(vbll, vull); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vull = vec_max(vull, vbll); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud // CHECK-PPC: error: call to 'vec_max' is ambiguous /* vec_min */ res_vsll = vec_min(vsll, vsll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vsll = vec_min(vbll, vsll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vsll = vec_min(vsll, vbll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vull = vec_min(vull, vull); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vull = vec_min(vbll, vull); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vull = vec_min(vull, vbll); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud // CHECK-PPC: error: call to 'vec_min' is ambiguous /* vec_mule */ res_vsll = vec_mule(vi, vi); // CHECK: @llvm.ppc.altivec.vmulesw // CHECK-LE: @llvm.ppc.altivec.vmulosw // CHECK-PPC: error: call to 'vec_mule' is ambiguous res_vull = vec_mule(vui , vui); // CHECK: @llvm.ppc.altivec.vmuleuw // CHECK-LE: @llvm.ppc.altivec.vmulouw // CHECK-PPC: error: call to 'vec_mule' is ambiguous /* vec_mulo */ res_vsll = vec_mulo(vi, vi); // CHECK: @llvm.ppc.altivec.vmulosw // CHECK-LE: @llvm.ppc.altivec.vmulesw // CHECK-PPC: error: call to 'vec_mulo' is ambiguous res_vull = vec_mulo(vui, vui); // CHECK: @llvm.ppc.altivec.vmulouw // CHECK-LE: @llvm.ppc.altivec.vmuleuw // CHECK-PPC: error: call to 'vec_mulo' is ambiguous /* vec_packs */ res_vi = vec_packs(vsll, vsll); // CHECK: @llvm.ppc.altivec.vpksdss // CHECK-LE: @llvm.ppc.altivec.vpksdss // CHECK-PPC: error: call to 'vec_packs' is ambiguous res_vui = vec_packs(vull, vull); // CHECK: @llvm.ppc.altivec.vpkudus // CHECK-LE: @llvm.ppc.altivec.vpkudus // CHECK-PPC: error: call to 'vec_packs' is ambiguous /* vec_packsu */ res_vui = vec_packsu(vsll, vsll); // CHECK: @llvm.ppc.altivec.vpksdus // CHECK-LE: @llvm.ppc.altivec.vpksdus // CHECK-PPC: error: call to 'vec_packsu' is ambiguous res_vui = vec_packsu(vull, vull); // CHECK: @llvm.ppc.altivec.vpkudus // CHECK-LE: @llvm.ppc.altivec.vpkudus // CHECK-PPC: error: call to 'vec_packsu' is ambiguous /* vec_rl */ res_vsll = vec_rl(vsll, vull); // CHECK: @llvm.ppc.altivec.vrld // CHECK-LE: @llvm.ppc.altivec.vrld // CHECK-PPC: error: call to 'vec_rl' is ambiguous res_vull = vec_rl(vull, vull); // CHECK: @llvm.ppc.altivec.vrld // CHECK-LE: @llvm.ppc.altivec.vrld // CHECK-PPC: error: call to 'vec_rl' is ambiguous /* vec_sl */ res_vsll = vec_sl(vsll, vull); // CHECK: shl <2 x i64> // CHECK-LE: shl <2 x i64> // CHECK-PPC: error: call to 'vec_sl' is ambiguous res_vull = vec_sl(vull, vull); // CHECK: shl <2 x i64> // CHECK-LE: shl <2 x i64> // CHECK-PPC: error: call to 'vec_sl' is ambiguous /* vec_sr */ res_vsll = vec_sr(vsll, vull); // CHECK: ashr <2 x i64> // CHECK-LE: ashr <2 x i64> // CHECK-PPC: error: call to 'vec_sr' is ambiguous res_vull = vec_sr(vull, vull); // CHECK: lshr <2 x i64> // CHECK-LE: lshr <2 x i64> // CHECK-PPC: error: call to 'vec_sr' is ambiguous /* vec_sra */ res_vsll = vec_sra(vsll, vull); // CHECK: ashr <2 x i64> // CHECK-LE: ashr <2 x i64> // CHECK-PPC: error: call to 'vec_sra' is ambiguous res_vull = vec_sra(vull, vull); // CHECK: ashr <2 x i64> // CHECK-LE: ashr <2 x i64> // CHECK-PPC: error: call to 'vec_sra' is ambiguous /* vec_unpackh */ res_vsll = vec_unpackh(vi); // CHECK: llvm.ppc.altivec.vupkhsw // CHECK-LE: llvm.ppc.altivec.vupklsw // CHECK-PPC: error: call to 'vec_unpackh' is ambiguous res_vbll = vec_unpackh(vbi); // CHECK: llvm.ppc.altivec.vupkhsw // CHECK-LE: llvm.ppc.altivec.vupklsw // CHECK-PPC: error: call to 'vec_unpackh' is ambiguous /* vec_unpackl */ res_vsll = vec_unpackl(vi); // CHECK: llvm.ppc.altivec.vupklsw // CHECK-LE: llvm.ppc.altivec.vupkhsw // CHECK-PPC: error: call to 'vec_unpackl' is ambiguous res_vbll = vec_unpackl(vbi); // CHECK: llvm.ppc.altivec.vupklsw // CHECK-LE: llvm.ppc.altivec.vupkhsw // CHECK-PPC: error: call to 'vec_unpackl' is ambiguous /* vec_vpksdss */ res_vi = vec_vpksdss(vsll, vsll); // CHECK: llvm.ppc.altivec.vpksdss // CHECK-LE: llvm.ppc.altivec.vpksdss // CHECK-PPC: warning: implicit declaration of function 'vec_vpksdss' /* vec_vpksdus */ res_vui = vec_vpksdus(vsll, vsll); // CHECK: llvm.ppc.altivec.vpksdus // CHECK-LE: llvm.ppc.altivec.vpksdus // CHECK-PPC: warning: implicit declaration of function 'vec_vpksdus' /* vec_vpkudum */ res_vi = vec_vpkudum(vsll, vsll); // CHECK: vperm // CHECK-LE: vperm // CHECK-PPC: warning: implicit declaration of function 'vec_vpkudum' res_vui = vec_vpkudum(vull, vull); // CHECK: vperm // CHECK-LE: vperm res_vui = vec_vpkudus(vull, vull); // CHECK: llvm.ppc.altivec.vpkudus // CHECK-LE: llvm.ppc.altivec.vpkudus // CHECK-PPC: warning: implicit declaration of function 'vec_vpkudus' /* vec_vupkhsw */ res_vsll = vec_vupkhsw(vi); // CHECK: llvm.ppc.altivec.vupkhsw // CHECK-LE: llvm.ppc.altivec.vupklsw // CHECK-PPC: warning: implicit declaration of function 'vec_vupkhsw' res_vbll = vec_vupkhsw(vbi); // CHECK: llvm.ppc.altivec.vupkhsw // CHECK-LE: llvm.ppc.altivec.vupklsw /* vec_vupklsw */ res_vsll = vec_vupklsw(vi); // CHECK: llvm.ppc.altivec.vupklsw // CHECK-LE: llvm.ppc.altivec.vupkhsw // CHECK-PPC: warning: implicit declaration of function 'vec_vupklsw' res_vbll = vec_vupklsw(vbi); // CHECK: llvm.ppc.altivec.vupklsw // CHECK-LE: llvm.ppc.altivec.vupkhsw /* vec_max */ res_vsll = vec_max(vsll, vsll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd res_vsll = vec_max(vbll, vsll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd res_vsll = vec_max(vsll, vbll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd res_vull = vec_max(vull, vull); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud res_vull = vec_max(vbll, vull); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud /* vec_min */ res_vsll = vec_min(vsll, vsll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd res_vsll = vec_min(vbll, vsll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd res_vsll = vec_min(vsll, vbll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd res_vull = vec_min(vull, vull); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud res_vull = vec_min(vbll, vull); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud }
static inline vec_uint4 vec_SHR(unsigned int n, vec_uint4 x) { return vec_sr(x, vec_splat_u32(n)); }
void x264_zigzag_interleave_8x8_cavlc_altivec( int16_t *dst, int16_t *src, uint8_t *nnz ) { vec_s16_t tmpv[8]; vec_s16_t merge[2]; vec_s16_t permv[2]; vec_s16_t orv[4]; vec_s16_t src0v = vec_ld( 0*16, src ); vec_s16_t src1v = vec_ld( 1*16, src ); vec_s16_t src2v = vec_ld( 2*16, src ); vec_s16_t src3v = vec_ld( 3*16, src ); vec_s16_t src4v = vec_ld( 4*16, src ); vec_s16_t src5v = vec_ld( 5*16, src ); vec_s16_t src6v = vec_ld( 6*16, src ); vec_s16_t src7v = vec_ld( 7*16, src ); vec_u8_t pack; vec_u8_t nnzv = vec_vsx_ld( 0, nnz ); vec_u8_t shift = vec_splat_u8( 7 ); LOAD_ZERO; const vec_u8_t mask[3] = { { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }, { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }, { 0x10, 0x11, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x12, 0x13, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F } }; tmpv[0] = vec_mergeh( src0v, src1v ); tmpv[1] = vec_mergel( src0v, src1v ); tmpv[2] = vec_mergeh( src2v, src3v ); tmpv[3] = vec_mergel( src2v, src3v ); tmpv[4] = vec_mergeh( src4v, src5v ); tmpv[5] = vec_mergel( src4v, src5v ); tmpv[6] = vec_mergeh( src6v, src7v ); tmpv[7] = vec_mergel( src6v, src7v ); merge[0] = vec_mergeh( tmpv[0], tmpv[1] ); merge[1] = vec_mergeh( tmpv[2], tmpv[3] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[1] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 0*16, dst ); merge[0] = vec_mergeh( tmpv[4], tmpv[5] ); merge[1] = vec_mergeh( tmpv[6], tmpv[7] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[2] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 1*16, dst ); vec_st( permv[1], 2*16, dst ); vec_st( permv[2], 3*16, dst ); merge[0] = vec_mergel( tmpv[0], tmpv[1] ); merge[1] = vec_mergel( tmpv[2], tmpv[3] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[1] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 4*16, dst ); merge[0] = vec_mergel( tmpv[4], tmpv[5] ); merge[1] = vec_mergel( tmpv[6], tmpv[7] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[2] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 5*16, dst ); vec_st( permv[1], 6*16, dst ); vec_st( permv[2], 7*16, dst ); orv[0] = vec_or( src0v, src1v ); orv[1] = vec_or( src2v, src3v ); orv[2] = vec_or( src4v, src5v ); orv[3] = vec_or( src6v, src7v ); permv[0] = vec_or( orv[0], orv[1] ); permv[1] = vec_or( orv[2], orv[3] ); permv[0] = vec_or( permv[0], permv[1] ); permv[1] = vec_perm( permv[0], permv[0], mask[1] ); permv[0] = vec_or( permv[0], permv[1] ); pack = (vec_u8_t)vec_packs( permv[0], permv[0] ); pack = (vec_u8_t)vec_cmpeq( pack, zerov ); pack = vec_nor( pack, zerov ); pack = vec_sr( pack, shift ); nnzv = vec_perm( nnzv, pack, mask[2] ); vec_st( nnzv, 0, nnz ); }
template <> SIMD_INLINE v128_u16 DivideBy16<false>(v128_u16 value) { return vec_sr(value, K16_0004); }
template <> SIMD_INLINE v128_u16 DivideBy16<true>(v128_u16 value) { return vec_sr(vec_add(value, K16_0008), K16_0004); }
/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8 * to preserve proper dst alignment. */ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) { int i; const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder; const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = { (16 - x16) * (16 - y16), /* A */ (x16) * (16 - y16), /* B */ (16 - x16) * (y16), /* C */ (x16) * (y16), /* D */ 0, 0, 0, 0 /* padding */ }; register const vector unsigned char vczero = (const vector unsigned char) vec_splat_u8(0); register const vector unsigned short vcsr8 = (const vector unsigned short) vec_splat_u16(8); register vector unsigned char dstv, dstv2, srcvB, srcvC, srcvD; register vector unsigned short tempB, tempC, tempD; unsigned long dst_odd = (unsigned long) dst & 0x0000000F; unsigned long src_really_odd = (unsigned long) src & 0x0000000F; register vector unsigned short tempA = vec_ld(0, (const unsigned short *) ABCD); register vector unsigned short Av = vec_splat(tempA, 0); register vector unsigned short Bv = vec_splat(tempA, 1); register vector unsigned short Cv = vec_splat(tempA, 2); register vector unsigned short Dv = vec_splat(tempA, 3); register vector unsigned short rounderV = vec_splat((vec_u16) vec_lde(0, &rounder_a), 0); /* we'll be able to pick-up our 9 char elements at src from those * 32 bytes we load the first batch here, as inside the loop we can * reuse 'src + stride' from one iteration as the 'src' of the next. */ register vector unsigned char src_0 = vec_ld(0, src); register vector unsigned char src_1 = vec_ld(16, src); register vector unsigned char srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); if (src_really_odd != 0x0000000F) /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned * on the second vector. */ srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); else srcvB = src_1; srcvA = vec_mergeh(vczero, srcvA); srcvB = vec_mergeh(vczero, srcvB); for (i = 0; i < h; i++) { dst_odd = (unsigned long) dst & 0x0000000F; src_really_odd = (((unsigned long) src) + stride) & 0x0000000F; dstv = vec_ld(0, dst); /* We'll be able to pick-up our 9 char elements at src + stride from * those 32 bytes then reuse the resulting 2 vectors srvcC and srcvD * as the next srcvA and srcvB. */ src_0 = vec_ld(stride + 0, src); src_1 = vec_ld(stride + 16, src); srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); if (src_really_odd != 0x0000000F) /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned * on the second vector. */ srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); else srcvD = src_1; srcvC = vec_mergeh(vczero, srcvC); srcvD = vec_mergeh(vczero, srcvD); /* OK, now we (finally) do the math :-) * Those four instructions replace 32 int muls & 32 int adds. * Isn't AltiVec nice? */ tempA = vec_mladd((vector unsigned short) srcvA, Av, rounderV); tempB = vec_mladd((vector unsigned short) srcvB, Bv, tempA); tempC = vec_mladd((vector unsigned short) srcvC, Cv, tempB); tempD = vec_mladd((vector unsigned short) srcvD, Dv, tempC); srcvA = srcvC; srcvB = srcvD; tempD = vec_sr(tempD, vcsr8); dstv2 = vec_pack(tempD, (vector unsigned short) vczero); if (dst_odd) dstv2 = vec_perm(dstv, dstv2, vcprm(0, 1, s0, s1)); else dstv2 = vec_perm(dstv, dstv2, vcprm(s0, s1, 2, 3)); vec_st(dstv2, 0, dst); dst += stride; src += stride; } }
SIMD_INLINE v128_u16 Average16(const v128_u16 & s00, const v128_u16 & s01, const v128_u16 & s10, const v128_u16 & s11) { return vec_sr(vec_add(vec_add(vec_add(s00, s01), vec_add(s10, s11)), K16_0002), K16_0002); }