void ff_h264_idct8_add_altivec(uint8_t *dst, DCTELEM *dct, int stride)
{
    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);

    const vec_u16_t onev = vec_splat_u16(1);
    const vec_u16_t twov = vec_splat_u16(2);
    const vec_u16_t sixv = vec_splat_u16(6);

    const vec_u8_t sel = (vec_u8_t) AVV( 0, 0, 0, 0, 0, 0, 0, 0,
                                        -1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = vec_ld(0x00, (int16_t*)dct);
    s1 = vec_ld(0x10, (int16_t*)dct);
    s2 = vec_ld(0x20, (int16_t*)dct);
    s3 = vec_ld(0x30, (int16_t*)dct);
    s4 = vec_ld(0x40, (int16_t*)dct);
    s5 = vec_ld(0x50, (int16_t*)dct);
    s6 = vec_ld(0x60, (int16_t*)dct);
    s7 = vec_ld(0x70, (int16_t*)dct);

    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8(d0, d1, d2, d3, d4, d5, d6, d7);

    IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}
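/* For reference: each ALTIVEC_STORE_SUM_CLIP above adds one 8-sample IDCT row
 * to dst with unsigned-byte clipping; perm_ldv/perm_stv/sel realign the 8
 * result bytes to an arbitrarily aligned dst. A scalar sketch of the per-row
 * semantics (illustrative helper, not the actual FFmpeg macro): */
static inline void store_sum_clip_row_ref(uint8_t *dst, const int16_t *row)
{
    for (int x = 0; x < 8; x++) {
        int v = dst[x] + (row[x] >> 6); /* the dct[0] += 32 above supplies rounding */
        dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
    }
}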
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block,
                                                       int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        dc16 = vec_sld(dc16, zero_s16v, 8);
    /* saturating-add of +dc and saturating-subtract of -dc together emulate
     * a signed add on unsigned bytes */
    dcplus  = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
    dcplus  = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst+0*stride);
        v1 = vec_ld(0, dst+1*stride);
        v2 = vec_ld(0, dst+2*stride);
        v3 = vec_ld(0, dst+3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst+0*stride);
        vec_st(v1, 0, dst+1*stride);
        vec_st(v2, 0, dst+2*stride);
        vec_st(v3, 0, dst+3*stride);

        dst += 4*stride;
    }
}
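/* In FFmpeg this internal routine is instantiated for the 4x4 and 8x8 DC
 * cases by thin wrappers along these lines (names as in h264_altivec.c;
 * treat as a sketch if your tree differs): */
static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
{
    h264_idct_dc_add_internal(dst, block, stride, 4);
}

static void h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
{
    h264_idct_dc_add_internal(dst, block, stride, 8);
}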
static inline void avg_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1,
                                           const uint8_t *src2, int dst_stride,
                                           int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {
        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);
        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);
        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));

        edges = vec_perm(tmp2, tmp1, mask);
        align = vec_lvsr(0, dst);
        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0, dst);

        dst += dst_stride;
    }
}
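/* Scalar sketch of the above; vec_avg is the rounded average (a+b+1)>>1
 * (illustrative reference, not part of the original source): */
static inline void avg_pixels16_l2_ref(uint8_t *dst, const uint8_t *src1,
                                       const uint8_t *src2, int dst_stride,
                                       int src_stride1, int h)
{
    for (int i = 0; i < h; i++) {
        for (int x = 0; x < 16; x++) {
            int ab = (src1[i * src_stride1 + x] + src2[i * 16 + x] + 1) >> 1;
            dst[x] = (dst[x] + ab + 1) >> 1;
        }
        dst += dst_stride;
    }
}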
static inline void put_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1,
                                           const uint8_t *src2, int dst_stride,
                                           int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, mask1, mask2;
#if HAVE_BIGENDIAN
    vec_u8 tmp1, tmp2, mask, edges, align;
    mask1 = vec_lvsl(0, src1);
    mask2 = vec_lvsl(0, src2);
    mask  = vec_lvsl(0, dst);
    align = vec_lvsr(0, dst);
#endif
    for (i = 0; i < h; i++) {
        a = load_with_perm_vec(i * src_stride1, src1, mask1);
        b = load_with_perm_vec(i * 16, src2, mask2);
        d = vec_avg(a, b);
        put_unaligned_store_with_mask_align(d, dst, mask, align);
        dst += dst_stride;
    }
}
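/* load_with_perm_vec is presumably the usual lvsl-based unaligned load on
 * big-endian: two aligned loads merged through vec_perm. A sketch, assuming
 * an FFmpeg-style helper (on little-endian the perm argument would be
 * ignored and a plain unaligned access used instead; the _sketch name is
 * ours, not the library's): */
static inline vec_u8 load_with_perm_vec_sketch(int offset, const uint8_t *src,
                                               vec_u8 perm_vec)
{
    vec_u8 lo = vec_ld(offset, src);        /* aligned vector holding the first bytes */
    vec_u8 hi = vec_ld(offset + 15, src);   /* aligned vector holding the last bytes  */
    return vec_perm(lo, hi, perm_vec);      /* rotate the 16 wanted bytes into place  */
}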
static void vector_fmul_add_altivec(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int len)
{
    int i;
    vector float d, s0, s1, s2, t0, t1, edges;
    vector unsigned char align = vec_lvsr(0, dst),
                         mask  = vec_lvsl(0, dst);

    for (i = 0; i < len - 3; i += 4) {
        t0 = vec_ld(0, dst + i);
        t1 = vec_ld(15, dst + i);
        s0 = vec_ld(0, src0 + i);
        s1 = vec_ld(0, src1 + i);
        s2 = vec_ld(0, src2 + i);
        edges = vec_perm(t1, t0, mask);
        d = vec_madd(s0, s1, s2);
        t1 = vec_perm(d, edges, align);
        t0 = vec_perm(edges, d, align);
        vec_st(t1, 15, dst + i);
        vec_st(t0, 0, dst + i);
    }
}
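/* What the loop computes, per element (illustrative scalar reference;
 * vec_madd is a fused multiply-add, and the t0/t1/edges dance is a
 * read-modify-write of the two aligned vectors straddling an unaligned dst): */
static void vector_fmul_add_ref(float *dst, const float *src0,
                                const float *src1, const float *src2, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}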
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;

    if (((long)dst) & 15) { //FIXME
        /* unaligned dst: merge each result into the two aligned vectors it straddles */
        for (i = 0; i < len - 7; i += 8) {
            d0 = vec_ld(0, dst + i);
            d  = float_to_int16_one_altivec(src + i);
            d1 = vec_ld(15, dst + i);
            d1 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
            align = vec_lvsr(0, dst + i);
            d0 = vec_perm(d1, d, align);
            d1 = vec_perm(d, d1, align);
            vec_st(d0,  0, dst + i);
            vec_st(d1, 15, dst + i);
        }
    } else {
        for (i = 0; i < len - 7; i += 8) {
            d = float_to_int16_one_altivec(src + i);
            vec_st(d, 0, dst + i);
        }
    }
}
void *mem_searchrn(void *s, size_t len)
{
    vector unsigned char v_cr;
    vector unsigned char v_nl;
    vector unsigned char v0;
    vector unsigned char v_perm;
    vector unsigned char c;
    vector bool char rr, rn;
    vector bool char last_rr;
    char *p;
    ssize_t k;
    size_t block_num;
    unsigned f;

    if (unlikely(!s || !len))
        return NULL;

    /* only do one prefetch, this covers nearly 128k */
    block_num = DIV_ROUNDUP(len, 512);
    f  = block_num >= 256 ? 0 : block_num << 16;
    f |= 512;
    vec_dst((const unsigned char *)s, f, 2);

    v_cr = vec_splat_u8('\r');
    v_nl = vec_splat_u8('\n');
    v0 = vec_splat_u8(0);
    last_rr = (vector bool char)v0;

    k = SOVUC - ALIGN_DOWN_DIFF(s, SOVUC) - (ssize_t)len;
    p = (char *)ALIGN_DOWN(s, SOVUC);
    c = vec_ldl(0, (const vector unsigned char *)p);
    if (unlikely(k > 0))
        goto K_SHIFT;
    v_perm = vec_lvsl(0, (unsigned char *)s);
    c = vec_perm(c, v0, v_perm);
    v_perm = vec_lvsr(0, (unsigned char *)s);
    c = vec_perm(v0, c, v_perm);
    rr = vec_cmpeq(c, v_cr);
    rn = vec_cmpeq(c, v_nl);
    k = -k;
    goto START_LOOP;

    do {
        p += SOVUC;
        c = vec_ldl(0, (const vector unsigned char *)p);
        k -= SOVUC;
        if (k > 0) {
            rr = vec_cmpeq(c, v_cr);
            rn = vec_cmpeq(c, v_nl);
            if (vec_any_eq(last_rr, rn)) {
                vec_dss(2);
                return p - 1;
            }
START_LOOP:
            last_rr = (vector bool char)vec_sld(v0, (vector unsigned char)rr, 1);
            rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
            rr = vec_and(rr, rn); /* get mask */
            if (vec_any_ne(rr, v0)) {
                vec_dss(2);
                return p + vec_zpos(rr);
            }
        }
    } while (k > 0);
    k = -k;
K_SHIFT:
    vec_dss(2);
    v_perm = vec_lvsr(0, (unsigned char *)k);
    c = vec_perm(v0, c, v_perm);
    v_perm = vec_lvsl(0, (unsigned char *)k);
    c = vec_perm(c, v0, v_perm);
    rr = vec_cmpeq(c, v_cr);
    rn = vec_cmpeq(c, v_nl);
    if (vec_any_eq(last_rr, rn))
        return p - 1;
    rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
    rr = vec_and(rr, rn); /* get mask */
    if (vec_any_ne(rr, v0))
        return p + vec_zpos(rr);
    return NULL;
}
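/* Contract of mem_searchrn, in scalar form: return a pointer to the '\r' of
 * the first "\r\n" pair in the buffer, or NULL if there is none (illustrative
 * reference for the vector version above): */
static void *mem_searchrn_ref(void *s, size_t len)
{
    char *p = s;
    size_t i;

    for (i = 0; i + 1 < len; i++)
        if (p[i] == '\r' && p[i + 1] == '\n')
            return p + i;
    return NULL;
}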
void f25()
{
    *var_vec_u8++ = vec_lvsl(var_int[0], var_long_ptr[1]);
    *var_vec_u8++ = vec_lvsl(var_int[0], var_unsigned_long_ptr[1]);
    *var_vec_u8++ = vec_lvsr(var_int[0], var_long_ptr[1]);
    *var_vec_u8++ = vec_lvsr(var_int[0], var_unsigned_long_ptr[1]);
}
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  uint8_t *src, int dstStride,
                                                  int tmpStride, int srcStride)
{
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char permM2 = vec_lvsl(-2, src);
    const vector unsigned char permM1 = vec_lvsl(-1, src);
    const vector unsigned char permP0 = vec_lvsl(+0, src);
    const vector unsigned char permP1 = vec_lvsl(+1, src);
    const vector unsigned char permP2 = vec_lvsl(+2, src);
    const vector unsigned char permP3 = vec_lvsl(+3, src);
    const vector signed short v20ss = (const vector signed short)AVV(20);
    const vector unsigned int v10ui = vec_splat_u32(10);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector signed short v1ss = vec_splat_s16(1);
    const vector signed int v512si = (const vector signed int)AVV(512);
    const vector unsigned int v16ui = (const vector unsigned int)AVV(16);

    register int align = ((((unsigned long)src) - 2) % 16);

    src -= (2 * srcStride);

    for (i = 0; i < 21; i++) {
        vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vector unsigned char srcR1 = vec_ld(-2, src);
        vector unsigned char srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
        const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
        const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
        const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
        const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
        const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
        const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
        const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
        const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
        const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
        const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
        const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);

        const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
        const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
        const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
        const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
        const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
        const vector signed short sum3B = vec_adds(srcM2B, srcP3B);

        const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A);
        const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B);

        const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        const vector signed short psumA = vec_sub(pp1A, pp2A);
        const vector signed short psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
    const vector unsigned char mperm = (const vector unsigned char)
        AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
            0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);

    int16_t *tmpbis = tmp - (tmpStride * 21);

    vector signed short tmpM2ssA = vec_ld(0, tmpbis);
    vector signed short tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    vector signed short tmpM1ssA = vec_ld(0, tmpbis);
    vector signed short tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    vector signed short tmpP0ssA = vec_ld(0, tmpbis);
    vector signed short tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    vector signed short tmpP1ssA = vec_ld(0, tmpbis);
    vector signed short tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    vector signed short tmpP2ssA = vec_ld(0, tmpbis);
    vector signed short tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0; i < 16; i++) {
        const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
        const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
        tmpbis += tmpStride;

        const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        const vector signed int pp1Ae = vec_mule(sum1A, v20ss);
        const vector signed int pp1Ao = vec_mulo(sum1A, v20ss);
        const vector signed int pp1Be = vec_mule(sum1B, v20ss);
        const vector signed int pp1Bo = vec_mulo(sum1B, v20ss);

        const vector signed int pp2Ae = vec_mule(sum2A, v5ss);
        const vector signed int pp2Ao = vec_mulo(sum2A, v5ss);
        const vector signed int pp2Be = vec_mule(sum2B, v5ss);
        const vector signed int pp2Bo = vec_mulo(sum2B, v5ss);

        const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
        const vector signed int pp3Ao = vec_mulo(sum3A, v1ss);
        const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui);
        const vector signed int pp3Bo = vec_mulo(sum3B, v1ss);

        const vector signed int pp1cAe = vec_add(pp1Ae, v512si);
        const vector signed int pp1cAo = vec_add(pp1Ao, v512si);
        const vector signed int pp1cBe = vec_add(pp1Be, v512si);
        const vector signed int pp1cBo = vec_add(pp1Bo, v512si);

        const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae);
        const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao);
        const vector signed int pp32Be = vec_sub(pp3Be, pp2Be);
        const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo);

        const vector signed int sumAe = vec_add(pp1cAe, pp32Ae);
        const vector signed int sumAo = vec_add(pp1cAo, pp32Ao);
        const vector signed int sumBe = vec_add(pp1cBe, pp32Be);
        const vector signed int sumBo = vec_add(pp1cBo, pp32Bo);

        const vector signed int ssumAe = vec_sra(sumAe, v10ui);
        const vector signed int ssumAo = vec_sra(sumAo, v10ui);
        const vector signed int ssumBe = vec_sra(sumBe, v10ui);
        const vector signed int ssumBo = vec_sra(sumBo, v10ui);

        const vector signed short ssume = vec_packs(ssumAe, ssumBe);
        const vector signed short ssumo = vec_packs(ssumAo, ssumBo);

        const vector unsigned char sumv = vec_packsu(ssume, ssumo);
        const vector unsigned char sum = vec_perm(sumv, sumv, mperm);

        const vector unsigned char dst1 = vec_ld(0, dst);
        const vector unsigned char dst2 = vec_ld(16, dst);
        const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        vector unsigned char fsum;
        OP_U8_ALTIVEC(fsum, sum, vdst);

        const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
        const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
        const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}
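/* For reference, both passes above implement the H.264 6-tap filter
 *     t = 20*(x[0] + x[1]) - 5*(x[-1] + x[2]) + (x[-2] + x[3]);
 * the horizontal pass stores the unrounded 16-bit intermediate to tmp, and
 * the vertical pass produces clip((t + 512) >> 10) -- hence v512si and the
 * shift by v10ui. The vec_mule/vec_mulo and vec_sra-by-16 split merely
 * widens the 16-bit sums to 32 bits so the second pass cannot overflow. */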
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride)
{
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
    register int i;
    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char perm = vec_lvsl(0, src);
    const vector signed short v20ss = (const vector signed short)AVV(20);
    const vector unsigned short v5us = vec_splat_u16(5);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector signed short v16ss = (const vector signed short)AVV(16);
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

    uint8_t *srcbis = src - (srcStride * 2);

    const vector unsigned char srcM2a = vec_ld(0, srcbis);
    const vector unsigned char srcM2b = vec_ld(16, srcbis);
    const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
    srcbis += srcStride;
    const vector unsigned char srcM1a = vec_ld(0, srcbis);
    const vector unsigned char srcM1b = vec_ld(16, srcbis);
    const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
    srcbis += srcStride;
    const vector unsigned char srcP0a = vec_ld(0, srcbis);
    const vector unsigned char srcP0b = vec_ld(16, srcbis);
    const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
    srcbis += srcStride;
    const vector unsigned char srcP1a = vec_ld(0, srcbis);
    const vector unsigned char srcP1b = vec_ld(16, srcbis);
    const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
    srcbis += srcStride;
    const vector unsigned char srcP2a = vec_ld(0, srcbis);
    const vector unsigned char srcP2b = vec_ld(16, srcbis);
    const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
    srcbis += srcStride;

    vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
    vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
    vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
    vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
    vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
    vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
    vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
    vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
    vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
    vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);

    for (i = 0; i < 16; i++) {
        const vector unsigned char srcP3a = vec_ld(0, srcbis);
        const vector unsigned char srcP3b = vec_ld(16, srcbis);
        const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm);
        const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
        const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
        srcbis += srcStride;

        const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA);
        const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB);
        const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA);
        const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB);
        const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA);
        const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
        const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
        const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
        const vector signed short pp3A = vec_add(sum3A, pp1A);
        const vector signed short pp3B = vec_add(sum3B, pp1B);
        const vector signed short psumA = vec_sub(pp3A, pp2A);
        const vector signed short psumB = vec_sub(pp3B, pp2B);
        const vector signed short sumA = vec_sra(psumA, v5us);
        const vector signed short sumB = vec_sra(psumB, v5us);

        const vector unsigned char sum = vec_packsu(sumA, sumB);
        const vector unsigned char dst1 = vec_ld(0, dst);
        const vector unsigned char dst2 = vec_ld(16, dst);
        const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        vector unsigned char fsum;
        OP_U8_ALTIVEC(fsum, sum, vdst);

        const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
        const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
        const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride)
{
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;
    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char permM2 = vec_lvsl(-2, src);
    const vector unsigned char permM1 = vec_lvsl(-1, src);
    const vector unsigned char permP0 = vec_lvsl(+0, src);
    const vector unsigned char permP1 = vec_lvsl(+1, src);
    const vector unsigned char permP2 = vec_lvsl(+2, src);
    const vector unsigned char permP3 = vec_lvsl(+3, src);
    const vector signed short v20ss = (const vector signed short)AVV(20);
    const vector unsigned short v5us = vec_splat_u16(5);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector signed short v16ss = (const vector signed short)AVV(16);
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

    register int align = ((((unsigned long)src) - 2) % 16);

    for (i = 0; i < 16; i++) {
        vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vector unsigned char srcR1 = vec_ld(-2, src);
        vector unsigned char srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
        const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
        const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
        const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
        const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
        const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
        const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
        const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
        const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
        const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
        const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
        const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);

        const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
        const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
        const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
        const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
        const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
        const vector signed short sum3B = vec_adds(srcM2B, srcP3B);

        const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
        const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
        const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
        const vector signed short pp3A = vec_add(sum3A, pp1A);
        const vector signed short pp3B = vec_add(sum3B, pp1B);
        const vector signed short psumA = vec_sub(pp3A, pp2A);
        const vector signed short psumB = vec_sub(pp3B, pp2B);
        const vector signed short sumA = vec_sra(psumA, v5us);
        const vector signed short sumB = vec_sra(psumB, v5us);

        const vector unsigned char sum = vec_packsu(sumA, sumB);
        const vector unsigned char dst1 = vec_ld(0, dst);
        const vector unsigned char dst2 = vec_ld(16, dst);
        const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        vector unsigned char fsum;
        OP_U8_ALTIVEC(fsum, sum, vdst);

        const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
        const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
        const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
static void vector_fmul_add_add_altivec(float *dst, const float *src0,
                                        const float *src1, const float *src2,
                                        int src3, int len, int step)
{
    int i;
    vector float d, s0, s1, s2, t0, t1, edges;
    vector unsigned char align = vec_lvsr(0, dst),
                         mask  = vec_lvsl(0, dst);

#if 0 //FIXME: there is still something wrong
    if (step == 2) {
        int y;
        vector float d0, d1, s3, t2;
        vector unsigned int sel = vec_mergeh(vec_splat_u32(-1), vec_splat_u32(0));
        t1 = vec_ld(16, dst);
        for (i = 0, y = 0; i < len - 3; i += 4, y += 8) {
            s0 = vec_ld(0, src0+i);
            s1 = vec_ld(0, src1+i);
            s2 = vec_ld(0, src2+i);

//          t0 = vec_ld(0, dst+y);  //[x x x|a]
//          t1 = vec_ld(16, dst+y); //[b c d|e]
            t2 = vec_ld(31, dst+y); //[f g h|x]

            d = vec_madd(s0, s1, s2); // [A B C D]

            // [A A B B]
            // [C C D D]
            d0 = vec_perm(t0, t1, mask);              // [a b c d]
            d0 = vec_sel(vec_mergeh(d, d), d0, sel);  // [A b B d]
            edges = vec_perm(t1, t0, mask);
            t0 = vec_perm(edges, d0, align);          // [x x x|A]
            t1 = vec_perm(d0, edges, align);          // [b B d|e]
            vec_stl(t0, 0, dst+y);
            d1 = vec_perm(t1, t2, mask);              // [e f g h]
            d1 = vec_sel(vec_mergel(d, d), d1, sel);  // [C f D h]
            edges = vec_perm(t2, t1, mask);
            t1 = vec_perm(edges, d1, align);          // [b B d|C]
            t2 = vec_perm(d1, edges, align);          // [f D h|x]
            vec_stl(t1, 16, dst+y);
            t0 = t1;
            vec_stl(t2, 31, dst+y);
            t1 = t2;
        }
    } else
#endif
    if (step == 1 && src3 == 0)
        for (i = 0; i < len - 3; i += 4) {
            t0 = vec_ld(0, dst+i);
            t1 = vec_ld(15, dst+i);
            s0 = vec_ld(0, src0+i);
            s1 = vec_ld(0, src1+i);
            s2 = vec_ld(0, src2+i);
            edges = vec_perm(t1, t0, mask);
            d = vec_madd(s0, s1, s2);
            t1 = vec_perm(d, edges, align);
            t0 = vec_perm(edges, d, align);
            vec_st(t1, 15, dst+i);
            vec_st(t0, 0, dst+i);
        }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
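/* Scalar reference for this function (illustrative):
 *     for (i = 0; i < len; i++)
 *         dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
 * The vector path above only covers step == 1 with src3 == 0, where the bias
 * folds away; every other combination falls back to the C version. */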
/* for( unsigned pos = 0; pos < size; ++pos )
 *     dest[pos] += src[pos];
 * Idea from: http://developer.apple.com/hardwaredrivers/ve/downloads/add.c */
void Vector::FastSoundWrite( float *dest, const float *src, unsigned size )
{
    if( size > 4 )
    {
        int index = 0;
        vUInt8 one = (vUInt8)(1);
        vUInt8 srcMask = vec_add( vec_lvsl(15, src), one );
        vUInt8 destMask = vec_add( vec_lvsl(15, dest), one );
        vUInt8 storeMask = vec_lvsr( 0, dest );
        vFloat load1Src = vec_ld( 0, src );
        vFloat load1Dest = vec_ld( 0, dest );
        vFloat store = (vFloat)(0.0f);

        // If dest is misaligned, pull the first loop iteration out.
        if( intptr_t(dest) & 0xF )
        {
            vFloat load2Src = vec_ld( 15, src );
            vFloat load2Dest = vec_ld( 15, dest );
            load1Src = vec_perm( load1Src, load2Src, srcMask );
            load1Dest = vec_perm( load1Dest, load2Dest, destMask );
            load1Dest = vec_add( load1Dest, load1Src );
            store = vec_perm( load1Dest, load1Dest, storeMask );
            while( (intptr_t(dest) + index) & 0xC )
            {
                vec_ste( store, index, dest );
                index += 4;
            }
            load1Src = load2Src;
            store = load1Dest;
            load1Dest = load2Dest;
            src += 4;
            dest += 4;
            size -= 4;
            /* Incrementing the index is supposed to have the same effect
             * as incrementing dest but since we read from dest as well
             * we don't want to increment twice so decrement the index. */
            // XXX: What in the world did I mean here?
            index -= 16;
        }

        while( size >= 32 )
        {
            vFloat load2Src = vec_ld( 15, src );
            vFloat load3Src = vec_ld( 31, src );
            vFloat load4Src = vec_ld( 47, src );
            vFloat load5Src = vec_ld( 63, src );
            vFloat load6Src = vec_ld( 79, src );
            vFloat load7Src = vec_ld( 95, src );
            vFloat load8Src = vec_ld( 111, src );
            vFloat load9Src = vec_ld( 127, src );

            vFloat load2Dest = vec_ld( 15, dest );
            vFloat load3Dest = vec_ld( 31, dest );
            vFloat load4Dest = vec_ld( 47, dest );
            vFloat load5Dest = vec_ld( 63, dest );
            vFloat load6Dest = vec_ld( 79, dest );
            vFloat load7Dest = vec_ld( 95, dest );
            vFloat load8Dest = vec_ld( 111, dest );
            vFloat load9Dest = vec_ld( 127, dest );

            // Align the data.
            load1Src = vec_perm( load1Src, load2Src, srcMask );
            load2Src = vec_perm( load2Src, load3Src, srcMask );
            load3Src = vec_perm( load3Src, load4Src, srcMask );
            load4Src = vec_perm( load4Src, load5Src, srcMask );
            load5Src = vec_perm( load5Src, load6Src, srcMask );
            load6Src = vec_perm( load6Src, load7Src, srcMask );
            load7Src = vec_perm( load7Src, load8Src, srcMask );
            load8Src = vec_perm( load8Src, load9Src, srcMask );
            // Not load9Src, it's untouched and used later.

            load1Dest = vec_perm( load1Dest, load2Dest, destMask );
            load2Dest = vec_perm( load2Dest, load3Dest, destMask );
            load3Dest = vec_perm( load3Dest, load4Dest, destMask );
            load4Dest = vec_perm( load4Dest, load5Dest, destMask );
            load5Dest = vec_perm( load5Dest, load6Dest, destMask );
            load6Dest = vec_perm( load6Dest, load7Dest, destMask );
            load7Dest = vec_perm( load7Dest, load8Dest, destMask );
            load8Dest = vec_perm( load8Dest, load9Dest, destMask );
            // Not load9Dest.

            load1Dest = vec_add( load1Dest, load1Src );
            load2Dest = vec_add( load2Dest, load2Src );
            load3Dest = vec_add( load3Dest, load3Src );
            load4Dest = vec_add( load4Dest, load4Src );
            load5Dest = vec_add( load5Dest, load5Src );
            load6Dest = vec_add( load6Dest, load6Src );
            load7Dest = vec_add( load7Dest, load7Src );
            load8Dest = vec_add( load8Dest, load8Src );

            // Unalign the results.
            store = vec_perm( store, load1Dest, storeMask );
            load1Dest = vec_perm( load1Dest, load2Dest, storeMask );
            load2Dest = vec_perm( load2Dest, load3Dest, storeMask );
            load3Dest = vec_perm( load3Dest, load4Dest, storeMask );
            load4Dest = vec_perm( load4Dest, load5Dest, storeMask );
            load5Dest = vec_perm( load5Dest, load6Dest, storeMask );
            load6Dest = vec_perm( load6Dest, load7Dest, storeMask );
            load7Dest = vec_perm( load7Dest, load8Dest, storeMask );

            // store the results
            vec_st( store, index + 0, dest );
            vec_st( load1Dest, index + 16, dest );
            vec_st( load2Dest, index + 32, dest );
            vec_st( load3Dest, index + 48, dest );
            vec_st( load4Dest, index + 64, dest );
            vec_st( load5Dest, index + 80, dest );
            vec_st( load6Dest, index + 96, dest );
            vec_st( load7Dest, index + 112, dest );

            load1Src = load9Src;
            load1Dest = load9Dest;
            store = load8Dest;
            dest += 32;
            src += 32;
            size -= 32;
        }

        /* This completely baffles gcc's loop unrolling. If I make it > 3 instead,
         * then gcc produces 4 identical copies of the loop without scheduling them
         * in a sane manner (hence the manual unrolling above) but this loop will
         * never be executed more than 3 times so that code will never be used.
         * This produces code the way gcc _should_ do it by unrolling and scheduling
         * and then producing the rolled version. */
        while( size & ~0x3 )
        {
            vFloat load2Src = vec_ld( 15, src );
            vFloat load2Dest = vec_ld( 15, dest );
            load1Src = vec_perm( load1Src, load2Src, srcMask );
            load1Dest = vec_perm( load1Dest, load2Dest, destMask );
            load1Dest = vec_add( load1Dest, load1Src );
            store = vec_perm( store, load1Dest, storeMask );
            vec_st( store, index, dest );
            load1Src = load2Src;
            store = load1Dest;
            load1Dest = load2Dest;
            src += 4;
            dest += 4;
            size -= 4;
        }

        // Store the remainder of the vector, if it was misaligned.
        if( index < 0 )
        {
            store = vec_perm( store, store, storeMask );
            while( index < 0 )
            {
                vec_ste( store, index, dest );
                index += 4;
            }
        }
    }

    /* If we account for both misaligned dest and src, there is really no way to
     * do this in vector code so do the last at most 3 elements in scalar code. */
    while( size-- )
        *(dest++) += *(src++);
}