static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride )
{
    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    // Permute vectors for the misaligned load and 8-byte store of each dst row.
    vec_u8 perm_ldv = vec_lvsl(0, dst);
    vec_u8 perm_stv = vec_lvsr(8, dst);

    const vec_u16 onev = vec_splat_u16(1);
    const vec_u16 twov = vec_splat_u16(2);
    const vec_u16 sixv = vec_splat_u16(6);

    const vec_u8 sel = (vec_u8) { 0, 0, 0, 0, 0, 0, 0, 0,
                                  -1, -1, -1, -1, -1, -1, -1, -1 };
    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = vec_ld(0x00, (int16_t *)dct);
    s1 = vec_ld(0x10, (int16_t *)dct);
    s2 = vec_ld(0x20, (int16_t *)dct);
    s3 = vec_ld(0x30, (int16_t *)dct);
    s4 = vec_ld(0x40, (int16_t *)dct);
    s5 = vec_ld(0x50, (int16_t *)dct);
    s6 = vec_ld(0x60, (int16_t *)dct);
    s7 = vec_ld(0x70, (int16_t *)dct);

    // 1-D IDCT pass, transpose, then the second 1-D pass on the other axis.
    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );

    IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    // Add each result row to the prediction in dst, clip to 8 bits, and store.
    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}
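/*
 * For reference: a minimal scalar sketch (a hypothetical helper, not part of
 * either source tree) of what one ALTIVEC_STORE_SUM_CLIP invocation does for
 * a row, assuming the macro performs the standard H.264 output step: descale
 * the IDCT result by >>6 (the shift that sixv supplies, and the reason for
 * the +32 bias on dct[0]), add it to the prediction already in dst, clip to
 * [0,255], and store. The vector macro does the same work with misaligned
 * loads/stores steered by perm_ldv/perm_stv, merging the eight result bytes
 * into the surrounding data via the sel mask.
 */
static void store_sum_clip_row_sketch( uint8_t *dst, const int16_t *idct_row )
{
    for( int i = 0; i < 8; i++ )
    {
        int v = dst[i] + (idct_row[i] >> 6);     // descale and add to prediction
        dst[i] = v < 0 ? 0 : v > 255 ? 255 : v;  // clip to the 8-bit pixel range
    }
}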
void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] )
{
    vec_u16_t onev = vec_splat_u16(1);
    vec_u16_t twov = vec_splat_u16(2);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
    s0 = vec_ld(0x00, dct);
    s1 = vec_ld(0x10, dct);
    s2 = vec_ld(0x20, dct);
    s3 = vec_ld(0x30, dct);
    s4 = vec_ld(0x40, dct);
    s5 = vec_ld(0x50, dct);
    s6 = vec_ld(0x60, dct);
    s7 = vec_ld(0x70, dct);

    // First 1-D IDCT pass over the loaded coefficient vectors.
    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    // Transpose so the second pass covers the other axis.
    vec_s16_t tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7;
    VEC_TRANSPOSE_8( d0, d1, d2, d3, d4, d5, d6, d7,
                     tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7 );

    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
    IDCT8_1D_ALTIVEC(tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    // Permute vectors for the misaligned load and 8-byte store of each dst row.
    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);
    vec_u16_t sixv = vec_splat_u16(6);
    const vec_u8_t sel = (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    // Add each result row to the prediction in dst, clip to 8 bits, and store.
    ALTIVEC_STORE_SUM_CLIP(&dst[0*FDEC_STRIDE], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*FDEC_STRIDE], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*FDEC_STRIDE], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*FDEC_STRIDE], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*FDEC_STRIDE], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*FDEC_STRIDE], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*FDEC_STRIDE], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*FDEC_STRIDE], idct7, perm_ldv, perm_stv, sel);
}
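/*
 * Both functions above vectorize the same scalar algorithm: the H.264
 * High-profile 8x8 inverse transform followed by add-to-prediction. Below is
 * a minimal self-contained scalar sketch (illustrative names, not lifted from
 * either tree) showing the butterfly that IDCT8_1D_ALTIVEC computes per
 * element: one 1-D pass over the rows, one over the columns, then the >>6
 * descale whose rounding the +32 bias on dct[0] provides. Intermediates are
 * kept in 16 bits between passes, as the vector version does for valid input.
 */
#include <stdint.h>

static void idct8_1d_sketch( int16_t *s, int stride )
{
    /* Stage 1: even/odd split of the H.264 8x8 inverse-transform butterfly. */
    int a0 =  s[0*stride] + s[4*stride];
    int a2 =  s[0*stride] - s[4*stride];
    int a4 = (s[2*stride] >> 1) - s[6*stride];
    int a6 = (s[6*stride] >> 1) + s[2*stride];
    int a1 = -s[3*stride] + s[5*stride] - s[7*stride] - (s[7*stride] >> 1);
    int a3 =  s[1*stride] + s[7*stride] - s[3*stride] - (s[3*stride] >> 1);
    int a5 = -s[1*stride] + s[7*stride] + s[5*stride] + (s[5*stride] >> 1);
    int a7 =  s[3*stride] + s[5*stride] + s[1*stride] + (s[1*stride] >> 1);

    /* Stage 2. */
    int b0 = a0 + a6;
    int b2 = a2 + a4;
    int b4 = a2 - a4;
    int b6 = a0 - a6;
    int b1 = (a7 >> 2) + a1;
    int b3 =  a3 + (a5 >> 2);
    int b5 = (a3 >> 2) - a5;
    int b7 =  a7 - (a1 >> 2);

    /* Stage 3: final butterfly, written back in place. */
    s[0*stride] = b0 + b7;
    s[1*stride] = b2 + b5;
    s[2*stride] = b4 + b3;
    s[3*stride] = b6 + b1;
    s[4*stride] = b6 - b1;
    s[5*stride] = b4 - b3;
    s[6*stride] = b2 - b5;
    s[7*stride] = b0 - b7;
}

static void add8x8_idct8_sketch( uint8_t *dst, int dst_stride, int16_t dct[64] )
{
    dct[0] += 32; // rounding bias for the final >>6; propagates through both passes

    for( int i = 0; i < 8; i++ )  // horizontal pass, one row at a time
        idct8_1d_sketch( &dct[8*i], 1 );
    for( int i = 0; i < 8; i++ )  // vertical pass, one column at a time
        idct8_1d_sketch( &dct[i], 8 );

    // Descale, add to the prediction, clip, store (the vector code's
    // ALTIVEC_STORE_SUM_CLIP step, one row per macro invocation).
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
        {
            int v = dst[y*dst_stride + x] + (dct[8*y + x] >> 6);
            dst[y*dst_stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}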