void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) { POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) = {rounder, rounder, rounder, rounder, rounder, rounder, rounder, rounder}; const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) = { (16-x16)*(16-y16), /* A */ ( x16)*(16-y16), /* B */ (16-x16)*( y16), /* C */ ( x16)*( y16), /* D */ 0, 0, 0, 0 /* padding */ }; register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD; int i; unsigned long dst_odd = (unsigned long)dst & 0x0000000F; unsigned long src_really_odd = (unsigned long)src & 0x0000000F; POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); tempA = vec_ld(0, (unsigned short*)ABCD); Av = vec_splat(tempA, 0); Bv = vec_splat(tempA, 1); Cv = vec_splat(tempA, 2); Dv = vec_splat(tempA, 3); rounderV = vec_ld(0, (unsigned short*)rounder_a); // we'll be able to pick-up our 9 char elements // at src from those 32 bytes // we load the first batch here, as inside the loop // we can re-use 'src+stride' from one iteration // as the 'src' of the next. src_0 = vec_ld(0, src); src_1 = vec_ld(16, src); srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); if (src_really_odd != 0x0000000F) { // if src & 0xF == 0xF, then (src+1) is properly aligned // on the second vector. srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); } else { srcvB = src_1; } srcvA = vec_mergeh(vczero, srcvA); srcvB = vec_mergeh(vczero, srcvB); for(i=0; i<h; i++) { dst_odd = (unsigned long)dst & 0x0000000F; src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; dstv = vec_ld(0, dst); // we we'll be able to pick-up our 9 char elements // at src + stride from those 32 bytes // then reuse the resulting 2 vectors srvcC and srcvD // as the next srcvA and srcvB src_0 = vec_ld(stride + 0, src); src_1 = vec_ld(stride + 16, src); srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); if (src_really_odd != 0x0000000F) { // if src & 0xF == 0xF, then (src+1) is properly aligned // on the second vector. srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); } else { srcvD = src_1; } srcvC = vec_mergeh(vczero, srcvC); srcvD = vec_mergeh(vczero, srcvD); // OK, now we (finally) do the math :-) // those four instructions replaces 32 int muls & 32 int adds. // isn't AltiVec nice ? tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); srcvA = srcvC; srcvB = srcvD; tempD = vec_sr(tempD, vcsr8); dstv2 = vec_pack(tempD, (vector unsigned short)vczero); if (dst_odd) { dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); } else { dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); } vec_st(dstv2, 0, dst); dst += stride; src += stride; } POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); }
/* AltiVec version of dct_unquantize_h263 this code assumes `block' is 16 bytes-aligned */ void dct_unquantize_h263_altivec(MpegEncContext *s, DCTELEM *block, int n, int qscale) { POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1); int i, level, qmul, qadd; int nCoeffs; assert(s->block_last_index[n]>=0); POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); qadd = (qscale - 1) | 1; qmul = qscale << 1; if (s->mb_intra) { if (!s->h263_aic) { if (n < 4) block[0] = block[0] * s->y_dc_scale; else block[0] = block[0] * s->c_dc_scale; }else qadd = 0; i = 1; nCoeffs= 63; //does not allways use zigzag table } else { i = 0; nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; } #ifdef ALTIVEC_USE_REFERENCE_C_CODE for(;i<=nCoeffs;i++) { level = block[i]; if (level) { if (level < 0) { level = level * qmul - qadd; } else { level = level * qmul + qadd; } block[i] = level; } } #else /* ALTIVEC_USE_REFERENCE_C_CODE */ { register const vector short vczero = (const vector short)vec_splat_s16(0); short __attribute__ ((aligned(16))) qmul8[] = { qmul, qmul, qmul, qmul, qmul, qmul, qmul, qmul }; short __attribute__ ((aligned(16))) qadd8[] = { qadd, qadd, qadd, qadd, qadd, qadd, qadd, qadd }; short __attribute__ ((aligned(16))) nqadd8[] = { -qadd, -qadd, -qadd, -qadd, -qadd, -qadd, -qadd, -qadd }; register vector short blockv, qmulv, qaddv, nqaddv, temp1; register vector bool short blockv_null, blockv_neg; register short backup_0 = block[0]; register int j = 0; qmulv = vec_ld(0, qmul8); qaddv = vec_ld(0, qadd8); nqaddv = vec_ld(0, nqadd8); #if 0 // block *is* 16 bytes-aligned, it seems. // first make sure block[j] is 16 bytes-aligned for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { level = block[j]; if (level) { if (level < 0) { level = level * qmul - qadd; } else { level = level * qmul + qadd; } block[j] = level; } } #endif // vectorize all the 16 bytes-aligned blocks // of 8 elements for(; (j + 7) <= nCoeffs ; j+=8) { blockv = vec_ld(j << 1, block); blockv_neg = vec_cmplt(blockv, vczero); blockv_null = vec_cmpeq(blockv, vczero); // choose between +qadd or -qadd as the third operand temp1 = vec_sel(qaddv, nqaddv, blockv_neg); // multiply & add (block{i,i+7} * qmul [+-] qadd) temp1 = vec_mladd(blockv, qmulv, temp1); // put 0 where block[{i,i+7} used to have 0 blockv = vec_sel(temp1, blockv, blockv_null); vec_st(blockv, j << 1, block); } // if nCoeffs isn't a multiple of 8, finish the job // using good old scalar units. // (we could do it using a truncated vector, // but I'm not sure it's worth the hassle) for(; j <= nCoeffs ; j++) { level = block[j]; if (level) { if (level < 0) { level = level * qmul - qadd; } else { level = level * qmul + qadd; } block[j] = level; } } if (i == 1) { // cheat. this avoid special-casing the first iteration block[0] = backup_0; } } #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); }
void fdct_altivec(int16_t *block) { POWERPC_PERF_DECLARE(altivec_fdct, 1); vector signed short *bp; vector float *cp; vector float b00, b10, b20, b30, b40, b50, b60, b70; vector float b01, b11, b21, b31, b41, b51, b61, b71; vector float mzero, cnst, cnsts0, cnsts1, cnsts2; vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; POWERPC_PERF_START_COUNT(altivec_fdct, 1); /* setup constants {{{ */ /* mzero = -0.0 */ mzero = ((vector float)vec_splat_u32(-1)); mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero))); cp = fdctconsts; cnsts0 = vec_ld(0, cp); cp++; cnsts1 = vec_ld(0, cp); cp++; cnsts2 = vec_ld(0, cp); /* }}} */ /* 8x8 matrix transpose (vector short[8]) {{{ */ #define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b)) bp = (vector signed short*)block; b00 = ((vector float)vec_ld(0, bp)); b40 = ((vector float)vec_ld(16*4, bp)); b01 = ((vector float)MERGE_S16(h, b00, b40)); b11 = ((vector float)MERGE_S16(l, b00, b40)); bp++; b10 = ((vector float)vec_ld(0, bp)); b50 = ((vector float)vec_ld(16*4, bp)); b21 = ((vector float)MERGE_S16(h, b10, b50)); b31 = ((vector float)MERGE_S16(l, b10, b50)); bp++; b20 = ((vector float)vec_ld(0, bp)); b60 = ((vector float)vec_ld(16*4, bp)); b41 = ((vector float)MERGE_S16(h, b20, b60)); b51 = ((vector float)MERGE_S16(l, b20, b60)); bp++; b30 = ((vector float)vec_ld(0, bp)); b70 = ((vector float)vec_ld(16*4, bp)); b61 = ((vector float)MERGE_S16(h, b30, b70)); b71 = ((vector float)MERGE_S16(l, b30, b70)); x0 = ((vector float)MERGE_S16(h, b01, b41)); x1 = ((vector float)MERGE_S16(l, b01, b41)); x2 = ((vector float)MERGE_S16(h, b11, b51)); x3 = ((vector float)MERGE_S16(l, b11, b51)); x4 = ((vector float)MERGE_S16(h, b21, b61)); x5 = ((vector float)MERGE_S16(l, b21, b61)); x6 = ((vector float)MERGE_S16(h, b31, b71)); x7 = ((vector float)MERGE_S16(l, b31, b71)); b00 = ((vector float)MERGE_S16(h, x0, x4)); b10 = ((vector float)MERGE_S16(l, x0, x4)); b20 = ((vector float)MERGE_S16(h, x1, x5)); b30 = ((vector float)MERGE_S16(l, x1, x5)); b40 = ((vector float)MERGE_S16(h, x2, x6)); b50 = ((vector float)MERGE_S16(l, x2, x6)); b60 = ((vector float)MERGE_S16(h, x3, x7)); b70 = ((vector float)MERGE_S16(l, x3, x7)); #undef MERGE_S16 /* }}} */ /* Some of the initial calculations can be done as vector short before * conversion to vector float. The following code section takes advantage * of this. */ #if 1 /* fdct rows {{{ */ x0 = ((vector float)vec_add(vs16(b00), vs16(b70))); x7 = ((vector float)vec_sub(vs16(b00), vs16(b70))); x1 = ((vector float)vec_add(vs16(b10), vs16(b60))); x6 = ((vector float)vec_sub(vs16(b10), vs16(b60))); x2 = ((vector float)vec_add(vs16(b20), vs16(b50))); x5 = ((vector float)vec_sub(vs16(b20), vs16(b50))); x3 = ((vector float)vec_add(vs16(b30), vs16(b40))); x4 = ((vector float)vec_sub(vs16(b30), vs16(b40))); b70 = ((vector float)vec_add(vs16(x0), vs16(x3))); b10 = ((vector float)vec_add(vs16(x1), vs16(x2))); b00 = ((vector float)vec_add(vs16(b70), vs16(b10))); b40 = ((vector float)vec_sub(vs16(b70), vs16(b10))); #define CTF0(n) \ b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \ b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \ b##n##1 = vec_ctf(vs32(b##n##1), 0); \ b##n##0 = vec_ctf(vs32(b##n##0), 0); CTF0(0); CTF0(4); b20 = ((vector float)vec_sub(vs16(x0), vs16(x3))); b60 = ((vector float)vec_sub(vs16(x1), vs16(x2))); CTF0(2); CTF0(6); #undef CTF0 x0 = vec_add(b60, b20); x1 = vec_add(b61, b21); cnst = LD_W2; x0 = vec_madd(cnst, x0, mzero); x1 = vec_madd(cnst, x1, mzero); cnst = LD_W1; b20 = vec_madd(cnst, b20, x0); b21 = vec_madd(cnst, b21, x1); cnst = LD_W0; b60 = vec_madd(cnst, b60, x0); b61 = vec_madd(cnst, b61, x1); #define CTFX(x,b) \ b##0 = ((vector float)vec_unpackh(vs16(x))); \ b##1 = ((vector float)vec_unpackl(vs16(x))); \ b##0 = vec_ctf(vs32(b##0), 0); \ b##1 = vec_ctf(vs32(b##1), 0); \ CTFX(x4, b7); CTFX(x5, b5); CTFX(x6, b3); CTFX(x7, b1); #undef CTFX x0 = vec_add(b70, b10); x1 = vec_add(b50, b30); x2 = vec_add(b70, b30); x3 = vec_add(b50, b10); x8 = vec_add(x2, x3); cnst = LD_W3; x8 = vec_madd(cnst, x8, mzero); cnst = LD_W8; x0 = vec_madd(cnst, x0, mzero); cnst = LD_W9; x1 = vec_madd(cnst, x1, mzero); cnst = LD_WA; x2 = vec_madd(cnst, x2, x8); cnst = LD_WB; x3 = vec_madd(cnst, x3, x8); cnst = LD_W4; b70 = vec_madd(cnst, b70, x0); cnst = LD_W5; b50 = vec_madd(cnst, b50, x1); cnst = LD_W6; b30 = vec_madd(cnst, b30, x1); cnst = LD_W7; b10 = vec_madd(cnst, b10, x0); b70 = vec_add(b70, x2); b50 = vec_add(b50, x3); b30 = vec_add(b30, x2); b10 = vec_add(b10, x3); x0 = vec_add(b71, b11); x1 = vec_add(b51, b31); x2 = vec_add(b71, b31); x3 = vec_add(b51, b11); x8 = vec_add(x2, x3); cnst = LD_W3; x8 = vec_madd(cnst, x8, mzero); cnst = LD_W8; x0 = vec_madd(cnst, x0, mzero); cnst = LD_W9; x1 = vec_madd(cnst, x1, mzero); cnst = LD_WA; x2 = vec_madd(cnst, x2, x8); cnst = LD_WB; x3 = vec_madd(cnst, x3, x8); cnst = LD_W4; b71 = vec_madd(cnst, b71, x0); cnst = LD_W5; b51 = vec_madd(cnst, b51, x1); cnst = LD_W6; b31 = vec_madd(cnst, b31, x1); cnst = LD_W7; b11 = vec_madd(cnst, b11, x0); b71 = vec_add(b71, x2); b51 = vec_add(b51, x3); b31 = vec_add(b31, x2); b11 = vec_add(b11, x3); /* }}} */ #else /* convert to float {{{ */ #define CTF(n) \ vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \ vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \ b##n##1 = vec_ctf(vs32(b##n##1), 0); \ b##n##0 = vec_ctf(vs32(b##n##0), 0); \ CTF(0); CTF(1); CTF(2); CTF(3); CTF(4); CTF(5); CTF(6); CTF(7); #undef CTF /* }}} */ FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70); FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71); #endif /* 8x8 matrix transpose (vector float[8][2]) {{{ */ x0 = vec_mergel(b00, b20); x1 = vec_mergeh(b00, b20); x2 = vec_mergel(b10, b30); x3 = vec_mergeh(b10, b30); b00 = vec_mergeh(x1, x3); b10 = vec_mergel(x1, x3); b20 = vec_mergeh(x0, x2); b30 = vec_mergel(x0, x2); x4 = vec_mergel(b41, b61); x5 = vec_mergeh(b41, b61); x6 = vec_mergel(b51, b71); x7 = vec_mergeh(b51, b71); b41 = vec_mergeh(x5, x7); b51 = vec_mergel(x5, x7); b61 = vec_mergeh(x4, x6); b71 = vec_mergel(x4, x6); x0 = vec_mergel(b01, b21); x1 = vec_mergeh(b01, b21); x2 = vec_mergel(b11, b31); x3 = vec_mergeh(b11, b31); x4 = vec_mergel(b40, b60); x5 = vec_mergeh(b40, b60); x6 = vec_mergel(b50, b70); x7 = vec_mergeh(b50, b70); b40 = vec_mergeh(x1, x3); b50 = vec_mergel(x1, x3); b60 = vec_mergeh(x0, x2); b70 = vec_mergel(x0, x2); b01 = vec_mergeh(x5, x7); b11 = vec_mergel(x5, x7); b21 = vec_mergeh(x4, x6); b31 = vec_mergel(x4, x6); /* }}} */ FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70); FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71); /* round, convert back to short {{{ */ #define CTS(n) \ b##n##0 = vec_round(b##n##0); \ b##n##1 = vec_round(b##n##1); \ b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \ b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \ b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \ vec_st(vs16(b##n##0), 0, bp); bp = (vector signed short*)block; CTS(0); bp++; CTS(1); bp++; CTS(2); bp++; CTS(3); bp++; CTS(4); bp++; CTS(5); bp++; CTS(6); bp++; CTS(7); #undef CTS /* }}} */ POWERPC_PERF_STOP_COUNT(altivec_fdct, 1); }
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); DECLARE_ALIGNED(16, signed int, ABCD)[4] = {((8 - x) * (8 - y)), (( x) * (8 - y)), ((8 - x) * ( y)), (( x) * ( y))}; register int i; vec_u8 fperm; const vec_s32 vABCD = vec_ld(0, ABCD); const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); LOAD_ZERO; const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); const vec_u16 v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; vec_u8 vsrc0uc, vsrc1uc; vec_s16 vsrc0ssH, vsrc1ssH; vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; vec_s16 vsrc2ssH, vsrc3ssH, psum; vec_u8 vdst, ppsum, vfdst, fsum; POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); if (((unsigned long)dst) % 16 == 0) { fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; } else { fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F}; } vsrcAuc = vec_ld(0, src); if (loadSecond) vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) vsrc1uc = vsrcBuc; else vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc); vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc); if (ABCD[3]) { if (!loadSecond) {// -> !reallyBadAlign for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) } } else { vec_u8 vsrcDuc; for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrcDuc = vec_ld(stride + 16, src); vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); if (reallyBadAlign) vsrc3uc = vsrcDuc; else vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) } } } else {