void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ) { vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_s16_t zerov, one; vec_s16_t temp1v, temp2v; vec_u16_t mfv; vec_u16_t biasv; vect_ushort_u mf_u; mf_u.s[0]=mf; mfv = vec_splat( mf_u.v, 0 ); vect_int_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); vect_ushort_u bias_u; bias_u.s[0]=bias; biasv = vec_splat(bias_u.v, 0); zerov = vec_splat_s16(0); one = vec_splat_s16(1); QUANT_4_U_DC(0); }
void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) { vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_u16_t mfvA; vec_u16_t biasvA; vec_s16_t zerov, one; vector bool short mskB; vec_u16_t coefvB; vec_u32_t multEvenvB, multOddvB; vec_u16_t mfvB; vec_u16_t biasvB; vec_s16_t temp1v, temp2v; vect_int_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); zerov = vec_splat_s16(0); one = vec_splat_s16(1); QUANT_16_U( 0, 16 ); }
void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ) { vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_u16_t mfvA; vec_u16_t biasvA; vec_s16_t zerov, one; vector bool short mskB; vec_u16_t coefvB; vec_u32_t multEvenvB, multOddvB; vec_u16_t mfvB; vec_u16_t biasvB; vec_s16_t temp1v, temp2v; vect_int_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); zerov = vec_splat_s16(0); one = vec_splat_s16(1); int i; for ( i=0; i<4; i++ ) { QUANT_16_U( i*2*16, i*2*16+16 ); } }
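/*
 * The QUANT_4_U_DC and QUANT_16_U macros used above are defined elsewhere in
 * the file. As a reference, this scalar sketch shows the operation they are
 * assumed to vectorize: x264-style quantisation,
 * level = ((|coef| + bias) * mf) >> 16, with the sign of the input restored
 * (qbits is 16 in these routines). quant_16_scalar_sketch is a hypothetical
 * name used only for illustration.
 */
static void quant_16_scalar_sketch( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
{
    for( int i = 0; i < 16; i++ )
    {
        if( dct[i] > 0 )
            dct[i] = (uint32_t)( bias[i] + dct[i] ) * mf[i] >> 16;
        else
            dct[i] = -(int)( (uint32_t)( bias[i] - dct[i] ) * mf[i] >> 16 );
    }
}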
void ff_vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64]) { vec_u8 t; IDCT_START // pixels are signed; so add 128*16 in addition to the normal 8 vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11)); eight = vec_add(eight, v2048); IDCT_1D(NOP, NOP) TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); IDCT_1D(ADD8, SHIFT4) #define PUT(a)\ t = vec_packsu(a, a);\ vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ vec_ste((vec_u32)t, 4, (unsigned int *)dst); PUT(b0) dst += stride; PUT(b1) dst += stride; PUT(b2) dst += stride; PUT(b3) dst += stride; PUT(b4) dst += stride; PUT(b5) dst += stride; PUT(b6) dst += stride; PUT(b7) }
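/*
 * Why the extra 2048 above: the second 1-D pass ends with a right shift by 4
 * (the SHIFT4 macro), so the usual bias of 8 gives round-to-nearest, and
 * 128*16 = 2048 adds the +128 that recentres signed residuals into the
 * unsigned 0..255 range before vec_packsu() saturates them. A scalar sketch
 * of the assumed per-pixel arithmetic (the exact shift lives in SHIFT4):
 */
static inline uint8_t vp3_idct_put_pixel_sketch(int32_t sum /* pre-shift 1-D sum */)
{
    int32_t v = (sum + 8 + 2048) >> 4;    /* 8 = rounding, 2048 >> 4 = +128 */
    return v < 0 ? 0 : v > 255 ? 255 : v; /* what vec_packsu() does per lane */
}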
uint32_t dequant_mpeg_intra_altivec_c(int16_t * data, const int16_t * coeff, const uint32_t quant, const uint32_t dcscalar, const uint16_t * mpeg_quant_matrices) { register const uint16_t *intra_matrix = get_intra_matrix(mpeg_quant_matrices); register const int16_t *coeff_ptr = coeff; register int16_t *data_ptr = data; register vec_sint16_t ox00; register vec_sint16_t level; register vec_sint16_t vec_2048; register vec_uint16_t vintra; register vec_uint32_t swap; register vec_uint32_t even,odd; register vec_uint32_t et,ot,t; vec_uint32_t vquant; vector bool short zero_less; vector bool short overflow; #ifdef DEBUG if((long)data & 0xf) fprintf(stderr, "xvidcore: error in dequant_mpeg_intra_altivec_c, incorrect align: %lx\n", (long)data); #endif /* Initialize */ ox00 = vec_splat_s16(0); *((uint32_t*)&vquant) = quant; vquant = vec_splat(vquant,0); swap = vec_rl(vquant, vec_splat_u32(-16)); vec_2048 = (vec_sint16_t)vec_rl(vec_splat_u16(8),vec_splat_u16(8)); DEQUANT_MPEG_INTRA(); DEQUANT_MPEG_INTRA(); DEQUANT_MPEG_INTRA(); DEQUANT_MPEG_INTRA(); DEQUANT_MPEG_INTRA(); DEQUANT_MPEG_INTRA(); DEQUANT_MPEG_INTRA(); DEQUANT_MPEG_INTRA(); /* Process the first coefficient (DC) separately */ data[0] = coeff[0] * dcscalar; if (data[0] < -2048) { data[0] = -2048; } else if (data[0] > 2047) { data[0] = 2047; } return 0; }
uint32_t dequant_h263_inter_altivec_c(int16_t *data, int16_t *coeff, const uint32_t quant, const uint16_t *mpeg_quant_matrices) { vector signed short acLevel; vector signed short vec_2048; vector unsigned short quant_m_2; vector unsigned short quant_add; vector unsigned short t; register vector unsigned int even; register vector unsigned int odd; register vector unsigned int high; register vector unsigned int low; register vector unsigned char zerovec; vector bool short equal_zero; vector bool short less_zero; vector bool short overflow; #ifdef DEBUG /* print alignment errors if this is on */ if(((unsigned)data) & 0xf) fprintf(stderr, "dequant_h263_inter_altivec_c:incorrect align, data: %lx\n", (long)data); #endif /* initialize */ *((unsigned short*)&quant_m_2) = (unsigned short)(quant << 1); quant_m_2 = vec_splat(quant_m_2,0); *((unsigned short*)&quant_add) = (unsigned short)(quant & 1 ? quant : quant - 1); quant_add = vec_splat(quant_add,0); vec_2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11)); zerovec = vec_splat_u8(0); /* dequant */ DEQUANT_H263_INTER_ALTIVEC(); DEQUANT_H263_INTER_ALTIVEC(); DEQUANT_H263_INTER_ALTIVEC(); DEQUANT_H263_INTER_ALTIVEC(); DEQUANT_H263_INTER_ALTIVEC(); DEQUANT_H263_INTER_ALTIVEC(); DEQUANT_H263_INTER_ALTIVEC(); DEQUANT_H263_INTER_ALTIVEC(); return 0; }
/* ************************************************************************* NAME: test_mladd USAGE: test_mladd(); returns: void DESCRIPTION: see how the combination multiply/add operation works; this operates on the low-order 16 bits REFERENCES: Ian Ollmann's Altivec Tutorial LIMITATIONS: GLOBAL VARIABLES: accessed: none modified: none FUNCTIONS CALLED: fprintf vec_mladd - multiply two short vectors and add a third short vector, all in one operation REVISION HISTORY: STR Description of Revision Author 06-Mar-11 initial coding kaj ************************************************************************* */ void test_mladd(void) { vector unsigned short shortVector1 = { 0, 2, 4, 8, 16, 32, 64, 128 }; vector unsigned short addVector; vector unsigned short coeffVector; vector unsigned short resultVector; vector short shortVector2 = { -128, -64, -32, -16, 0, 16, 32, 64}; vector short addVector2 = { -10, -10, -10, -10, 0, 10, 10, 10}; vector short coeffVector2; vector short resultVector2; short printshort[SHORT_ARRAYSIZE] __attribute__ ((aligned (16))); coeffVector = vec_splat_u16(2); addVector = vec_splat_u16(0); /* print vectors performing mladd on */ fprintf(stderr,"-----------------------------------------------------------" "\n\n"); printVecUShorts("vec_mladd unsigned input vector 1", shortVector1, SHORT_ARRAYSIZE); printVecUShorts("vec_mladd unsigned input vector to add", addVector, SHORT_ARRAYSIZE); printVecUShorts("vec_mladd unsigned coeffvector to multiply", coeffVector, SHORT_ARRAYSIZE); /* calculate */ resultVector = vec_mladd(shortVector1,coeffVector,addVector); printVecUShorts("vec_mladd vector (Input*2+0)", resultVector, SHORT_ARRAYSIZE); /* signed shorts */ coeffVector2 = vec_splat_s16(2); /* print signed short vectors performing mladd on */ fprintf(stderr,"----------------------------------------------------------" "\n\n"); printVecShorts("vec_mladd signed input vector 1", shortVector2, SHORT_ARRAYSIZE); printVecShorts("vec_mladd signed input vector to add", addVector2, SHORT_ARRAYSIZE); printVecShorts("vec_mladd signed coeffvector to multiply", coeffVector2, SHORT_ARRAYSIZE); /* calculate */ resultVector2 = vec_mladd(shortVector2,coeffVector2,addVector2); printVecShorts("vec_mladd vector (Input*2 + 10 (increment pos & neg by 10))", resultVector2,SHORT_ARRAYSIZE); } /* test_mladd */
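/*
 * For reference, vec_mladd() computes, per 16-bit lane, the low-order 16 bits
 * of a*b + c (modular arithmetic, no saturation), which is why the same
 * intrinsic serves both the unsigned and the signed test above. A scalar
 * sketch of that per-lane behaviour:
 */
static void mladd_scalar_sketch(const int16_t *a, const int16_t *b,
                                const int16_t *c, int16_t *out, int n)
{
    int i;
    for (i = 0; i < n; i++)
        out[i] = (int16_t)(a[i] * b[i] + c[i]); /* keep the low 16 bits only */
}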
void x264_add8x8_idct_dc_altivec( uint8_t *p_dst, int16_t dct[4] ) { vec_s16_t dcv; vec_s16_t v32 = vec_sl( vec_splat_s16( 8 ), vec_splat_u16( 2 ) ); vec_u16_t v6 = vec_splat_u16( 6 ); vec_s16_t dctv = vec_vsx_ld( 0, dct ); dctv = vec_sra( vec_add( dctv, v32 ), v6 ); dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 0 ), (vec_s32_t)vec_splat( dctv, 1 ) ); dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv ); idct8_dc_altivec( &p_dst[0], dcv ); dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 2 ), (vec_s32_t)vec_splat( dctv, 3 ) ); dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv ); idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dcv ); }
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); DECLARE_ALIGNED_16(signed int, ABCD[4]) = {((8 - x) * (8 - y)), (( x) * (8 - y)), ((8 - x) * ( y)), (( x) * ( y))}; register int i; vec_u8 fperm; const vec_s32 vABCD = vec_ld(0, ABCD); const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); LOAD_ZERO; const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); const vec_u16 v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; vec_u8 vsrc0uc, vsrc1uc; vec_s16 vsrc0ssH, vsrc1ssH; vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; vec_s16 vsrc2ssH, vsrc3ssH, psum; vec_u8 vdst, ppsum, vfdst, fsum; POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); if (((unsigned long)dst) % 16 == 0) { fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; } else { fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F}; } vsrcAuc = vec_ld(0, src); if (loadSecond) vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) vsrc1uc = vsrcBuc; else vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc); vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc); if (ABCD[3]) { if (!loadSecond) {// -> !reallyBadAlign for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); CHROMA_MC8_ALTIVEC_CORE } } else { vec_u8 vsrcDuc; for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrcDuc = vec_ld(stride + 16, src); vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); if (reallyBadAlign) vsrc3uc = vsrcDuc; else vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); CHROMA_MC8_ALTIVEC_CORE } } } else {
/* this code assumes that stride % 16 == 0 */ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); DECLARE_ALIGNED_16(signed int, ABCD[4]) = {((8 - x) * (8 - y)), ((x) * (8 - y)), ((8 - x) * (y)), ((x) * (y))}; register int i; vec_u8_t fperm; const vec_s32_t vABCD = vec_ld(0, ABCD); const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); LOAD_ZERO; const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); const vec_u16_t v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; vec_u8_t vsrc0uc, vsrc1uc; vec_s16_t vsrc0ssH, vsrc1ssH; vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; vec_s16_t vsrc2ssH, vsrc3ssH, psum; vec_u8_t vdst, ppsum, vfdst, fsum; POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); if (((unsigned long)dst) % 16 == 0) { fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); } else { fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); } vsrcAuc = vec_ld(0, src); if (loadSecond) vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) vsrc1uc = vsrcBuc; else vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc); vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc); if (!loadSecond) {// -> !reallyBadAlign for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc); vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v32ss, psum); psum = vec_sra(psum, v6us); vdst = vec_ld(0, dst); ppsum = (vec_u8_t)vec_packsu(psum, psum); vfdst = vec_perm(vdst, ppsum, fperm); OP_U8_ALTIVEC(fsum, vfdst, vdst); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } else { vec_u8_t vsrcDuc; for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrcDuc = vec_ld(stride + 16, src); vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); if (reallyBadAlign) vsrc3uc = vsrcDuc; else vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc); vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v32ss, psum); psum = vec_sr(psum, v6us); vdst = vec_ld(0, dst); ppsum = (vec_u8_t)vec_pack(psum, psum); vfdst = vec_perm(vdst, ppsum, fperm); OP_U8_ALTIVEC(fsum, vfdst, vdst); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); }
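/*
 * Scalar reference for the interpolation both chroma versions above
 * vectorize: standard H.264 eighth-pel chroma motion compensation with
 * bilinear weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy and
 * round-to-nearest (+32, >>6). A sketch of the "put" case, ignoring the
 * alignment handling:
 */
static void chroma_mc8_scalar_sketch(uint8_t *dst, const uint8_t *src,
                                     int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y), B = x * (8 - y);
    const int C = (8 - x) * y,       D = x * y;
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}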
/* AltiVec version of dct_unquantize_h263; this code assumes `block' is 16 bytes-aligned */ static void dct_unquantize_h263_altivec(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int i, level, qmul, qadd; int nCoeffs; assert(s->block_last_index[n]>=0); qadd = (qscale - 1) | 1; qmul = qscale << 1; if (s->mb_intra) { if (!s->h263_aic) { if (n < 4) block[0] = block[0] * s->y_dc_scale; else block[0] = block[0] * s->c_dc_scale; }else qadd = 0; i = 1; nCoeffs= 63; //does not always use zigzag table } else { i = 0; nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; } { register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); DECLARE_ALIGNED(16, short, qmul8) = qmul; DECLARE_ALIGNED(16, short, qadd8) = qadd; register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; register vector bool short blockv_null, blockv_neg; register short backup_0 = block[0]; register int j = 0; qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0); qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0); nqaddv = vec_sub(vczero, qaddv); #if 0 // block *is* 16 bytes-aligned, it seems. // first make sure block[j] is 16 bytes-aligned for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { level = block[j]; if (level) { if (level < 0) { level = level * qmul - qadd; } else { level = level * qmul + qadd; } block[j] = level; } } #endif // vectorize all the 16 bytes-aligned blocks // of 8 elements for(; (j + 7) <= nCoeffs ; j+=8) { blockv = vec_ld(j << 1, block); blockv_neg = vec_cmplt(blockv, vczero); blockv_null = vec_cmpeq(blockv, vczero); // choose between +qadd or -qadd as the third operand temp1 = vec_sel(qaddv, nqaddv, blockv_neg); // multiply & add (block{i,i+7} * qmul [+-] qadd) temp1 = vec_mladd(blockv, qmulv, temp1); // put 0 where block[{i,i+7}] used to have 0 blockv = vec_sel(temp1, blockv, blockv_null); vec_st(blockv, j << 1, block); } // if nCoeffs isn't a multiple of 8, finish the job // using good old scalar units. // (we could do it using a truncated vector, // but I'm not sure it's worth the hassle) for(; j <= nCoeffs ; j++) { level = block[j]; if (level) { if (level < 0) { level = level * qmul - qadd; } else { level = level * qmul + qadd; } block[j] = level; } } if (i == 1) { // cheat. this avoids special-casing the first iteration block[0] = backup_0; } } }
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); register int i; const vector signed int vzero = vec_splat_s32(0); const vector unsigned char permM2 = vec_lvsl(-2, src); const vector unsigned char permM1 = vec_lvsl(-1, src); const vector unsigned char permP0 = vec_lvsl(+0, src); const vector unsigned char permP1 = vec_lvsl(+1, src); const vector unsigned char permP2 = vec_lvsl(+2, src); const vector unsigned char permP3 = vec_lvsl(+3, src); const vector signed short v20ss = (const vector signed short)AVV(20); const vector unsigned int v10ui = vec_splat_u32(10); const vector signed short v5ss = vec_splat_s16(5); const vector signed short v1ss = vec_splat_s16(1); const vector signed int v512si = (const vector signed int)AVV(512); const vector unsigned int v16ui = (const vector unsigned int)AVV(16); register int align = ((((unsigned long)src) - 2) % 16); src -= (2 * srcStride); for (i = 0 ; i < 21 ; i ++) { vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; vector unsigned char srcR1 = vec_ld(-2, src); vector unsigned char srcR2 = vec_ld(14, src); switch (align) { default: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = vec_perm(srcR1, srcR2, permP3); } break; case 11: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = srcR2; } break; case 12: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = srcR2; srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 13: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = srcR2; srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 14: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = srcR2; srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 15: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = srcR2; srcP0 = vec_perm(srcR2, srcR3, permP0); srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; } const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); const vector signed short srcP2B = 
(vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); const vector signed short sum1A = vec_adds(srcP0A, srcP1A); const vector signed short sum1B = vec_adds(srcP0B, srcP1B); const vector signed short sum2A = vec_adds(srcM1A, srcP2A); const vector signed short sum2B = vec_adds(srcM1B, srcP2B); const vector signed short sum3A = vec_adds(srcM2A, srcP3A); const vector signed short sum3B = vec_adds(srcM2B, srcP3B); const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A); const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B); const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); const vector signed short psumA = vec_sub(pp1A, pp2A); const vector signed short psumB = vec_sub(pp1B, pp2B); vec_st(psumA, 0, tmp); vec_st(psumB, 16, tmp); src += srcStride; tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ } const vector unsigned char dstperm = vec_lvsr(0, dst); const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); const vector unsigned char mperm = (const vector unsigned char) AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); int16_t *tmpbis = tmp - (tmpStride * 21); vector signed short tmpM2ssA = vec_ld(0, tmpbis); vector signed short tmpM2ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; vector signed short tmpM1ssA = vec_ld(0, tmpbis); vector signed short tmpM1ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; vector signed short tmpP0ssA = vec_ld(0, tmpbis); vector signed short tmpP0ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; vector signed short tmpP1ssA = vec_ld(0, tmpbis); vector signed short tmpP1ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; vector signed short tmpP2ssA = vec_ld(0, tmpbis); vector signed short tmpP2ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; for (i = 0 ; i < 16 ; i++) { const vector signed short tmpP3ssA = vec_ld(0, tmpbis); const vector signed short tmpP3ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); tmpM2ssA = tmpM1ssA; tmpM2ssB = tmpM1ssB; tmpM1ssA = tmpP0ssA; tmpM1ssB = tmpP0ssB; tmpP0ssA = tmpP1ssA; tmpP0ssB = tmpP1ssB; tmpP1ssA = tmpP2ssA; tmpP1ssB = tmpP2ssB; tmpP2ssA = tmpP3ssA; tmpP2ssB = tmpP3ssB; const vector signed int pp1Ae = vec_mule(sum1A, v20ss); const vector signed int pp1Ao = vec_mulo(sum1A, v20ss); const vector signed int pp1Be = 
vec_mule(sum1B, v20ss); const vector signed int pp1Bo = vec_mulo(sum1B, v20ss); const vector signed int pp2Ae = vec_mule(sum2A, v5ss); const vector signed int pp2Ao = vec_mulo(sum2A, v5ss); const vector signed int pp2Be = vec_mule(sum2B, v5ss); const vector signed int pp2Bo = vec_mulo(sum2B, v5ss); const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui); const vector signed int pp3Ao = vec_mulo(sum3A, v1ss); const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui); const vector signed int pp3Bo = vec_mulo(sum3B, v1ss); const vector signed int pp1cAe = vec_add(pp1Ae, v512si); const vector signed int pp1cAo = vec_add(pp1Ao, v512si); const vector signed int pp1cBe = vec_add(pp1Be, v512si); const vector signed int pp1cBo = vec_add(pp1Bo, v512si); const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae); const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao); const vector signed int pp32Be = vec_sub(pp3Be, pp2Be); const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo); const vector signed int sumAe = vec_add(pp1cAe, pp32Ae); const vector signed int sumAo = vec_add(pp1cAo, pp32Ao); const vector signed int sumBe = vec_add(pp1cBe, pp32Be); const vector signed int sumBo = vec_add(pp1cBo, pp32Bo); const vector signed int ssumAe = vec_sra(sumAe, v10ui); const vector signed int ssumAo = vec_sra(sumAo, v10ui); const vector signed int ssumBe = vec_sra(sumBe, v10ui); const vector signed int ssumBo = vec_sra(sumBo, v10ui); const vector signed short ssume = vec_packs(ssumAe, ssumBe); const vector signed short ssumo = vec_packs(ssumAo, ssumBo); const vector unsigned char sumv = vec_packsu(ssume, ssumo); const vector unsigned char sum = vec_perm(sumv, sumv, mperm); const vector unsigned char dst1 = vec_ld(0, dst); const vector unsigned char dst2 = vec_ld(16, dst); const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, sum, vdst); const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); vec_st(fdst1, 0, dst); vec_st(fdst2, 16, dst); dst += dstStride; } POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); }
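/*
 * Two-pass scalar reference for the H/V lowpass above: the horizontal 6-tap
 * pass stores unshifted 16-bit intermediates in tmp, and the vertical pass
 * filters those with +512 rounding and a >>10 shift, i.e. both passes'
 * >>5 normalisations folded into one step (OP_U8_ALTIVEC then selects the
 * put or avg variant). A sketch of one output pixel of the second pass:
 */
static inline uint8_t qpel_hv_second_pass_sketch(const int16_t *t, int tmpStride)
{
    int v = 20 * (t[0] + t[tmpStride])
          -  5 * (t[-tmpStride] + t[2 * tmpStride])
          +      (t[-2 * tmpStride] + t[3 * tmpStride]) + 512;
    v >>= 10;
    return v < 0 ? 0 : v > 255 ? 255 : v;
}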
/* this code assumes stride % 16 == 0 */ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); register int i; const vector signed int vzero = vec_splat_s32(0); const vector unsigned char perm = vec_lvsl(0, src); const vector signed short v20ss = (const vector signed short)AVV(20); const vector unsigned short v5us = vec_splat_u16(5); const vector signed short v5ss = vec_splat_s16(5); const vector signed short v16ss = (const vector signed short)AVV(16); const vector unsigned char dstperm = vec_lvsr(0, dst); const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); uint8_t *srcbis = src - (srcStride * 2); const vector unsigned char srcM2a = vec_ld(0, srcbis); const vector unsigned char srcM2b = vec_ld(16, srcbis); const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); srcbis += srcStride; const vector unsigned char srcM1a = vec_ld(0, srcbis); const vector unsigned char srcM1b = vec_ld(16, srcbis); const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); srcbis += srcStride; const vector unsigned char srcP0a = vec_ld(0, srcbis); const vector unsigned char srcP0b = vec_ld(16, srcbis); const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); srcbis += srcStride; const vector unsigned char srcP1a = vec_ld(0, srcbis); const vector unsigned char srcP1b = vec_ld(16, srcbis); const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); srcbis += srcStride; const vector unsigned char srcP2a = vec_ld(0, srcbis); const vector unsigned char srcP2b = vec_ld(16, srcbis); const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); srcbis += srcStride; vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); for (i = 0 ; i < 16 ; i++) { const vector unsigned char srcP3a = vec_ld(0, srcbis); const vector unsigned char srcP3b = vec_ld(16, srcbis); const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm); const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); srcbis += srcStride; const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA); const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB); const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA); const vector signed 
short sum2B = vec_adds(srcM1ssB, srcP2ssB); const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA); const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB); srcM2ssA = srcM1ssA; srcM2ssB = srcM1ssB; srcM1ssA = srcP0ssA; srcM1ssB = srcP0ssB; srcP0ssA = srcP1ssA; srcP0ssB = srcP1ssB; srcP1ssA = srcP2ssA; srcP1ssB = srcP2ssB; srcP2ssA = srcP3ssA; srcP2ssB = srcP3ssB; const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss); const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss); const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); const vector signed short pp3A = vec_add(sum3A, pp1A); const vector signed short pp3B = vec_add(sum3B, pp1B); const vector signed short psumA = vec_sub(pp3A, pp2A); const vector signed short psumB = vec_sub(pp3B, pp2B); const vector signed short sumA = vec_sra(psumA, v5us); const vector signed short sumB = vec_sra(psumB, v5us); const vector unsigned char sum = vec_packsu(sumA, sumB); const vector unsigned char dst1 = vec_ld(0, dst); const vector unsigned char dst2 = vec_ld(16, dst); const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, sum, vdst); const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); vec_st(fdst1, 0, dst); vec_st(fdst2, 16, dst); dst += dstStride; } POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); }
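/*
 * Scalar reference for the vertical lowpass above: the H.264 half-pel 6-tap
 * filter (1,-5,20,20,-5,1) with +16 rounding, >>5 and saturation to 0..255,
 * applied down each of the 16 columns (OP_U8_ALTIVEC selects put or avg).
 * A sketch of the put case:
 */
static void qpel16_v_lowpass_scalar_sketch(uint8_t *dst, const uint8_t *src,
                                           int dstStride, int srcStride)
{
    int i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++) {
            const uint8_t *s = src + j;
            int v = 20 * (s[0] + s[srcStride])
                  -  5 * (s[-srcStride] + s[2 * srcStride])
                  +      (s[-2 * srcStride] + s[3 * srcStride]) + 16;
            v >>= 5;
            dst[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        dst += dstStride;
        src += srcStride;
    }
}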
void gimp_composite_dodge_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d; vector unsigned char alpha_a,alpha_b,alpha; vector signed short ox0001=vec_splat_s16(1); union { vector signed short v; vector unsigned short vu; gushort u16[8]; } ah,al,bh,bl; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); ah.v=vec_unpackh((vector signed char)a); ah.v=vec_sl(ah.v,ox0008); al.v=vec_unpackl((vector signed char)a); al.v=vec_sl(al.v,ox0008); b=vec_nor(b,b); bh.v=vec_unpackh((vector signed char)b); bh.v=vec_and(bh.v,ox00ff); bh.v=vec_add(bh.v,ox0001); bl.v=vec_unpackl((vector signed char)b); bl.v=vec_and(bl.v,ox00ff); bl.v=vec_add(bl.v,ox0001); ah.u16[0]=ah.u16[0]/bh.u16[0]; ah.u16[1]=ah.u16[1]/bh.u16[1]; ah.u16[2]=ah.u16[2]/bh.u16[2]; ah.u16[4]=ah.u16[4]/bh.u16[4]; ah.u16[5]=ah.u16[5]/bh.u16[5]; ah.u16[6]=ah.u16[6]/bh.u16[6]; al.u16[0]=al.u16[0]/bl.u16[0]; al.u16[1]=al.u16[1]/bl.u16[1]; al.u16[2]=al.u16[2]/bl.u16[2]; al.u16[4]=al.u16[4]/bl.u16[4]; al.u16[5]=al.u16[5]/bl.u16[5]; al.u16[6]=al.u16[6]/bl.u16[6]; d=vec_packs(ah.vu,al.vu); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); ah.v=vec_unpackh((vector signed char)a); ah.v=vec_sl(ah.v,ox0008); al.v=vec_unpackl((vector signed char)a); al.v=vec_sl(al.v,ox0008); b=vec_nor(b,b); bh.v=vec_unpackh((vector signed char)b); bh.v=vec_and(bh.v,ox00ff); bh.v=vec_add(bh.v,ox0001); bl.v=vec_unpackl((vector signed char)b); bl.v=vec_and(bl.v,ox00ff); bl.v=vec_add(bl.v,ox0001); ah.u16[0]=ah.u16[0]/bh.u16[0]; ah.u16[1]=ah.u16[1]/bh.u16[1]; ah.u16[2]=ah.u16[2]/bh.u16[2]; ah.u16[4]=ah.u16[4]/bh.u16[4]; ah.u16[5]=ah.u16[5]/bh.u16[5]; ah.u16[6]=ah.u16[6]/bh.u16[6]; al.u16[0]=al.u16[0]/bl.u16[0]; al.u16[1]=al.u16[1]/bl.u16[1]; al.u16[2]=al.u16[2]/bl.u16[2]; al.u16[4]=al.u16[4]/bl.u16[4]; al.u16[5]=al.u16[5]/bl.u16[5]; al.u16[6]=al.u16[6]/bl.u16[6]; d=vec_packs(ah.vu,al.vu); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnalignedLess(d, D, length); }
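/*
 * Scalar sketch of the dodge arithmetic vectorized above: each colour channel
 * becomes (A << 8) / (256 - B), saturated to 255 (b is inverted with vec_nor
 * and incremented, so bh/bl hold 256-B); the alpha lanes (u16[3] and u16[7])
 * are skipped in the divisions because the result alpha is taken as
 * min(alpha_A, alpha_B) afterwards.
 */
static inline unsigned char dodge_channel_sketch(unsigned char a, unsigned char b)
{
    unsigned int d = ((unsigned int)a << 8) / (256u - b);
    return d > 255 ? 255 : (unsigned char)d;
}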
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { register int i; LOAD_ZERO; const vec_u8 permM2 = vec_lvsl(-2, src); const vec_u8 permM1 = vec_lvsl(-1, src); const vec_u8 permP0 = vec_lvsl(+0, src); const vec_u8 permP1 = vec_lvsl(+1, src); const vec_u8 permP2 = vec_lvsl(+2, src); const vec_u8 permP3 = vec_lvsl(+3, src); const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_u32 v10ui = vec_splat_u32(10); const vec_s16 v5ss = vec_splat_s16(5); const vec_s16 v1ss = vec_splat_s16(1); const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); register int align = ((((unsigned long)src) - 2) % 16); vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, srcP2A, srcP2B, srcP3A, srcP3B, srcM1A, srcM1B, srcM2A, srcM2B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, pp1A, pp1B, pp2A, pp2B, psumA, psumB; const vec_u8 mperm = (const vec_u8) {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F}; int16_t *tmpbis = tmp; vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, tmpP2ssA, tmpP2ssB; vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, ssumAe, ssumAo, ssumBe, ssumBo; vec_u8 fsum, sumv, sum; vec_s16 ssume, ssumo; src -= (2 * srcStride); for (i = 0 ; i < 21 ; i ++) { vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; vec_u8 srcR1 = vec_ld(-2, src); vec_u8 srcR2 = vec_ld(14, src); switch (align) { default: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = vec_perm(srcR1, srcR2, permP3); } break; case 11: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = srcR2; } break; case 12: { vec_u8 srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = srcR2; srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 13: { vec_u8 srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = srcR2; srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 14: { vec_u8 srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = srcR2; srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 15: { vec_u8 srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = srcR2; srcP0 = vec_perm(srcR2, srcR3, permP0); srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; } srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); srcP3A = (vec_s16) 
vec_mergeh(zero_u8v, srcP3); srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); sum1A = vec_adds(srcP0A, srcP1A); sum1B = vec_adds(srcP0B, srcP1B); sum2A = vec_adds(srcM1A, srcP2A); sum2B = vec_adds(srcM1B, srcP2B); sum3A = vec_adds(srcM2A, srcP3A); sum3B = vec_adds(srcM2B, srcP3B); pp1A = vec_mladd(sum1A, v20ss, sum3A); pp1B = vec_mladd(sum1B, v20ss, sum3B); pp2A = vec_mladd(sum2A, v5ss, zero_s16v); pp2B = vec_mladd(sum2B, v5ss, zero_s16v); psumA = vec_sub(pp1A, pp2A); psumB = vec_sub(pp1B, pp2B); vec_st(psumA, 0, tmp); vec_st(psumB, 16, tmp); src += srcStride; tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ } tmpM2ssA = vec_ld(0, tmpbis); tmpM2ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; tmpM1ssA = vec_ld(0, tmpbis); tmpM1ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; tmpP0ssA = vec_ld(0, tmpbis); tmpP0ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; tmpP1ssA = vec_ld(0, tmpbis); tmpP1ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; tmpP2ssA = vec_ld(0, tmpbis); tmpP2ssB = vec_ld(16, tmpbis); tmpbis += tmpStride; for (i = 0 ; i < 16 ; i++) { const vec_s16 tmpP3ssA = vec_ld(0, tmpbis); const vec_s16 tmpP3ssB = vec_ld(16, tmpbis); const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA); const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB); const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA); const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB); const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA); const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB); tmpbis += tmpStride; tmpM2ssA = tmpM1ssA; tmpM2ssB = tmpM1ssB; tmpM1ssA = tmpP0ssA; tmpM1ssB = tmpP0ssB; tmpP0ssA = tmpP1ssA; tmpP0ssB = tmpP1ssB; tmpP1ssA = tmpP2ssA; tmpP1ssB = tmpP2ssB; tmpP2ssA = tmpP3ssA; tmpP2ssB = tmpP3ssB; pp1Ae = vec_mule(sum1A, v20ss); pp1Ao = vec_mulo(sum1A, v20ss); pp1Be = vec_mule(sum1B, v20ss); pp1Bo = vec_mulo(sum1B, v20ss); pp2Ae = vec_mule(sum2A, v5ss); pp2Ao = vec_mulo(sum2A, v5ss); pp2Be = vec_mule(sum2B, v5ss); pp2Bo = vec_mulo(sum2B, v5ss); pp3Ae = vec_sra((vec_s32)sum3A, v16ui); pp3Ao = vec_mulo(sum3A, v1ss); pp3Be = vec_sra((vec_s32)sum3B, v16ui); pp3Bo = vec_mulo(sum3B, v1ss); pp1cAe = vec_add(pp1Ae, v512si); pp1cAo = vec_add(pp1Ao, v512si); pp1cBe = vec_add(pp1Be, v512si); pp1cBo = vec_add(pp1Bo, v512si); pp32Ae = vec_sub(pp3Ae, pp2Ae); pp32Ao = vec_sub(pp3Ao, pp2Ao); pp32Be = vec_sub(pp3Be, pp2Be); pp32Bo = vec_sub(pp3Bo, pp2Bo); sumAe = vec_add(pp1cAe, pp32Ae); sumAo = vec_add(pp1cAo, pp32Ao); sumBe = vec_add(pp1cBe, pp32Be); sumBo = vec_add(pp1cBo, pp32Bo); ssumAe = vec_sra(sumAe, v10ui); ssumAo = vec_sra(sumAo, v10ui); ssumBe = vec_sra(sumBe, v10ui); ssumBo = vec_sra(sumBo, v10ui); ssume = vec_packs(ssumAe, ssumBe); ssumo = vec_packs(ssumAo, ssumBo); sumv = vec_packsu(ssume, ssumo); sum = vec_perm(sumv, sumv, mperm); ASSERT_ALIGNED(dst); OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst)); vec_st(fsum, 0, dst); dst += dstStride; } }
static av_always_inline void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int my, int w, int is6tap) { LOAD_V_SUBPEL_FILTER(my-1); vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl; vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l; vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6)); vec_u16 c7 = vec_splat_u16(7); // we want pixels 0-7 to be in the even positions and 8-15 in the odd, // so combine this permute with the alignment permute vector align_vech = vec_lvsl(0, src); align_vecl = vec_sld(align_vech, align_vech, 8); if (w ==16) perm_vec = vec_mergeh(align_vech, align_vecl); else perm_vec = vec_mergeh(align_vech, align_vech); if (is6tap) s0 = load_with_perm_vec(-2*src_stride, src, perm_vec); s1 = load_with_perm_vec(-1*src_stride, src, perm_vec); s2 = load_with_perm_vec( 0*src_stride, src, perm_vec); s3 = load_with_perm_vec( 1*src_stride, src, perm_vec); if (is6tap) s4 = load_with_perm_vec( 2*src_stride, src, perm_vec); src += (2+is6tap)*src_stride; while (h --> 0) { if (is6tap) s5 = load_with_perm_vec(0, src, perm_vec); else s4 = load_with_perm_vec(0, src, perm_vec); FILTER_V(f16h, vec_mule); if (w == 16) { FILTER_V(f16l, vec_mulo); filt = vec_packsu(f16h, f16l); vec_st(filt, 0, dst); } else { filt = vec_packsu(f16h, f16h); if (w == 4) filt = (vec_u8)vec_splat((vec_u32)filt, 0); else vec_ste((vec_u32)filt, 4, (uint32_t*)dst); vec_ste((vec_u32)filt, 0, (uint32_t*)dst); } if (is6tap) s0 = s1; s1 = s2; s2 = s3; s3 = s4; if (is6tap) s4 = s5; dst += dst_stride; src += src_stride; } }
/** Do inverse transform on 8x8 block */ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64], int sign, int rangered) { vector signed short src0, src1, src2, src3, src4, src5, src6, src7; vector signed int s0, s1, s2, s3, s4, s5, s6, s7; vector signed int s8, s9, sA, sB, sC, sD, sE, sF; vector signed int t0, t1, t2, t3, t4, t5, t6, t7; const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); const vector unsigned int vec_7 = vec_splat_u32(7); const vector unsigned int vec_4 = vec_splat_u32(4); const vector signed int vec_4s = vec_splat_s32(4); const vector unsigned int vec_3 = vec_splat_u32(3); const vector unsigned int vec_2 = vec_splat_u32(2); const vector signed int vec_1s = vec_splat_s32(1); const vector unsigned int vec_1 = vec_splat_u32(1); const vector unsigned short rangered_shift = vec_splat_u16(1); const vector signed short signed_bias = vec_sl(vec_splat_s16(4), vec_splat_u16(4)); src0 = vec_ld( 0, block); src1 = vec_ld( 16, block); src2 = vec_ld( 32, block); src3 = vec_ld( 48, block); src4 = vec_ld( 64, block); src5 = vec_ld( 80, block); src6 = vec_ld( 96, block); src7 = vec_ld(112, block); s0 = vec_unpackl(src0); s1 = vec_unpackl(src1); s2 = vec_unpackl(src2); s3 = vec_unpackl(src3); s4 = vec_unpackl(src4); s5 = vec_unpackl(src5); s6 = vec_unpackl(src6); s7 = vec_unpackl(src7); s8 = vec_unpackh(src0); s9 = vec_unpackh(src1); sA = vec_unpackh(src2); sB = vec_unpackh(src3); sC = vec_unpackh(src4); sD = vec_unpackh(src5); sE = vec_unpackh(src6); sF = vec_unpackh(src7); STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); src0 = vec_pack(s8, s0); src1 = vec_pack(s9, s1); src2 = vec_pack(sA, s2); src3 = vec_pack(sB, s3); src4 = vec_pack(sC, s4); src5 = vec_pack(sD, s5); src6 = vec_pack(sE, s6); src7 = vec_pack(sF, s7); TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); s0 = vec_unpackl(src0); s1 = vec_unpackl(src1); s2 = vec_unpackl(src2); s3 = vec_unpackl(src3); s4 = vec_unpackl(src4); s5 = vec_unpackl(src5); s6 = vec_unpackl(src6); s7 = vec_unpackl(src7); s8 = vec_unpackh(src0); s9 = vec_unpackh(src1); sA = vec_unpackh(src2); sB = vec_unpackh(src3); sC = vec_unpackh(src4); sD = vec_unpackh(src5); sE = vec_unpackh(src6); sF = vec_unpackh(src7); STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64); SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7); STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64); SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF); src0 = vec_pack(s8, s0); src1 = vec_pack(s9, s1); src2 = vec_pack(sA, s2); src3 = vec_pack(sB, s3); src4 = vec_pack(sC, s4); src5 = vec_pack(sD, s5); src6 = vec_pack(sE, s6); src7 = vec_pack(sF, s7); if (rangered) { if (!sign) { src0 = vec_sub(src0, signed_bias); src1 = vec_sub(src1, signed_bias); src2 = vec_sub(src2, signed_bias); src3 = vec_sub(src3, signed_bias); src4 = vec_sub(src4, signed_bias); src5 = vec_sub(src5, signed_bias); src6 = vec_sub(src6, signed_bias); src7 = vec_sub(src7, signed_bias); } src0 = vec_sl(src0, rangered_shift); src1 = vec_sl(src1, rangered_shift); src2 = vec_sl(src2, rangered_shift); src3 = vec_sl(src3, rangered_shift); src4 = vec_sl(src4, rangered_shift); src5 = vec_sl(src5, rangered_shift); src6 = vec_sl(src6, rangered_shift); src7 = vec_sl(src7, rangered_shift); } vec_st(src0, 0, block); vec_st(src1, 16, block); vec_st(src2, 32, block); vec_st(src3, 48, block); vec_st(src4, 64, block); vec_st(src5, 80, block); vec_st(src6, 96, block); vec_st(src7,112, block); }
void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, int wrap, int16_t *filter) { int sum, i; const uint8_t *s; vector unsigned char *tv, tmp, dstv, zero; vec_ss_t srchv[4], srclv[4], fv[4]; vector signed short zeros, sumhv, sumlv; s = src; for(i=0;i<4;i++) { /* The vec_madds later on does an implicit >>15 on the result. Since FILTER_BITS is 8, and we have 15 bits of magnitude in a signed short, we have just enough bits to pre-shift our filter constants <<7 to compensate for vec_madds. */ fv[i].s[0] = filter[i] << (15-FILTER_BITS); fv[i].v = vec_splat(fv[i].v, 0); } zero = vec_splat_u8(0); zeros = vec_splat_s16(0); /* When we're resampling, we'd ideally like both our input buffers, and output buffers to be 16-byte aligned, so we can do both aligned reads and writes. Sadly we can't always have this at the moment, so we opt for aligned writes, as unaligned writes have a huge overhead. To do this, do enough scalar resamples to get dst 16-byte aligned. */ i = (-(int)dst) & 0xf; while(i>0) { sum = s[0 * wrap] * filter[0] + s[1 * wrap] * filter[1] + s[2 * wrap] * filter[2] + s[3 * wrap] * filter[3]; sum = sum >> FILTER_BITS; if (sum<0) sum = 0; else if (sum>255) sum=255; dst[0] = sum; dst++; s++; dst_width--; i--; } /* Do our altivec resampling on 16 pixels at once. */ while(dst_width>=16) { /* Read 16 (potentially unaligned) bytes from each of 4 lines into 4 vectors, and split them into shorts. Interleave the multiply/accumulate for the resample filter with the loads to hide the 3 cycle latency the vec_madds have. */ tv = (vector unsigned char *) &s[0 * wrap]; tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[0 * wrap])); srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); srclv[0].v = (vector signed short) vec_mergel(zero, tmp); sumhv = vec_madds(srchv[0].v, fv[0].v, zeros); sumlv = vec_madds(srclv[0].v, fv[0].v, zeros); tv = (vector unsigned char *) &s[1 * wrap]; tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap])); srchv[1].v = (vector signed short) vec_mergeh(zero, tmp); srclv[1].v = (vector signed short) vec_mergel(zero, tmp); sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv); sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv); tv = (vector unsigned char *) &s[2 * wrap]; tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap])); srchv[2].v = (vector signed short) vec_mergeh(zero, tmp); srclv[2].v = (vector signed short) vec_mergel(zero, tmp); sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv); sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv); tv = (vector unsigned char *) &s[3 * wrap]; tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap])); srchv[3].v = (vector signed short) vec_mergeh(zero, tmp); srclv[3].v = (vector signed short) vec_mergel(zero, tmp); sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); /* Pack the results into our destination vector, and do an aligned write of that back to memory. */ dstv = vec_packsu(sumhv, sumlv) ; vec_st(dstv, 0, (vector unsigned char *) dst); dst+=16; s+=16; dst_width-=16; } /* If there are any leftover pixels, resample them with the slow scalar method. */ while(dst_width>0) { sum = s[0 * wrap] * filter[0] + s[1 * wrap] * filter[1] + s[2 * wrap] * filter[2] + s[3 * wrap] * filter[3]; sum = sum >> FILTER_BITS; if (sum<0) sum = 0; else if (sum>255) sum=255; dst[0] = sum; dst++; s++; dst_width--; } }
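/*
 * Per-lane model of the vec_madds() behaviour the pre-shift above relies on:
 * vmhaddshs computes ((a*b) >> 15) + c with saturation, so taps scaled by
 * << (15-FILTER_BITS) make the implicit >>15 match the scalar >>FILTER_BITS:
 * (src * (f << 7)) >> 15 == (src * f) >> 8. A sketch:
 */
static inline short madds_scalar_sketch(short a, short b, short c)
{
    int r = (((int)a * b) >> 15) + c;  /* high product plus c */
    if (r >  32767) r =  32767;        /* saturate, like vec_madds */
    if (r < -32768) r = -32768;
    return (short)r;
}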
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride) { register int i; LOAD_ZERO; vec_u8 perm; #if HAVE_BIGENDIAN perm = vec_lvsl(0, src); #endif const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_u16 v5us = vec_splat_u16(5); const vec_s16 v5ss = vec_splat_s16(5); const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); const uint8_t *srcbis = src - (srcStride * 2); const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm); srcbis += srcStride; const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm); srcbis += srcStride; const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm); srcbis += srcStride; const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm); srcbis += srcStride; const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm); srcbis += srcStride; vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2); vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2); vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1); vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1); vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0); vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0); vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1); vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1); vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2); vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2); vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, psumA, psumB, sumA, sumB, srcP3ssA, srcP3ssB, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; vec_u8 sum, fsum, srcP3; for (i = 0 ; i < 16 ; i++) { srcP3 = load_with_perm_vec(0, srcbis, perm); srcbis += srcStride; srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3); srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3); sum1A = vec_adds(srcP0ssA, srcP1ssA); sum1B = vec_adds(srcP0ssB, srcP1ssB); sum2A = vec_adds(srcM1ssA, srcP2ssA); sum2B = vec_adds(srcM1ssB, srcP2ssB); sum3A = vec_adds(srcM2ssA, srcP3ssA); sum3B = vec_adds(srcM2ssB, srcP3ssB); srcM2ssA = srcM1ssA; srcM2ssB = srcM1ssB; srcM1ssA = srcP0ssA; srcM1ssB = srcP0ssB; srcP0ssA = srcP1ssA; srcP0ssB = srcP1ssB; srcP1ssA = srcP2ssA; srcP1ssB = srcP2ssB; srcP2ssA = srcP3ssA; srcP2ssB = srcP3ssB; pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss); pp2A = vec_mladd(sum2A, v5ss, zero_s16v); pp2B = vec_mladd(sum2B, v5ss, zero_s16v); pp3A = vec_add(sum3A, pp1A); pp3B = vec_add(sum3B, pp1B); psumA = vec_sub(pp3A, pp2A); psumB = vec_sub(pp3B, pp2B); sumA = vec_sra(psumA, v5us); sumB = vec_sra(psumB, v5us); sum = vec_packsu(sumA, sumB); ASSERT_ALIGNED(dst); OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst)); vec_st(fsum, 0, dst); dst += dstStride; } }
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride) { register int i; LOAD_ZERO; vec_u8 permM2, permM1, permP0, permP1, permP2, permP3; const vec_s16 v5ss = vec_splat_s16(5); const vec_u16 v5us = vec_splat_u16(5); const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; register int align = ((((unsigned long)src) - 2) % 16); vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, srcP2A, srcP2B, srcP3A, srcP3B, srcM1A, srcM1B, srcM2A, srcM2B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, psumA, psumB, sumA, sumB; vec_u8 sum, fsum; #if HAVE_BIGENDIAN permM2 = vec_lvsl(-2, src); permM1 = vec_lvsl(-1, src); permP0 = vec_lvsl(+0, src); permP1 = vec_lvsl(+1, src); permP2 = vec_lvsl(+2, src); permP3 = vec_lvsl(+3, src); #endif /* HAVE_BIGENDIAN */ for (i = 0 ; i < 16 ; i ++) { load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3); srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0); srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0); srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1); srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1); srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2); srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2); srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3); srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3); srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1); srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1); srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2); srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2); sum1A = vec_adds(srcP0A, srcP1A); sum1B = vec_adds(srcP0B, srcP1B); sum2A = vec_adds(srcM1A, srcP2A); sum2B = vec_adds(srcM1B, srcP2B); sum3A = vec_adds(srcM2A, srcP3A); sum3B = vec_adds(srcM2B, srcP3B); pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss); pp2A = vec_mladd(sum2A, v5ss, zero_s16v); pp2B = vec_mladd(sum2B, v5ss, zero_s16v); pp3A = vec_add(sum3A, pp1A); pp3B = vec_add(sum3B, pp1B); psumA = vec_sub(pp3A, pp2A); psumB = vec_sub(pp3B, pp2B); sumA = vec_sra(psumA, v5us); sumB = vec_sra(psumB, v5us); sum = vec_packsu(sumA, sumB); ASSERT_ALIGNED(dst); OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst)); vec_st(fsum, 0, dst); src += srcStride; dst += dstStride; } }
/* this code assumes that stride % 16 == 0 */ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { signed int ABCD[4] __attribute__((aligned(16))) = {((8 - x) * (8 - y)), ((x) * (8 - y)), ((8 - x) * (y)), ((x) * (y))}; register int i; vector unsigned char fperm; const vector signed int vABCD = vec_ld(0, ABCD); const vector signed short vA = vec_splat((vector signed short)vABCD, 1); const vector signed short vB = vec_splat((vector signed short)vABCD, 3); const vector signed short vC = vec_splat((vector signed short)vABCD, 5); const vector signed short vD = vec_splat((vector signed short)vABCD, 7); const vector signed int vzero = vec_splat_s32(0); const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); const vector unsigned short v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; vector unsigned char vsrc0uc, vsrc1uc; vector signed short vsrc0ssH, vsrc1ssH; vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; vector signed short vsrc2ssH, vsrc3ssH, psum; vector unsigned char vdst, ppsum, fsum; if (((unsigned long)dst) % 16 == 0) { fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); } else { fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); } vsrcAuc = vec_ld(0, src); if (loadSecond) vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) vsrc1uc = vsrcBuc; else vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc); vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc); if (!loadSecond) {// -> !reallyBadAlign for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v28ss, psum); psum = vec_sra(psum, v6us); vdst = vec_ld(0, dst); ppsum = (vector unsigned char)vec_packsu(psum, psum); fsum = vec_perm(vdst, ppsum, fperm); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } else { vector unsigned char vsrcDuc; for (i = 0 ; i < h ; i++) { vsrcCuc = vec_ld(stride + 0, src); vsrcDuc = vec_ld(stride + 16, src); vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); if (reallyBadAlign) vsrc3uc = vsrcDuc; else vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, 
psum); psum = vec_add(v28ss, psum); psum = vec_sr(psum, v6us); vdst = vec_ld(0, dst); ppsum = (vector unsigned char)vec_pack(psum, psum); fsum = vec_perm(vdst, ppsum, fperm); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } }
uint32_t dequant_h263_intra_altivec_c(int16_t *data, const int16_t *coeff, const uint32_t quant, const uint32_t dcscalar, const uint16_t *mpeg_quant_matrices) { vector signed short acLevel; vector signed short vec_2048; vector unsigned short quant_add; vector unsigned short quant_m_2; vector unsigned short t; vector bool short equal_zero; vector bool short less_zero; vector bool short overflow; register vector unsigned int even; register vector unsigned int odd; register vector unsigned int high; register vector unsigned int low; register vector unsigned char zerovec; register int16_t *data_ptr; register int16_t *coeff_ptr; #ifdef DEBUG if(((unsigned)data) & 0xf) fprintf(stderr, "dequant_h263_intra_altivec_c:incorrect align, data: %lx\n", (long)data); #endif /* initialize */ *((unsigned short*)&quant_add) = (unsigned short)(quant & 1 ? quant : quant - 1); quant_add = vec_splat(quant_add,0); *((unsigned short*)&quant_m_2) = (unsigned short)(quant << 1); quant_m_2 = vec_splat(quant_m_2,0); vec_2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11)); zerovec = vec_splat_u8(0); data_ptr = (int16_t*)data; coeff_ptr = (int16_t*)coeff; /* dequant */ DEQUANT_H263_INTRA_ALTIVEC(); DEQUANT_H263_INTRA_ALTIVEC(); DEQUANT_H263_INTRA_ALTIVEC(); DEQUANT_H263_INTRA_ALTIVEC(); DEQUANT_H263_INTRA_ALTIVEC(); DEQUANT_H263_INTRA_ALTIVEC(); DEQUANT_H263_INTRA_ALTIVEC(); DEQUANT_H263_INTRA_ALTIVEC(); /* data[0] is special */ data[0] = coeff[0] * dcscalar; if(data[0] < -2048) data[0] = -2048; else if(data[0] > 2047) data[0] = 2047; return 0; }
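/*
 * Scalar reference for the dequantisation the DEQUANT_H263_*_ALTIVEC macros
 * above are assumed to vectorize: a zero level stays zero, otherwise
 * sign(level) * (2*quant*|level| + quant_add) clamped to [-2048, 2047],
 * with quant_add = quant when quant is odd and quant-1 when even (the intra
 * DC term is handled separately, as in the code above).
 */
static inline int16_t dequant_h263_sketch(int16_t level, uint32_t quant)
{
    const int quant_m_2 = quant << 1;
    const int quant_add = (quant & 1) ? (int)quant : (int)quant - 1;
    int v;
    if (level == 0)
        return 0;
    v = (level < 0) ? -(quant_m_2 * -level + quant_add)
                    :  (quant_m_2 *  level + quant_add);
    return v < -2048 ? -2048 : (v > 2047 ? 2047 : v);
}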
void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL) { int i; vector signed short vsrc; uint16_t *qmat; vector unsigned short vqmat; vector unsigned short vmquant; vector bool short eqzero, ltzero; vector signed short val, t0; vector signed short zero, one; vector unsigned int four; vector signed short min, max; int offset, offset2; int16_t dst0; union { vector unsigned short vu16; unsigned short mquant; vector signed int vs32; struct { signed int pad[3]; signed int sum; } s; } vu; #ifdef ALTIVEC_DST DataStreamControl dsc; #endif #ifdef ALTIVEC_VERIFY /* {{{ */ if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat)) mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)", wsp->intra_q_mat); if (NOT_VECTOR_ALIGNED(src)) mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src); if (NOT_VECTOR_ALIGNED(dst)) mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst); for (i = 0; i < 64; i++) if (src[i] < -256 || src[i] > 255) mjpeg_error_exit1("iquant_intra_m1: src[%i] out of range [-256,255], (%d)", i, src[i]); #endif /* }}} */ AMBER_START; dst0 = src[0] << (3 - dc_prec); qmat = (uint16_t*)wsp->intra_q_mat; #ifdef ALTIVEC_DST dsc.control = DATA_STREAM_CONTROL(64/8,1,0); vec_dst(src, dsc.control, 0); vec_dst(qmat, dsc.control, 1); #endif /* vmquant = (vector unsigned short)(mquant); */ vu.mquant = (unsigned short)mquant; vmquant = vec_splat(vu.vu16, 0); zero = vec_splat_s16(0); one = vec_splat_s16(1); four = vec_splat_u32(4); /* max = (2047); min = (-2048); {{{ */ vu8(max) = vec_splat_u8(0x7); t0 = vec_splat_s16(-1); /* 0xffff */ vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */ min = vec_sub(t0, max); /* }}} */ offset = 0; #if 1 vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); i = (64/8) - 1; do { /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); offset2 = offset; offset += 8*sizeof(int16_t); vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? 2047 : ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset2, dst); } while (--i); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? 2047 : ((val < -2048) ?
-2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); #else /* {{{ */ i = (64/8); do { vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? 2047 : ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); offset += 8*sizeof(int16_t); } while (--i); /* }}} */ #endif dst[0] = dst0; AMBER_STOP; }
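/* For reference: a scalar sketch of the MPEG-1 intra inverse quantisation
   performed by the loop above, including the mismatch control by
   "oddification" that the terse val = val - 1&~(val|val==0) comment
   describes. The names (src, dst, qmat, mquant, dc_prec) are hypothetical
   scalar counterparts of the vector code's operands: */
static void iquant_intra_m1_ref(const int16_t *src, int16_t *dst,
                                const uint16_t *qmat, int mquant, int dc_prec)
{
    int i;
    for (i = 1; i < 64; i++) {
        int a = src[i] < 0 ? -src[i] : src[i];
        int v = (a * qmat[i] * mquant) >> 4;    /* (src * quant) >> 4 */
        if ((v & 1) == 0 && src[i] != 0)
            v -= 1;                 /* force odd: MPEG-1 mismatch control */
        if (src[i] < 0)
            v = -v;                 /* restore sign */
        if (v > 2047)  v = 2047;    /* saturate to [-2048, 2047] */
        if (v < -2048) v = -2048;
        dst[i] = (int16_t)v;
    }
    dst[0] = src[0] << (3 - dc_prec);   /* DC bypasses the matrix */
}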
uint32_t dequant_mpeg_inter_altivec_c(int16_t * data, const int16_t * coeff, const uint32_t quant, const uint16_t * mpeg_quant_matrices) { register uint32_t sum; register const uint16_t *inter_matrix = get_inter_matrix(mpeg_quant_matrices); register vec_sint16_t ox00; register vec_sint16_t v2048; register vec_sint16_t level; register vec_uint16_t vinter; register vec_uint32_t hi,lo; register vec_uint32_t sw_hi,sw_lo; register vec_uint32_t swap; register vec_uint32_t t,v16; vec_uint32_t vsum; vec_uint32_t vquant; vector bool short zero_eq; vector bool short zero_less; vector bool short overflow; #ifdef DEBUG if((long)data & 0xf) fprintf(stderr, "xvidcore: error in dequant_mpeg_inter_altivec_c, incorrect align: %lx\n", (long)data); #endif /* Initialization */ ox00 = vec_splat_s16(0); v16 = vec_splat_u32(-16); v2048 = vec_rl(vec_splat_s16(8),vec_splat_u16(8)); vsum = (vec_uint32_t)ox00; *((uint32_t*)&vquant) = quant; vquant = vec_splat(vquant,0); swap = vec_rl(vquant,v16); DEQUANT_MPEG_INTER(); DEQUANT_MPEG_INTER(); DEQUANT_MPEG_INTER(); DEQUANT_MPEG_INTER(); DEQUANT_MPEG_INTER(); DEQUANT_MPEG_INTER(); DEQUANT_MPEG_INTER(); DEQUANT_MPEG_INTER(); sum = ((uint32_t*)&vsum)[0]; sum ^= ((uint32_t*)&vsum)[1]; sum ^= ((uint32_t*)&vsum)[2]; sum ^= ((uint32_t*)&vsum)[3]; /* mismatch control */ if((sum & 1) == 0) { data -= 1; *data ^= 1; } return 0; }
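/* For reference: a scalar model of the MPEG inter dequant done by the eight
   DEQUANT_MPEG_INTER() calls above, with the final LSB-parity mismatch
   control. It assumes the macros advance data past all 64 coefficients, so
   the "data -= 1; *data ^= 1;" above toggles data[63]: */
static void dequant_mpeg_inter_ref(int16_t *data, const int16_t *coeff,
                                   uint32_t quant, const uint16_t *inter_matrix)
{
    uint32_t sum = 0;
    int i;
    for (i = 0; i < 64; i++) {
        if (coeff[i] == 0) {
            data[i] = 0;
        } else {
            int a = coeff[i] < 0 ? -coeff[i] : coeff[i];
            int level = ((2 * a + 1) * inter_matrix[i] * (int)quant) >> 4;
            if (coeff[i] < 0)
                data[i] = (int16_t)((level <= 2048) ? -level : -2048);
            else
                data[i] = (int16_t)((level <= 2047) ? level : 2047);
        }
        sum ^= (uint16_t)data[i];        /* accumulate parity */
    }
    if ((sum & 1) == 0)
        data[63] ^= 1;                   /* mismatch control */
}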
DECLARE_ALIGNED(16, signed int, ABCD)[4] = { ((8 - x) * (8 - y)), (( x) * (8 - y)), ((8 - x) * ( y)), (( x) * ( y)) }; register int i; vec_u8 fperm; const vec_s32 vABCD = vec_ld(0, ABCD); const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); LOAD_ZERO; const vec_s16 v32ss = vec_sl(vec_splat_s16(1), vec_splat_u16(5)); const vec_u16 v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; vec_u8 vsrc0uc, vsrc1uc; vec_s16 vsrc0ssH, vsrc1ssH; vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; vec_s16 vsrc2ssH, vsrc3ssH, psum; vec_u8 vdst, ppsum, vfdst, fsum; if (((unsigned long)dst) % 16 == 0) { fperm = (vec_u8) {
/* this code assumes stride % 16 == 0 */ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); register int i; const vector signed int vzero = vec_splat_s32(0); const vector unsigned char permM2 = vec_lvsl(-2, src); const vector unsigned char permM1 = vec_lvsl(-1, src); const vector unsigned char permP0 = vec_lvsl(+0, src); const vector unsigned char permP1 = vec_lvsl(+1, src); const vector unsigned char permP2 = vec_lvsl(+2, src); const vector unsigned char permP3 = vec_lvsl(+3, src); const vector signed short v20ss = (const vector signed short)AVV(20); const vector unsigned short v5us = vec_splat_u16(5); const vector signed short v5ss = vec_splat_s16(5); const vector signed short v16ss = (const vector signed short)AVV(16); const vector unsigned char dstperm = vec_lvsr(0, dst); const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); register int align = ((((unsigned long)src) - 2) % 16); for (i = 0 ; i < 16 ; i ++) { vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; vector unsigned char srcR1 = vec_ld(-2, src); vector unsigned char srcR2 = vec_ld(14, src); switch (align) { default: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = vec_perm(srcR1, srcR2, permP3); } break; case 11: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = srcR2; } break; case 12: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = srcR2; srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 13: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = srcR2; srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 14: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = srcR2; srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 15: { vector unsigned char srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = srcR2; srcP0 = vec_perm(srcR2, srcR3, permP0); srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; } const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0); const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0); const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1); const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1); const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2); const
vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2); const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3); const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3); const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1); const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1); const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2); const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2); const vector signed short sum1A = vec_adds(srcP0A, srcP1A); const vector signed short sum1B = vec_adds(srcP0B, srcP1B); const vector signed short sum2A = vec_adds(srcM1A, srcP2A); const vector signed short sum2B = vec_adds(srcM1B, srcP2B); const vector signed short sum3A = vec_adds(srcM2A, srcP3A); const vector signed short sum3B = vec_adds(srcM2B, srcP3B); const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss); const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss); const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); const vector signed short pp3A = vec_add(sum3A, pp1A); const vector signed short pp3B = vec_add(sum3B, pp1B); const vector signed short psumA = vec_sub(pp3A, pp2A); const vector signed short psumB = vec_sub(pp3B, pp2B); const vector signed short sumA = vec_sra(psumA, v5us); const vector signed short sumB = vec_sra(psumB, v5us); const vector unsigned char sum = vec_packsu(sumA, sumB); const vector unsigned char dst1 = vec_ld(0, dst); const vector unsigned char dst2 = vec_ld(16, dst); const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, sum, vdst); const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm); const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask); const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask); vec_st(fdst1, 0, dst); vec_st(fdst2, 16, dst); src += srcStride; dst += dstStride; } POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); }
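/* For reference: the horizontal lowpass above is the H.264 half-pel 6-tap
   filter (1, -5, 20, 20, -5, 1). A scalar sketch of the "put" case
   (clip_uint8 is a hypothetical helper; OP_U8_ALTIVEC selects put vs. avg
   in the vector code): */
static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void h264_qpel16_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                      int dstStride, int srcStride)
{
    int i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++) {
            int v = (src[j - 2] + src[j + 3])
                  - 5 * (src[j - 1] + src[j + 2])
                  + 20 * (src[j]    + src[j + 1]);
            dst[j] = clip_uint8((v + 16) >> 5);   /* v16ss bias, v5us shift */
        }
        src += srcStride;
        dst += dstStride;
    }
}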
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { register int i; LOAD_ZERO; const vec_u8 perm = vec_lvsl(0, src); const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_u16 v5us = vec_splat_u16(5); const vec_s16 v5ss = vec_splat_s16(5); const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); uint8_t *srcbis = src - (srcStride * 2); const vec_u8 srcM2a = vec_ld(0, srcbis); const vec_u8 srcM2b = vec_ld(16, srcbis); const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm); //srcbis += srcStride; const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride); const vec_u8 srcM1b = vec_ld(16, srcbis); const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm); //srcbis += srcStride; const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride); const vec_u8 srcP0b = vec_ld(16, srcbis); const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm); //srcbis += srcStride; const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride); const vec_u8 srcP1b = vec_ld(16, srcbis); const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm); //srcbis += srcStride; const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride); const vec_u8 srcP2b = vec_ld(16, srcbis); const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm); //srcbis += srcStride; vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2); vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2); vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1); vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1); vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0); vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0); vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1); vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1); vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2); vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2); vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, psumA, psumB, sumA, sumB, srcP3ssA, srcP3ssB, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; vec_u8 sum, fsum, srcP3a, srcP3b, srcP3; for (i = 0 ; i < 16 ; i++) { srcP3a = vec_ld(0, srcbis += srcStride); srcP3b = vec_ld(16, srcbis); srcP3 = vec_perm(srcP3a, srcP3b, perm); srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3); srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3); //srcbis += srcStride; sum1A = vec_adds(srcP0ssA, srcP1ssA); sum1B = vec_adds(srcP0ssB, srcP1ssB); sum2A = vec_adds(srcM1ssA, srcP2ssA); sum2B = vec_adds(srcM1ssB, srcP2ssB); sum3A = vec_adds(srcM2ssA, srcP3ssA); sum3B = vec_adds(srcM2ssB, srcP3ssB); srcM2ssA = srcM1ssA; srcM2ssB = srcM1ssB; srcM1ssA = srcP0ssA; srcM1ssB = srcP0ssB; srcP0ssA = srcP1ssA; srcP0ssB = srcP1ssB; srcP1ssA = srcP2ssA; srcP1ssB = srcP2ssB; srcP2ssA = srcP3ssA; srcP2ssB = srcP3ssB; pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss); pp2A = vec_mladd(sum2A, v5ss, zero_s16v); pp2B = vec_mladd(sum2B, v5ss, zero_s16v); pp3A = vec_add(sum3A, pp1A); pp3B = vec_add(sum3B, pp1B); psumA = vec_sub(pp3A, pp2A); psumB = vec_sub(pp3B, pp2B); sumA = vec_sra(psumA, v5us); sumB = vec_sra(psumB, v5us); sum = vec_packsu(sumA, sumB); ASSERT_ALIGNED(dst); OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst)); vec_st(fsum, 0, dst); dst += dstStride; } }
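/* For reference: the vertical pass applies the same (1, -5, 20, 20, -5, 1)
   kernel down each column; the srcM2ssA = srcM1ssA; ... assignment chain
   above is a sliding window that keeps six rows live so only one new row is
   loaded per iteration. Scalar sketch of the "put" case, reusing the
   hypothetical clip_uint8 from the horizontal sketch: */
static void h264_qpel16_v_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                      int dstStride, int srcStride)
{
    int i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++) {
            int v = (src[j - 2 * srcStride] + src[j + 3 * srcStride])
                  - 5 * (src[j - srcStride] + src[j + 2 * srcStride])
                  + 20 * (src[j]            + src[j + srcStride]);
            dst[j] = clip_uint8((v + 16) >> 5);
        }
        src += srcStride;
        dst += dstStride;
    }
}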
/* this code assumes that stride % 16 == 0 */ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); signed int ABCD[4] __attribute__((aligned(16))); register int i; ABCD[0] = ((8 - x) * (8 - y)); ABCD[1] = ((x) * (8 - y)); ABCD[2] = ((8 - x) * (y)); ABCD[3] = ((x) * (y)); const vector signed int vABCD = vec_ld(0, ABCD); const vector signed short vA = vec_splat((vector signed short)vABCD, 1); const vector signed short vB = vec_splat((vector signed short)vABCD, 3); const vector signed short vC = vec_splat((vector signed short)vABCD, 5); const vector signed short vD = vec_splat((vector signed short)vABCD, 7); const vector signed int vzero = vec_splat_s32(0); const vector signed short v32ss = (const vector signed short)AVV(32); const vector unsigned short v6us = vec_splat_u16(6); vector unsigned char fperm; if (((unsigned long)dst) % 16 == 0) { fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); } else { fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); } register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; vector unsigned char vsrcAuc; vector unsigned char vsrcBuc; vector unsigned char vsrcperm0; vector unsigned char vsrcperm1; vsrcAuc = vec_ld(0, src); if (loadSecond) vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vector unsigned char vsrc0uc; vector unsigned char vsrc1uc; vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) vsrc1uc = vsrcBuc; else vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vector signed short vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc); vector signed short vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc); if (!loadSecond) {// -> !reallyBadAlign for (i = 0 ; i < h ; i++) { vector unsigned char vsrcCuc; vsrcCuc = vec_ld(stride + 0, src); vector unsigned char vsrc2uc; vector unsigned char vsrc3uc; vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); vector signed short psum; psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v32ss, psum); psum = vec_sra(psum, v6us); vector unsigned char vdst = vec_ld(0, dst); vector unsigned char ppsum = (vector unsigned char)vec_packsu(psum, psum); vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, vfdst, vdst); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } else { for (i = 0 ; i < h ; i++) { vector unsigned char vsrcCuc; vector unsigned char vsrcDuc; vsrcCuc = vec_ld(stride + 0, src); vsrcDuc = vec_ld(stride + 16, src); vector unsigned char vsrc2uc; vector unsigned char vsrc3uc; vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); if (reallyBadAlign) vsrc3uc =
vsrcDuc; else vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc); vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc); vector signed short psum; psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vC, vsrc2ssH, psum); psum = vec_mladd(vD, vsrc3ssH, psum); psum = vec_add(v32ss, psum); psum = vec_sr(psum, v6us); vector unsigned char vdst = vec_ld(0, dst); vector unsigned char ppsum = (vector unsigned char)vec_pack(psum, psum); vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm); vector unsigned char fsum; OP_U8_ALTIVEC(fsum, vfdst, vdst); vec_st(fsum, 0, dst); vsrc0ssH = vsrc2ssH; vsrc1ssH = vsrc3ssH; dst += stride; src += stride; } } POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); }
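/* For reference: OP_U8_ALTIVEC abstracts the put/avg flavours of the
   PREFIX_ functions here. In FFmpeg's sources the pair is typically defined
   along these lines (quoted from memory, so treat it as an assumption, not
   as the exact text of this file): */
#define PUT_OP_U8_ALTIVEC(d, s, dst)  d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst)  d = vec_avg(dst, s)
/* "put" stores the filtered pixels as-is, while "avg" rounds them against
   the pixels already present in dst. */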
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { register int i; LOAD_ZERO; const vec_u8 permM2 = vec_lvsl(-2, src); const vec_u8 permM1 = vec_lvsl(-1, src); const vec_u8 permP0 = vec_lvsl(+0, src); const vec_u8 permP1 = vec_lvsl(+1, src); const vec_u8 permP2 = vec_lvsl(+2, src); const vec_u8 permP3 = vec_lvsl(+3, src); const vec_s16 v5ss = vec_splat_s16(5); const vec_u16 v5us = vec_splat_u16(5); const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; register int align = ((((unsigned long)src) - 2) % 16); vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, srcP2A, srcP2B, srcP3A, srcP3B, srcM1A, srcM1B, srcM2A, srcM2B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, psumA, psumB, sumA, sumB; vec_u8 sum, fsum; for (i = 0 ; i < 16 ; i ++) { vec_u8 srcR1 = vec_ld(-2, src); vec_u8 srcR2 = vec_ld(14, src); switch (align) { default: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = vec_perm(srcR1, srcR2, permP3); } break; case 11: { srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = vec_perm(srcR1, srcR2, permP2); srcP3 = srcR2; } break; case 12: { vec_u8 srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = vec_perm(srcR1, srcR2, permP1); srcP2 = srcR2; srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 13: { vec_u8 srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); srcP1 = srcR2; srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 14: { vec_u8 srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = srcR2; srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 15: { vec_u8 srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = srcR2; srcP0 = vec_perm(srcR2, srcR3, permP0); srcP1 = vec_perm(srcR2, srcR3, permP1); srcP2 = vec_perm(srcR2, srcR3, permP2); srcP3 = vec_perm(srcR2, srcR3, permP3); } break; } srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); sum1A = vec_adds(srcP0A, srcP1A); sum1B = vec_adds(srcP0B, srcP1B); sum2A = vec_adds(srcM1A, srcP2A); sum2B = vec_adds(srcM1B, srcP2B); sum3A = vec_adds(srcM2A, srcP3A); sum3B = vec_adds(srcM2B, srcP3B); pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss); pp2A = vec_mladd(sum2A, v5ss, zero_s16v); pp2B = vec_mladd(sum2B, v5ss, zero_s16v); pp3A = 
vec_add(sum3A, pp1A); pp3B = vec_add(sum3B, pp1B); psumA = vec_sub(pp3A, pp2A); psumB = vec_sub(pp3B, pp2B); sumA = vec_sra(psumA, v5us); sumB = vec_sra(psumB, v5us); sum = vec_packsu(sumA, sumB); ASSERT_ALIGNED(dst); OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst)); vec_st(fsum, 0, dst); src += srcStride; dst += dstStride; } }
void YUV422_to_BGRA_altivec(const unsigned char *yuvdata, size_t pixelnum, unsigned char *output) { const vector unsigned char *UYVY_ptr=reinterpret_cast<const vector unsigned char *>(yuvdata); vector unsigned char *BGRA_ptr=reinterpret_cast<vector unsigned char *>(output); vector unsigned int vShift; vector signed short tempU, tempV, tempY, tempUV, out1, out2, out3, out4; vector signed short v16, v128, a255, szero, one; vector unsigned char zero; vector signed short t0, t1, t2, tempGB1, tempGB2, tempRA1, tempRA2; vector signed short vU_G, vV_G, vU_B, vU_R, y0, hiImage, loImage; /* the 32-bit intermediates are signed: vec_mule/vec_mulo on signed shorts yield vector signed int */ vector signed int uv_rEven, uv_rOdd, uv_rHi, uv_rLo, uv_gUEven, uv_gVEven, uv_gUOdd, uv_gVOdd, uv_gHi, uv_gLo, uv_bEven, uv_bOdd; vector signed int tempUhi, tempUlo, tempVhi, tempVlo; vector signed int yEven, yOdd; vector signed int t0Even, t0Odd, t1Even, t1Odd, t2Even, t2Odd; /* Load the equation constants. */ vector signed short vConst = { 298, 519, 409, 16, 128, 255, -100, -210 }; vector unsigned char vPerm1 = { 0, 1, 16, 17, 8, 9, 24, 25, 2, 3, 18, 19, 10, 11, 26, 27 }; vector unsigned char vPerm2 = { 4, 5, 20, 21, 12, 13, 28, 29, 6, 7, 22, 23, 14, 15, 30, 31 }; vector unsigned char vPermY = { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; vector unsigned char vPermU = { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 }; vector unsigned char vPermV = { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 }; vector unsigned char vOutPerm1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; vector unsigned char vOutPerm2 = { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 }; vector unsigned char uvPerm = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; zero = vec_splat_u8(0); szero = vec_splat_s16(0); one = vec_splat_s16(1); vShift = vec_splat_u32(8); a255 = vec_splat( vConst, 5 ); // alpha channel = 255 vU_G = vec_splat( vConst, 6 ); // -100 vV_G = vec_splat( vConst, 7 ); // -210 vU_B = vec_splat( vConst, 1 ); // 519 vU_R = vec_splat( vConst, 2 ); // 409 y0 = vec_splat( vConst, 0 ); // 298 v16 = vec_splat( vConst, 3 ); // 16 v128 = vec_splat( vConst, 4 ); // 128 for ( unsigned int i = 0; i < (pixelnum/sizeof(vector unsigned char)); i++ ) { // Load UYVY input vector const vector unsigned char *vec1 = UYVY_ptr++; // expand the uint8s to shorts hiImage = static_cast<vector signed short>(vec_mergeh( zero, *vec1 )); loImage = static_cast<vector signed short>(vec_mergel( zero, *vec1 )); tempUV = static_cast<vector signed short>(vec_perm( hiImage, loImage, uvPerm )); tempY = static_cast<vector signed short>(vec_perm( hiImage, loImage, vPermY )); // subtract UV_OFFSET from UV's (should this be saturated?) tempUV = static_cast<vector signed short>(vec_sub( tempUV, v128 )); // subtract Y-OFFSET from Y's (should this be saturated?)
tempY = static_cast<vector signed short>(vec_sub( tempY, v16 )); // expand to UUUU UUUU and VVVV VVVV tempU = vec_perm(tempUV, tempUV, vPermU); tempV = vec_perm(tempUV, tempUV, vPermV); /* widen U and V to 32-bit ints; vec_mule/vec_mulo( signed short, signed short ) return vector signed int */ tempUhi = vec_mule( tempU, one ); tempUlo = vec_mulo( tempU, one ); tempVhi = vec_mule( tempV, one ); tempVlo = vec_mulo( tempV, one ); // uv_r = YUV2RGB_12*u + YUV2RGB_13*v // uv_r = (-1)*u + 409*v (or "409*V - U") uv_rEven = vec_mule( tempV, vU_R ); uv_rOdd = vec_mulo( tempV, vU_R ); uv_rHi = vec_sub( uv_rEven, tempUhi ); uv_rLo = vec_sub( uv_rOdd, tempUlo ); // uv_g = YUV2RGB_22*u + YUV2RGB_23*v // uv_g = -100*u + (-210)*v // multiply U by -100 uv_gUEven = vec_mule( tempU, vU_G ); uv_gUOdd = vec_mulo( tempU, vU_G ); // multiply V by -210 uv_gVEven = vec_mule( tempV, vV_G ); uv_gVOdd = vec_mulo( tempV, vV_G ); // add U & V products uv_gHi = vec_add( uv_gUEven, uv_gVEven ); uv_gLo = vec_add( uv_gUOdd, uv_gVOdd ); // uv_b = YUV2RGB_32*u + YUV2RGB_33*v // uv_b = 519*u + 0*v uv_bEven = vec_mule( tempU, vU_B ); uv_bOdd = vec_mulo( tempU, vU_B ); // y = YUV2RGB_11 * tempY // y = 298* (tempY - 16) yEven = vec_mule( tempY, y0 ); yOdd = vec_mulo( tempY, y0 ); // add while still in 32-bit ints t0Even = vec_add( yEven, uv_bEven ); t0Odd = vec_add( yOdd, uv_bOdd ); t1Even = vec_add( yEven, uv_gHi ); t1Odd = vec_add( yOdd, uv_gLo ); t2Even = vec_add( yEven, uv_rHi ); t2Odd = vec_add( yOdd, uv_rLo ); // shift while still in 32-bit ints t0Even = vec_sra( t0Even, vShift ); t0Odd = vec_sra( t0Odd, vShift ); t1Even = vec_sra( t1Even, vShift ); t1Odd = vec_sra( t1Odd, vShift ); t2Even = vec_sra( t2Even, vShift ); t2Odd = vec_sra( t2Odd, vShift ); // pack down to shorts t0 = vec_packs( t0Even, t0Odd ); t1 = vec_packs( t1Even, t1Odd ); t2 = vec_packs( t2Even, t2Odd ); // Permute to GBGBGBGB GBGBGBGB + re-interleave even & odd tempGB1 = vec_perm( t1, t0, vPerm1 ); tempGB2 = vec_perm( t1, t0, vPerm2 ); // Permute to ARARARAR ARARARAR + re-interleave even & odd tempRA1 = vec_perm( a255, t2, vPerm1 ); tempRA2 = vec_perm( a255, t2, vPerm2 ); // Permute to ARGBs out1 = vec_perm( tempRA1, tempGB1, vOutPerm1 ); out2 = vec_perm( tempRA1, tempGB1, vOutPerm2 ); out3 = vec_perm( tempRA2, tempGB2, vOutPerm1 ); out4 = vec_perm( tempRA2, tempGB2, vOutPerm2 ); // pack down to chars *BGRA_ptr = vec_packsu( out1, out2 ); BGRA_ptr++; *BGRA_ptr = vec_packsu( out3, out4 ); BGRA_ptr++; } }
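/* For reference: per pixel, the loop above evaluates these fixed-point
   equations (constants from vConst, >> 8 matching vShift) before saturating
   with vec_packs/vec_packsu. A worked scalar form, assuming the same
   constants; uyvy_to_rgb_ref is a hypothetical helper: */
static void uyvy_to_rgb_ref(int y, int u, int v, int *r, int *g, int *b)
{
    int Y = 298 * (y - 16);        /* y0 = 298, v16 = 16 */
    int U = u - 128, V = v - 128;  /* v128 = 128         */
    *r = (Y + 409 * V - U) >> 8;        /* uv_r = 409*v - u         */
    *g = (Y - 100 * U - 210 * V) >> 8;  /* vU_G = -100, vV_G = -210 */
    *b = (Y + 519 * U) >> 8;            /* vU_B = 519               */
    /* the results are then clamped to [0, 255] and interleaved with
       alpha = 255 (a255) by the permute/pack steps above */
}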