/*
 * Convert one vector of Y, U and V samples into R, G and B vectors using the
 * coefficient vectors stored in the scaler context.
 *
 *   c        context; the CY, OY, CBU, CRV, CGU and CGV vectors and the
 *            CSHIFT shift counts are read from it
 *   Y, U, V  input samples, one signed 16-bit value per lane
 *   R, G, B  receive the converted lanes
 *
 * Every product is formed with vec_mradds; the formulas in the inline
 * comments spell it out as ((a * b + 0x4000) >> 15), to which the third
 * argument is then added.
 */
static inline void cvtyuvtoRGB(SwsContext *c,
                               vector signed short Y,
                               vector signed short U,
                               vector signed short V,
                               vector signed short *R,
                               vector signed short *G,
                               vector signed short *B)
{
    /* 128 in every lane; subtracted from both chroma inputs. */
    vector signed short bias128 =
        (vector signed short) vec_splat((vector signed short) { 128 }, 0);
    vector signed short vx, ux, uvx;

    Y = vec_mradds(Y, c->CY, c->OY);
    U = vec_sub(U, bias128);
    V = vec_sub(V, bias128);

    /* ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15; */
    ux = vec_sl(U, c->CSHIFT);
    *B = vec_mradds(ux, c->CBU, Y);

    /* vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15; */
    vx = vec_sl(V, c->CSHIFT);
    *R = vec_mradds(vx, c->CRV, Y);

    /* uvx = ((CGU * u) + (CGV * v)) >> 15; */
    uvx = vec_mradds(U, c->CGU, Y);
    *G = vec_mradds(V, c->CGV, uvx);
}

/*
 * ------------------------------------------------------------------------------
 * CS converters
 * ------------------------------------------------------------------------------
 */
void BGRA_to_YCbCr_altivec(const unsigned char *bgradata, size_t BGRA_size, unsigned char *pixels) { vector signed short r0, r1, r2, g0, g1, g2, b0, b1, b2, c0, c16, c128; vector unsigned char z0, tc0, tc1, tc2, tc3; vector signed short tr0, tr1, tg0, tg1, tb0, tb1; vector signed short t0, t1, t2, t3, t4, t5; vector signed short u1, u2, uAvg, v1, v2, vAvg, out1, out2, out3, out4, uv1, uv2; unsigned int i; const vector unsigned char *BGRA_ptr = reinterpret_cast<const vector unsigned char*>( bgradata); vector unsigned char *UYVY_ptr = reinterpret_cast<vector unsigned char*>( pixels); /* Permutation vector is used to extract the interleaved BGRA. */ vector unsigned char vPerm1 = static_cast<vector unsigned char>( 3, 7, 11, 15, 19, 23, 27, 31, // B0..B7 2, 6, 10, 14, 18, 22, 26, 30 /* G0..G7 */); vector unsigned char vPerm2 = static_cast<vector unsigned char>( 1, 5, 9, 13, 17, 21, 25, 29, /* R0..R7 */ 0, 0, 0, 0, 0, 0, 0, 0 /* dont care */); /* Load the equation constants. */ vector signed short vConst1 = static_cast<vector signed short>( 8432, 16425, 3176, -4818, -9527, 14345, 0, 0 ); vector signed short vConst2 = static_cast<vector signed short>( 14345, -12045, -2300, 16, 128, 0, 0, 0 ); vector unsigned char avgPerm1 = static_cast<vector unsigned char>( 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 ); vector unsigned char avgPerm2 = static_cast<vector unsigned char>( 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 ); vector unsigned char Perm1 = static_cast<vector unsigned char>( 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 ); vector unsigned char Perm2 = static_cast<vector unsigned char>( 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 ); r0 = vec_splat( vConst1, 2 ); /* 8432 */ g0 = vec_splat( vConst1, 1 ); /* 16425 */ b0 = vec_splat( vConst1, 0 ); /* 3176 */ r1 = vec_splat( vConst1, 5 ); /* -4818 */ g1 = vec_splat( vConst1, 4 ); /* -9527 */ b1 = vec_splat( vConst1, 3 ); /* 14345 */ r2 = vec_splat( vConst2, 2 ); /* 14345 
*/ g2 = vec_splat( vConst2, 1 ); /*-12045 */ b2 = vec_splat( vConst2, 0 ); /* -2300 */ c16 = vec_splat( vConst2, 3 ); /* 16 */ c128 = vec_splat( vConst2, 4 ); /* 128 */ c0 = static_cast<vector signed short> (0); /* 0 */ z0 = static_cast<vector unsigned char> (0); /* 0 */ for ( i = 0; i < (BGRA_size/sizeof(vector unsigned char)); i++ ) { /* Load the 4 BGRA input vectors and seperate into red, green and blue from the interleaved format. */ const vector unsigned char *vec1 = BGRA_ptr++; const vector unsigned char *vec2 = BGRA_ptr++; const vector unsigned char *vec3 = BGRA_ptr++; const vector unsigned char *vec4 = BGRA_ptr++; tc0 = vec_perm( *vec1, *vec2, vPerm1 ); // B0..B7 G0..G7 tc1 = vec_perm( *vec1, *vec2, vPerm2 ); // R0..R7 tc2 = vec_perm( *vec3, *vec4, vPerm1 ); // B8..B15 G8..G15 tc3 = vec_perm( *vec3, *vec4, vPerm2 ); // R8..R15 /* Unpack to 16 bit arithmatic for conversion. */ tr0 = static_cast<vector signed short>(vec_mergeh( z0, tc0 )); /* tr0 = R0 .. R7 */ tg0 = static_cast<vector signed short>(vec_mergel( z0, tc0 )); /* tg0 = G0 .. G7 */ tb0 = static_cast<vector signed short>(vec_mergeh( z0, tc1 )); /* tb0 = B0 .. B7 */ tr1 = static_cast<vector signed short>(vec_mergeh( z0, tc2 )); /* tr0 = R8 .. R15 */ tg1 = static_cast<vector signed short>(vec_mergel( z0, tc2 )); /* tg0 = G8 .. G15 */ tb1 = static_cast<vector signed short>(vec_mergeh( z0, tc3 )); /* tb0 = B8 .. B15 */ /* Convert the first three input vectors. Note that only the top 17 bits of the 32 bit product are stored. This is the same as doing the divide by 32768. */ t0 = vec_mradds( tr0, r0, c0 ); /* (R0 .. R7) * 8432 */ t1 = vec_mradds( tr0, r1, c0 ); /* (R0 .. R7) * -4818 */ t2 = vec_mradds( tr0, r2, c0 ); /* (R0 .. R7) * 14345 */ t0 = vec_mradds( tg0, g0, t0 ); /* += (G0 .. G7) * 16425 */ t1 = vec_mradds( tg0, g1, t1 ); /* += (G0 .. G7) * -9527 */ t2 = vec_mradds( tg0, g2, t2 ); /* += (G0 .. G7) * -12045 */ t0 = vec_mradds( tb0, b0, t0 ); /* += (B0 .. 
B7) * 3176 */ t1 = vec_mradds( tb0, b1, t1 ); /* += (B0 .. B7) * 14345 */ t2 = vec_mradds( tb0, b2, t2 ); /* += (B0 .. B7) * -2300 */ /* Convert the next three input vectors. */ t3 = vec_mradds( tr1, r0, c0 ); /* (R8 .. R15) * 8432 */ t4 = vec_mradds( tr1, r1, c0 ); /* (R8 .. R15) * -4818 */ t5 = vec_mradds( tr1, r2, c0 ); /* (R8 .. R15) * 14345 */ t3 = vec_mradds( tg1, g0, t3 ); /* += (G8 .. G15) * 16425 */ t4 = vec_mradds( tg1, g1, t4 ); /* += (G8 .. G15) * -9527 */ t5 = vec_mradds( tg1, g2, t5 ); /* += (G8 .. G15) * -12045 */ t3 = vec_mradds( tb1, b0, t3 ); /* += (B8 .. B15) * 3176 */ t4 = vec_mradds( tb1, b1, t4 ); /* += (B8 .. B15) * 14345 */ t5 = vec_mradds( tb1, b2, t5 ); /* += (B8 .. B15) * -2300 */ /* Add the constants. */ t0 = vec_adds( t0, c16 ); t3 = vec_adds( t3, c16 ); t1 = vec_adds( t1, c128 ); t4 = vec_adds( t4, c128 ); t2 = vec_adds( t2, c128 ); t5 = vec_adds( t5, c128 ); u1 = vec_perm( t1, t4, avgPerm1 ); // rearrange U's for averaging u2 = vec_perm( t1, t4, avgPerm2 ); uAvg = vec_avg( u1, u2 ); v1 = vec_perm( t2, t5, avgPerm1 ); // rearrange V's for averaging v2 = vec_perm( t2, t5, avgPerm2 ); vAvg = vec_avg( v1, v2 ); uv1 = vec_perm( uAvg, vAvg, Perm1 ); uv2 = vec_perm( uAvg, vAvg, Perm2 ); out1 = vec_perm( uv1, t0, Perm1 ); out2 = vec_perm( uv1, t0, Perm2 ); out3 = vec_perm( uv2, t3, Perm1 ); out4 = vec_perm( uv2, t3, Perm2 ); *UYVY_ptr = vec_packsu( out1, out2 ); // pack down to char's UYVY_ptr++; *UYVY_ptr = vec_packsu( out3, out4 ); UYVY_ptr++; } }
void dct_vmx (vector signed short *input, vector signed short *output, vector signed short *postscale) { vector signed short mul0, mul1, mul2, mul3, mul4, mul5, mul6, mul; vector signed short v0, v1, v2, v3, v4, v5, v6, v7, v8, v9; vector signed short v20, v21, v22, v23, v24, v25, v26, v27, v31; int i; vector signed short in[8], out[8]; /* Load first eight rows of input data */ /* Load multiplication constants */ /* Splat multiplication constants */ mul0 = vec_splat(input[8],0); mul1 = vec_splat(input[8],1); mul2 = vec_splat(input[8],2); mul3 = vec_splat(input[8],3); mul4 = vec_splat(input[8],4); mul5 = vec_splat(input[8],5); mul6 = vec_splat(input[8],6); /* Perform DCT on the eight columns */ /*********** Stage 1 ***********/ v8 = vec_adds (input[0], input[7]); v9 = vec_subs (input[0], input[7]); v0 = vec_adds (input[1], input[6]); v7 = vec_subs (input[1], input[6]); v1 = vec_adds (input[2], input[5]); v6 = vec_subs (input[2], input[5]); v2 = vec_adds (input[3], input[4]); v5 = vec_subs (input[3], input[4]); /*********** Stage 2 ***********/ /* Top */ v3 = vec_adds (v8, v2); /* (V0+V7) + (V3+V4) */ v4 = vec_subs (v8, v2); /* (V0+V7) - (V3+V4) */ v2 = vec_adds (v0, v1); /* (V1+V6) + (V2+V5) */ v8 = vec_subs (v0, v1); /* (V1+V6) - (V2+V5) */ /* Bottom */ v0 = vec_subs (v7, v6); /* (V1-V6) - (V2-V5) */ v1 = vec_adds (v7, v6); /* (V1-V6) + (V2-V5) */ /*********** Stage 3 ***********/ /* Top */ in[0] = vec_adds (v3, v2); /* y0 = v3 + v2 */ in[4] = vec_subs (v3, v2); /* y4 = v3 - v2 */ in[2] = vec_mradds (v8, mul2, v4); /* y2 = v8 * a0 + v4 */ v6 = vec_mradds (v4, mul2, mul6); in[6] = vec_subs (v6, v8); /* y6 = v4 * a0 - v8 */ /* Bottom */ v6 = vec_mradds (v0, mul0, v5); /* v6 = v0 * (c4) + v5 */ v7 = vec_mradds (v0, mul4, v5); /* v7 = v0 * (-c4) + v5 */ v2 = vec_mradds (v1, mul4, v9); /* v2 = v1 * (-c4) + v9 */ v3 = vec_mradds (v1, mul0, v9); /* v3 = v1 * (c4) + v9 */ /*********** Stage 4 ***********/ /* Bottom */ in[1] = vec_mradds (v6, mul3, v3); /* y1 = v6 * (a1) + 
v3 */ v23 = vec_mradds (v3, mul3, mul6); in[7] = vec_subs (v23, v6); /* y7 = v3 * (a1) - v6 */ in[5] = vec_mradds (v2, mul1, v7); /* y5 = v2 * (a2) + v7 */ in[3] = vec_mradds (v7, mul5, v2); /* y3 = v7 * (-a2) + v2 */ transpose_vmx (in, out); /* Perform DCT on the eight rows */ /*********** Stage 1 ***********/ v8 = vec_adds (out[0], out[7]); v9 = vec_subs (out[0], out[7]); v0 = vec_adds (out[1], out[6]); v7 = vec_subs (out[1], out[6]); v1 = vec_adds (out[2], out[5]); v6 = vec_subs (out[2], out[5]); v2 = vec_adds (out[3], out[4]); v5 = vec_subs (out[3], out[4]); /*********** Stage 2 ***********/ /* Top */ v3 = vec_adds (v8, v2); /* (V0+V7) + (V3+V4) */ v4 = vec_subs (v8, v2); /* (V0+V7) - (V3+V4) */ v2 = vec_adds (v0, v1); /* (V1+V6) + (V2+V5) */ v8 = vec_subs (v0, v1); /* (V1+V6) - (V2+V5) */ /* Bottom */ v0 = vec_subs (v7, v6); /* (V1-V6) - (V2-V5) */ v1 = vec_adds (v7, v6); /* (V1-V6) + (V2-V5) */ /*********** Stage 3 ***********/ /* Top */ v25 = vec_subs (v25, v25); /* reinit v25 = 0 */ v20 = vec_adds (v3, v2); /* y0 = v3 + v2 */ v24 = vec_subs (v3, v2); /* y4 = v3 - v2 */ v22 = vec_mradds (v8, mul2, v4); /* y2 = v8 * a0 + v4 */ v6 = vec_mradds (v4, mul2, v25); v26 = vec_subs (v6, v8); /* y6 = v4 * a0 - v8 */ /* Bottom */ v6 = vec_mradds (v0, mul0, v5); /* v6 = v0 * (c4) + v5 */ v7 = vec_mradds (v0, mul4, v5); /* v7 = v0 * (-c4) + v5 */ v2 = vec_mradds (v1, mul4, v9); /* v2 = v1 * (-c4) + v9 */ v3 = vec_mradds (v1, mul0, v9); /* v3 = v1 * (c4) + v9 */ /*********** Stage 4 ***********/ /* Bottom */ v21 = vec_mradds (v6, mul3, v3); /* y1 = v6 * (a1) + v3 */ v23 = vec_mradds (v3, mul3, v25); v27 = vec_subs (v23, v6); /* y7 = v3 * (a1) - v6 */ v25 = vec_mradds (v2, mul1, v7); /* y5 = v2 * (a2) + v7 */ v23 = vec_mradds (v7, mul5, v2); /* y3 = v7 * (-a2) + v2 */ /* Post-scale and store reults */ v31 = vec_subs (v31, v31); /* reinit v25 = 0 */ output[0] = vec_mradds (postscale[0], v20, v31); output[2] = vec_mradds (postscale[2], v22, v31); output[4] = vec_mradds 
(postscale[4], v24, v31); output[6] = vec_mradds (postscale[6], v26, v31); output[1] = vec_mradds (postscale[1], v21, v31); output[3] = vec_mradds (postscale[3], v23, v31); output[5] = vec_mradds (postscale[5], v25, v31); output[7] = vec_mradds (postscale[7], v27, v31); }
void BGR_to_YCbCr_altivec(const unsigned char *bgrdata, size_t BGR_size, unsigned char *pixels) { vector signed short r0, r1, r2, g0, g1, g2, b0, b1, b2, c0, c16, c128; vector unsigned char z0, tc0, tc1, tc2, tc3; vector signed short tr0, tr1, tg0, tg1, tb0, tb1; vector signed short t0, t1, t2, t3, t4, t5; unsigned int i; const vector unsigned char *BGR_ptr = reinterpret_cast<const vector unsigned char*>( bgrdata); vector unsigned char *YCC_ptr = reinterpret_cast<vector unsigned char*>( pixels); /* Permutation vector is used to extract the interleaved RGB. */ vector unsigned char vPerm1 = static_cast<vector unsigned char>( 0, 3, 6, 9, 12, 15, 18, 21, /* R0..R7 */ 1, 4, 7, 10, 13, 16, 19, 22 /* G0..G7 */); vector unsigned char vPerm2 = static_cast<vector unsigned char>( 2, 5, 8, 11, 14, 17, 20, 23, /* B0..B7 */ 0, 0, 0, 0, 0, 0, 0, 0 /* dont care */); vector unsigned char vPerm3 = static_cast<vector unsigned char>( 8, 11, 14, 17, 20, 23, 26, 29, /* R8..R15 */ 9, 12, 15, 18, 21, 24, 27, 30 /* G8..G15 */); vector unsigned char vPerm4 = static_cast<vector unsigned char>(10, 13, 16, 19, 22, 25, 28, 31, /* B8..B15 */ 0, 0, 0, 0, 0, 0, 0, 0 /* dont care */); /* Load the equation constants. 
*/ vector signed short vConst1 = static_cast<vector signed short>( 8432, 16425, 3176, -4818, -9527, 14345, 0, 0 ); vector signed short vConst2 = static_cast<vector signed short>( 14345, -12045, -2300, 16, 128, 0, 0, 0 ); r0 = vec_splat( vConst1, 0 ); /* 8432 */ g0 = vec_splat( vConst1, 1 ); /* 16425 */ b0 = vec_splat( vConst1, 2 ); /* 3176 */ r1 = vec_splat( vConst1, 3 ); /* -4818 */ g1 = vec_splat( vConst1, 4 ); /* -9527 */ b1 = vec_splat( vConst1, 5 ); /* 14345 */ r2 = vec_splat( vConst2, 0 ); /* 14345 */ g2 = vec_splat( vConst2, 1 ); /*-12045 */ b2 = vec_splat( vConst2, 2 ); /* -2300 */ c16 = vec_splat( vConst2, 3 ); /* 16 */ c128 = vec_splat( vConst2, 4 ); /* 128 */ c0 = static_cast<vector signed short> (0); /* 0 */ z0 = static_cast<vector unsigned char> (0); /* 0 */ for ( i = 0; i < (BGR_size/sizeof(vector unsigned char)); i+=3 ) { /* Load the 3 RGB input vectors and seperate into red, green and blue from the interleaved format. */ tc0 = vec_perm( BGR_ptr[i], BGR_ptr[i+1], vPerm1 ); /* R0..R7 G0..G7 */ tc1 = vec_perm( BGR_ptr[i], BGR_ptr[i+1], vPerm2 ); /* B0..B7 */ tc2 = vec_perm( BGR_ptr[i+1], BGR_ptr[i+2], vPerm3 ); /* R8..R15 G8..G15 */ tc3 = vec_perm( BGR_ptr[i+1], BGR_ptr[i+2], vPerm4 ); /* B8..B15 */ /* Unpack to 16 bit arithmatic for converstion. */ tr0 = static_cast<vector signed short>(vec_mergeh( z0, tc0 )); /* tr0 = R0 .. R7 */ tg0 = static_cast<vector signed short>(vec_mergel( z0, tc0 )); /* tg0 = G0 .. G7 */ tb0 = static_cast<vector signed short>(vec_mergeh( z0, tc1 )); /* tb0 = B0 .. B7 */ tr1 = static_cast<vector signed short>(vec_mergeh( z0, tc2 )); /* tr0 = R8 .. R15 */ tg1 = static_cast<vector signed short>(vec_mergel( z0, tc2 )); /* tg0 = G8 .. G15 */ tb1 = static_cast<vector signed short>(vec_mergeh( z0, tc3 )); /* tb0 = B8 .. B15 */ /* Convert the first three input vectors. Note that only the top 17 bits of the 32 bit product are stored. This is the same as doing the divide by 32768. */ t0 = vec_mradds( tr0, r0, c0 ); /* (R0 .. 
R7) * 8432 */ t1 = vec_mradds( tr0, r1, c0 ); /* (R0 .. R7) * -4818 */ t2 = vec_mradds( tr0, r2, c0 ); /* (R0 .. R7) * 14345 */ t0 = vec_mradds( tg0, g0, t0 ); /* += (G0 .. G7) * 16425 */ t1 = vec_mradds( tg0, g1, t1 ); /* += (G0 .. G7) * -9527 */ t2 = vec_mradds( tg0, g2, t2 ); /* += (G0 .. G7) * -12045 */ t0 = vec_mradds( tb0, b0, t0 ); /* += (B0 .. B7) * 3176 */ t1 = vec_mradds( tb0, b1, t1 ); /* += (B0 .. B7) * 14345 */ t2 = vec_mradds( tb0, b2, t2 ); /* += (B0 .. B7) * -2300 */ /* Convert the next three input vectors. */ t3 = vec_mradds( tr1, r0, c0 ); /* (R8 .. R15) * 8432 */ t4 = vec_mradds( tr1, r1, c0 ); /* (R8 .. R15) * -4818 */ t5 = vec_mradds( tr1, r2, c0 ); /* (R8 .. R15) * 14345 */ t3 = vec_mradds( tg1, g0, t3 ); /* += (G8 .. G15) * 16425 */ t4 = vec_mradds( tg1, g1, t4 ); /* += (G8 .. G15) * -9527 */ t5 = vec_mradds( tg1, g2, t5 ); /* += (G8 .. G15) * -12045 */ t3 = vec_mradds( tb1, b0, t3 ); /* += (B8 .. B15) * 3176 */ t4 = vec_mradds( tb1, b1, t4 ); /* += (B8 .. B15) * 14345 */ t5 = vec_mradds( tb1, b2, t5 ); /* += (B8 .. B15) * -2300 */ /* Add the constants. */ t0 = vec_adds( t0, c16 ); t3 = vec_adds( t3, c16 ); t1 = vec_adds( t1, c128 ); t4 = vec_adds( t4, c128 ); t2 = vec_adds( t2, c128 ); t5 = vec_adds( t5, c128 ); /* Pack the results, and store them. */ YCC_ptr[i] = vec_packsu( t0, t3 ); /* Y0 .. Y15 */ YCC_ptr[i+1] = vec_packsu( t1, t4 ); /* Cb0 .. Cb15 */ YCC_ptr[i+2] = vec_packsu( t2, t5 ); /* Cr0 .. Cr15 */ } }