/* out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
 *
 * H.264 deblocking: compute the filtered p1 sample for 16 pixels at once.
 * All math is done with saturating unsigned byte ops.
 */
static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t p2, register vec_u8_t q0, register vec_u8_t tc0)
{
    /* vec_avg rounds up: avg(p0, q0) == (p0 + q0 + 1) >> 1 */
    register vec_u8_t avg_pq = vec_avg(p0, q0);
    /* vec_avg(avg_pq, p2) rounds up again, but the target expression
     * truncates; subtracting ((p2 ^ avg_pq) & 1) cancels the extra
     * rounding bit, yielding (p2 + ((p0+q0+1)>>1)) >> 1 exactly. */
    register vec_u8_t round_fix = vec_and(vec_xor(avg_pq, p2), vec_splat_u8(1));
    register vec_u8_t unclipped = vec_subs(vec_avg(avg_pq, p2), round_fix);
    /* Clamp the candidate into [p1 - tc0, p1 + tc0] (saturating bounds). */
    register vec_u8_t upper = vec_adds(p1, tc0);
    register vec_u8_t lower = vec_subs(p1, tc0);
    return vec_min(upper, vec_max(lower, unclipped));
}
/* out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
 *
 * H.264 deblocking: filtered p1 for 16 lanes, saturating byte arithmetic.
 */
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0, register vector unsigned char p1, register vector unsigned char p2, register vector unsigned char q0, register vector unsigned char tc0)
{
    /* vec_avg rounds up: avg(p0, q0) == (p0 + q0 + 1) >> 1 */
    register vector unsigned char avg_pq = vec_avg(p0, q0);
    /* The second vec_avg also rounds up; subtract the parity bit
     * ((p2 ^ avg_pq) & 1) to recover the truncating shift the
     * formula above requires. */
    register vector unsigned char round_fix =
        vec_and(vec_xor(avg_pq, p2), vec_splat_u8(1));
    register vector unsigned char unclipped =
        vec_subs(vec_avg(avg_pq, p2), round_fix);
    /* Clamp into [p1 - tc0, p1 + tc0]. */
    register vector unsigned char upper = vec_adds(p1, tc0);
    register vector unsigned char lower = vec_subs(p1, tc0);
    return vec_min(upper, vec_max(lower, unclipped));
}
/* Average 8-pixel rows from 'pixels' into 'block' (block = avg(block, pixels)).
 * next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;
    for (i = 0; i < h; i++) {
        /* block is 8 bytes-aligned, so we're either in the left half of a
           16-byte vector (16 bytes-aligned) or in the right half (not) */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);
        /* Two aligned loads + perm implement an unaligned 16-byte read of 'pixels'. */
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
        /* Keep the untouched 8 bytes of blockv in place and splice the new
           8 source bytes into whichever half 'block' occupies. */
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }
        /* Rounded byte-wise average, then store the full 16 bytes back
           (the other half is unchanged by construction). */
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, block);
        pixels += line_size;
        block += line_size;
    }
}
/* Sum of absolute differences between a 16x16 block in pix1 and the
 * half-pel vertical interpolation avg(pix2, pix2 + line_size). */
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));  /* 16-aligned so vec_ste can hit it */
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;
    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    /* Because pix3 = pix2 + line_size, the pix3 row of one iteration becomes
       the pix2 row of the next. Exploit that to do only one potentially
       expensive unaligned read per iteration: load pix2[0..15] once up
       front, then reuse pix3v as next iteration's pix2v. */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
    for(i=0;i<16;i++) {
        /* Unaligned reads of the current rows:
             pix1v: pix1[0]-pix1[15]
             pix3v: pix3[0]-pix3[15] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
        /* Rounded average of the two source rows (half-pel prediction). */
        avgv = vec_avg(pix2v, pix3v);
        /* |a - b| per byte via max - min (unsigned). */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
        /* Accumulate 4-byte group sums into the 4 words of 'sad'. */
        sad = vec_sum4s(t5, sad);
        pix1 += line_size;
        pix2v = pix3v;     /* rolling register: reuse this row next time */
        pix3 += line_size;
    }
    /* Fold the four partial sums into one and extract it to 's'. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
/* dst = avg(dst, avg(src1, src2)) for h rows of 16 pixels.
 * src1 is read with stride src_stride1; src2 is read with a fixed stride
 * of 16 bytes (a packed temporary buffer). dst may be unaligned: the
 * edges/align dance below implements a read-modify-write unaligned store. */
static inline void avg_pixels16_l2_altivec( uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
    /* src2 alignment is the same every iteration (stride 16), so its
       permute vector can be hoisted out of the loop. */
    mask_ = vec_lvsl(0, src2);
    for (i = 0; i < h; i++) {
        /* Unaligned 16-byte load of src1 row i. */
        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);
        a = vec_perm(tmp1, tmp2, mask);
        /* Unaligned 16-byte load of src2 row i (stride 16). */
        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);
        b = vec_perm(tmp1, tmp2, mask_);
        /* Load current dst, average with avg(a, b). */
        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);
        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
        /* Unaligned store: keep the bytes outside [dst, dst+16) intact by
           merging the result with the preserved edge bytes, then write the
           two covering aligned vectors back. */
        edges = vec_perm(tmp2, tmp1, mask);
        align = vec_lvsr(0, dst);
        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);
        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0 , dst);
        dst += dst_stride;
    }
}
/* Test driver: checks vec_add / vec_avg results against precomputed
 * expected vectors. The operands (a1, a2, f1, f2), the result holders
 * (k, h) and the expected values (addi, avgi, addf) are declared
 * elsewhere in this test file; abort() fails the test on any mismatch. */
int main ()
{
    /* Integer vector add must equal the expected 'addi'. */
    k = vec_add (a1, a2);
    if (!vec_all_eq (addi, k)) abort ();
    /* Rounded integer average must equal the expected 'avgi'. */
    k = vec_avg (a1, a2);
    if (!vec_all_eq (k, avgi)) abort ();
    /* Float vector add must equal the expected 'addf'. */
    h = vec_add (f1, f2);
    if (!vec_all_eq (h, addf)) abort ();
    return 0;
}
/* block = avg(block, pixels) for h rows of 16 pixels.
 * NOTE(review): 'perm' is computed once from the initial 'pixels' address
 * and reused every row, and 'block' is stored with an aligned vec_st —
 * this is only correct when block is 16-byte aligned and line_size keeps
 * the alignment offset of 'pixels' constant (multiple of 16); confirm the
 * caller guarantees this. */
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);  /* hoisted permute */
    int i;
    for (i = 0; i < h; i++) {
        /* Unaligned load of pixels[0..15] via two aligned loads + perm. */
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16,pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        /* Rounded per-byte average, stored back to the aligned block. */
        blockv = vec_avg(blockv,pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }
}
/* Sum of absolute differences between a 16x16 block in pix1 and the
 * half-pel horizontal interpolation avg(pix2, pix2 + 1). */
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));  /* 16-aligned for vec_ste */
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<16;i++) {
        /* Unaligned reads of the current rows:
             pix1v:  pix1[0]-pix1[15]
             pix2v:  pix2[0]-pix2[15]
             pix2iv: pix2[1]-pix2[16] (the +1 shifted row) */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
        /* Rounded average of adjacent pixels (half-pel prediction). */
        avgv = vec_avg(pix2v, pix2iv);
        /* |a - b| per byte via max - min (unsigned bytes). */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
        /* Accumulate 4-byte group sums into the 4 words of 'sad'. */
        sad = vec_sum4s(t5, sad);
        pix1 += line_size;
        pix2 += line_size;
    }
    /* Fold the four partial sums into one word and extract it to 's'. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
/* SAD between pix1 and the half-pel horizontal interpolation
 * avg(pix2, pix2 + 1), over h rows of 16 pixels. 'v' is an unused
 * context pointer required by the function-pointer table signature.
 * NOTE(review): perm1/perm2 are computed once from the initial pix2 and
 * reused for every row — assumes line_size preserves pix2's 16-byte
 * alignment offset (multiple of 16); confirm the caller guarantees this. */
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    /* perm1 + 1 selects the same window shifted one byte right (pix2+1). */
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* One pair of aligned loads covers both unaligned windows:
             pix2v:  pix2[0]-pix2[15]
             pix2iv: pix2[1]-pix2[16] */
        pix1v = vec_ld( 0, pix1);
        pix2l = vec_ld( 0, pix2);
        pix2r = vec_ld(16, pix2);
        pix2v = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);
        /* Rounded average of adjacent pixels (half-pel prediction). */
        avgv = vec_avg(pix2v, pix2iv);
        /* |a - b| per byte via max - min (unsigned bytes). */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
        /* Accumulate 4-byte group sums into the 4 words of 'sad'. */
        sad = vec_sum4s(t5, sad);
        pix1 += line_size;
        pix2 += line_size;
    }
    /* Fold the four partial sums into one word and extract it to 's'. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
/* dst = avg(src1, src2) for h rows of 16 pixels; src1 uses stride
 * src_stride1, src2 is a packed buffer with stride 16.
 * NOTE(review): on !HAVE_BIGENDIAN builds 'mask_' is declared but never
 * initialized, yet still passed to load_with_perm_vec — presumably that
 * helper ignores the permute vector on little-endian; confirm against the
 * macro definitions. */
static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, const uint8_t * src2, int dst_stride, int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, mask_;
#if HAVE_BIGENDIAN
    vec_u8 tmp1, tmp2, mask, edges, align;
    /* src2 alignment is constant (stride 16), so its permute vector can
       be computed once outside the loop. */
    mask_ = vec_lvsl(0, src2);
#endif
    for (i = 0; i < h; i++) {
        a = unaligned_load(i * src_stride1, src1);
        b = load_with_perm_vec(i * 16, src2, mask_);
        /* Rounded per-byte average of the two source rows. */
        d = vec_avg(a, b);
        put_unligned_store(d, dst);
        dst += dst_stride;
    }
}
/* dst = avg(dst, avg(src1, src2)) for h rows of 16 pixels; src1 uses
 * stride src_stride1, src2 is a packed buffer with stride 16.
 * NOTE(review): mask1 is assigned under HAVE_BIGENDIAN and then assigned
 * again unconditionally after the #endif — the second assignment is
 * redundant on big-endian and suspicious on little-endian, where mask2,
 * mask and align stay uninitialized (presumably the helper macros ignore
 * them on LE; confirm against their definitions before touching this). */
static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, const uint8_t * src2, int dst_stride, int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, mask1, mask2;
#if HAVE_BIGENDIAN
    vec_u8 tmp1, tmp2, mask, edges, align;
    /* All alignment offsets are loop-invariant, so the permute vectors
       are computed once up front. */
    mask1 = vec_lvsl(0, src1);
    mask2 = vec_lvsl(0, src2);
    mask = vec_lvsl(0, dst);
    align = vec_lvsr(0, dst);
#endif
    mask1 = vec_lvsl(0, src1);
    for (i = 0; i < h; i++) {
        a = load_with_perm_vec(i * src_stride1, src1, mask1);
        b = load_with_perm_vec(i * 16, src2, mask2);
        /* avg(src1, src2), then averaged again with the existing dst
           inside the store helper. */
        d = vec_avg(a, b);
        avg_unaligned_store_with_mask_align(d, dst, mask, align);
        dst += dst_stride;
    }
}
/* block = avg(block, xy-half-pel(pixels)): each output pixel is the
 * rounded average of the existing block pixel with
 * (p[x,y] + p[x+1,y] + p[x,y+1] + p[x+1,y+1] + 2) >> 2.
 * next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char) vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short) vec_splat_u16(2);

    /* Prime the pipeline: load row 0 twice, once at offset 0 and once at
       offset 1 (the horizontal neighbours). */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* When 'pixels' ends exactly on a vector boundary, the +1 window is
       just the second aligned load — avoid reading a third vector. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Widen bytes to shorts (zero-extend via merge with zero). */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    /* pixelssum1 = p[x] + p[x+1] of the current row, plus the rounding
       constant 2 (carried across iterations). */
    pixelssum1 = vec_add((vector unsigned short)pixelsv1, (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        /* block is 8-byte aligned: left or right half of a 16-byte vector. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Same horizontal-pair load for the next row (pixels + line_size). */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1, (vector unsigned short)pixelsv2);
        /* 4-tap sum (with rounding bias) >> 2. */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Roll the row sum forward so next iteration reuses it. */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        /* Narrow back to bytes; upper half padded with zero. */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Splice the 8 averaged bytes into the half 'block' occupies. */
        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }
        /* Final rounded average with the existing block contents. */
        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
/* Merge (deinterlace) two source lines into the destination:
 * p_dest[i] = (p_s1[i] + p_s2[i]) >> 1 for i_bytes bytes.
 *
 * Strategy: scalar head until p_dest is 16-byte aligned, AltiVec body
 * in 16-byte steps (separate paths for aligned / unaligned sources),
 * then a scalar tail for the last < 16 bytes.
 *
 * Fix: the source-alignment test used `(int)` pointer casts, which
 * truncate pointers on LP64/LLP64 targets (implementation-defined per
 * C11 6.3.2.3); use uintptr_t as the p_dest test already does.
 */
void MergeAltivec( void *_p_dest, const void *_p_s1, const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = (uint8_t *)_p_dest;
    uint8_t *p_s1 = (uint8_t *)_p_s1;
    uint8_t *p_s2 = (uint8_t *)_p_s2;
    uint8_t *p_end = p_dest + i_bytes - 15;

    /* Use C until the first 16-bytes aligned destination pixel */
    while( (uintptr_t)p_dest & 0xF )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    if( ( (uintptr_t)p_s1 & 0xF ) | ( (uintptr_t)p_s2 & 0xF ) )
    {
        /* Unaligned source: software-pipelined vec_perm realignment —
           each iteration reuses the previous iteration's trailing load. */
        vector unsigned char s1v, s2v, destv;
        vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
        vector unsigned char perm1v, perm2v;
        perm1v = vec_lvsl( 0, p_s1 );
        perm2v = vec_lvsl( 0, p_s2 );
        s1oldv = vec_ld( 0, p_s1 );
        s2oldv = vec_ld( 0, p_s2 );
        while( p_dest < p_end )
        {
            s1newv = vec_ld( 16, p_s1 );
            s2newv = vec_ld( 16, p_s2 );
            s1v = vec_perm( s1oldv, s1newv, perm1v );
            s2v = vec_perm( s2oldv, s2newv, perm2v );
            s1oldv = s1newv;
            s2oldv = s2newv;
            /* vec_avg rounds up; matches the scalar (a + b + 1) >> 1?
               NOTE(review): the scalar paths use (a + b) >> 1 (truncating)
               while vec_avg rounds — a pre-existing 1-LSB discrepancy kept
               as-is to preserve behavior. */
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );
            p_s1 += 16;
            p_s2 += 16;
            p_dest += 16;
        }
    }
    else
    {
        /* Aligned source: plain aligned loads. */
        vector unsigned char s1v, s2v, destv;
        while( p_dest < p_end )
        {
            s1v = vec_ld( 0, p_s1 );
            s2v = vec_ld( 0, p_s2 );
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );
            p_s1 += 16;
            p_s2 += 16;
            p_dest += 16;
        }
    }

    /* Scalar tail: the remaining (i_bytes mod 16) bytes. */
    p_end += 15;
    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
/* Test bounding box for intersection with the view frustum.
 * Return val: 0 = reject
 *             1 = accept (partially inside, or eye point contained)
 *             2 = trivially accept (entirely in frustum)
 */
int cliptest_bboxf(bboxf_t bv)
{
    /* Index triples selecting (x, y, z) for each of the 8 box corners;
       bv holds [minx, miny, minz, maxx, maxy, maxz]. */
    static int corner_index[8][3] = {
        {0, 1, 2}, {3, 1, 2}, {3, 4, 2}, {0, 4, 2},
        {0, 1, 5}, {3, 1, 5}, {3, 4, 5}, {0, 4, 5}
    };
    vec4_t corner[8];
    int clipcode, clip_or, clip_and, clip_in;
    int i;

    /* Check if eye point is contained */
    if (point_test_bboxf(bv, camera.position))
        return 1;

    clip_in = clip_or = 0;
    clip_and = 0xff;
    for (i=0; i < 8; ++i) {
        /* Build the corner in homogeneous coordinates and transform it
           into clip space. */
        corner[i][0] = bv[corner_index[i][0]];
        corner[i][1] = bv[corner_index[i][1]];
        corner[i][2] = bv[corner_index[i][2]];
        corner[i][3] = 1.0;
        mat4_vmult(clipmat, corner[i], corner[i]);
        clipcode = cliptest_point(corner[i]);
        clip_or |= clipcode;    /* some corner outside some plane */
        clip_and &= clipcode;   /* all corners outside a common plane */
        if (!clipcode)
            clip_in = 1;        /* at least one corner fully inside */
    }

    /* Check for trivial acceptance/rejection */
    if (clip_and)
        return 0;   /* all corners outside one plane: reject */
    if (!clip_or)
        return 2;   /* no corner outside any plane: trivially accept */
    if (clip_in)
        return 1;   /* At least one corner in view frustum */

#if 0
    /* FIXME: need something better for this. */
    /* Maybe find maximum radius to each corner */
    {
        /* Normalize coordinates */
        vec3_t center, rad;
        float cw;
        cw = 1.0f/corner[0][3];
        vec_scale(corner[0], cw, corner[0]);
        corner[0][3] = 1.0;
        cw = 1.0f/corner[6][3];
        vec_scale(corner[6], cw, corner[6]);
        corner[6][3] = 1.0;
        /* Check for non-trivial acceptance */
        vec_avg(corner[0], corner[6], center);
        vec_sub(corner[0], center, rad);
        if (sqrt(vec_dot(center, center)) - sqrt(vec_dot(rad, rad)) <= 1.41421356)
            return 1;
    }
    return 0;
#endif
    /* Corners straddle the frustum with none inside: conservatively accept. */
    return 1;
}
/* Convert packed BGRA pixels to UYVY (YCbCr 4:2:2) using AltiVec.
 * Processes one 16-byte input vector (4 BGRA pixels) per loop count unit;
 * four vectors (16 pixels) are consumed per iteration and two UYVY output
 * vectors are produced. BGRA_size is the input size in bytes.
 *
 * NOTE(review): several inline comments below are mutually inconsistent —
 * vPerm1 is labelled B..G while the tr0/tb0 comments label the same data
 * R../B.., and the vec_splat element indices (2,1,0) do not match the
 * textual order of the vConst literals. The code is kept byte-identical;
 * verify channel order and element ordering on the target before editing.
 * NOTE(review): the `static_cast<vector ...>( a, b, ... )` vector-literal
 * spelling is old Apple/CodeWarrior syntax, not standard C++. */
void BGRA_to_YCbCr_altivec(const unsigned char *bgradata, size_t BGRA_size, unsigned char *pixels)
{
  vector signed short r0, r1, r2, g0, g1, g2, b0, b1, b2, c0, c16, c128;
  vector unsigned char z0, tc0, tc1, tc2, tc3;
  vector signed short tr0, tr1, tg0, tg1, tb0, tb1;
  vector signed short t0, t1, t2, t3, t4, t5;
  vector signed short u1, u2, uAvg, v1, v2, vAvg, out1, out2, out3, out4, uv1, uv2;
  unsigned int i;
  const vector unsigned char *BGRA_ptr = reinterpret_cast<const vector unsigned char*>( bgradata);
  vector unsigned char *UYVY_ptr = reinterpret_cast<vector unsigned char*>( pixels);

  /* Permutation vector is used to extract the interleaved BGRA. */
  vector unsigned char vPerm1 =
      static_cast<vector unsigned char>( 3, 7, 11, 15, 19, 23, 27, 31, /* B0..B7 */
                                         2, 6, 10, 14, 18, 22, 26, 30 /* G0..G7 */);
  vector unsigned char vPerm2 =
      static_cast<vector unsigned char>( 1, 5, 9, 13, 17, 21, 25, 29, /* R0..R7 */
                                         0, 0, 0, 0, 0, 0, 0, 0 /* don't care */);

  /* Load the equation constants (BT.601-style fixed-point coefficients,
     scaled for the vec_mradds 32768 divide). */
  vector signed short vConst1 =
      static_cast<vector signed short>( 8432, 16425, 3176, -4818, -9527, 14345, 0, 0 );
  vector signed short vConst2 =
      static_cast<vector signed short>( 14345, -12045, -2300, 16, 128, 0, 0, 0 );

  /* Permutes for pairwise chroma averaging (even/odd sample split). */
  vector unsigned char avgPerm1 =
      static_cast<vector unsigned char>( 0, 1, 4, 5, 8, 9, 12, 13,
                                         16, 17, 20, 21, 24, 25, 28, 29 );
  vector unsigned char avgPerm2 =
      static_cast<vector unsigned char>( 2, 3, 6, 7, 10, 11, 14, 15,
                                         18, 19, 22, 23, 26, 27, 30, 31 );
  /* Permutes for interleaving luma/chroma into UYVY order. */
  vector unsigned char Perm1 =
      static_cast<vector unsigned char>( 0, 1, 16, 17, 2, 3, 18, 19,
                                         4, 5, 20, 21, 6, 7, 22, 23 );
  vector unsigned char Perm2 =
      static_cast<vector unsigned char>( 8, 9, 24, 25, 10, 11, 26, 27,
                                         12, 13, 28, 29, 14, 15, 30, 31 );

  /* Splat each coefficient across a whole vector. */
  r0 = vec_splat( vConst1, 2 ); /* 8432 */
  g0 = vec_splat( vConst1, 1 ); /* 16425 */
  b0 = vec_splat( vConst1, 0 ); /* 3176 */
  r1 = vec_splat( vConst1, 5 ); /* -4818 */
  g1 = vec_splat( vConst1, 4 ); /* -9527 */
  b1 = vec_splat( vConst1, 3 ); /* 14345 */
  r2 = vec_splat( vConst2, 2 ); /* 14345 */
  g2 = vec_splat( vConst2, 1 ); /*-12045 */
  b2 = vec_splat( vConst2, 0 ); /* -2300 */
  c16 = vec_splat( vConst2, 3 ); /* 16 */
  c128 = vec_splat( vConst2, 4 ); /* 128 */
  c0 = static_cast<vector signed short> (0); /* 0 */
  z0 = static_cast<vector unsigned char> (0); /* 0 */

  for ( i = 0; i < (BGRA_size/sizeof(vector unsigned char)); i++ )
  {
    /* Load the 4 BGRA input vectors and separate into red, green and
       blue from the interleaved format. */
    const vector unsigned char *vec1 = BGRA_ptr++;
    const vector unsigned char *vec2 = BGRA_ptr++;
    const vector unsigned char *vec3 = BGRA_ptr++;
    const vector unsigned char *vec4 = BGRA_ptr++;

    tc0 = vec_perm( *vec1, *vec2, vPerm1 ); /* B0..B7 G0..G7 */
    tc1 = vec_perm( *vec1, *vec2, vPerm2 ); /* R0..R7 */
    tc2 = vec_perm( *vec3, *vec4, vPerm1 ); /* B8..B15 G8..G15 */
    tc3 = vec_perm( *vec3, *vec4, vPerm2 ); /* R8..R15 */

    /* Unpack to 16 bit arithmetic for conversion (zero-extend bytes). */
    tr0 = static_cast<vector signed short>(vec_mergeh( z0, tc0 )); /* tr0 = R0 .. R7 */
    tg0 = static_cast<vector signed short>(vec_mergel( z0, tc0 )); /* tg0 = G0 .. G7 */
    tb0 = static_cast<vector signed short>(vec_mergeh( z0, tc1 )); /* tb0 = B0 .. B7 */
    tr1 = static_cast<vector signed short>(vec_mergeh( z0, tc2 )); /* tr0 = R8 .. R15 */
    tg1 = static_cast<vector signed short>(vec_mergel( z0, tc2 )); /* tg0 = G8 .. G15 */
    tb1 = static_cast<vector signed short>(vec_mergeh( z0, tc3 )); /* tb0 = B8 .. B15 */

    /* Convert the first three input vectors. Note that vec_mradds keeps
       only the top 17 bits of the 32 bit product — the same as dividing
       by 32768. */
    t0 = vec_mradds( tr0, r0, c0 ); /* (R0 .. R7) * 8432 */
    t1 = vec_mradds( tr0, r1, c0 ); /* (R0 .. R7) * -4818 */
    t2 = vec_mradds( tr0, r2, c0 ); /* (R0 .. R7) * 14345 */
    t0 = vec_mradds( tg0, g0, t0 ); /* += (G0 .. G7) * 16425 */
    t1 = vec_mradds( tg0, g1, t1 ); /* += (G0 .. G7) * -9527 */
    t2 = vec_mradds( tg0, g2, t2 ); /* += (G0 .. G7) * -12045 */
    t0 = vec_mradds( tb0, b0, t0 ); /* += (B0 .. B7) * 3176 */
    t1 = vec_mradds( tb0, b1, t1 ); /* += (B0 .. B7) * 14345 */
    t2 = vec_mradds( tb0, b2, t2 ); /* += (B0 .. B7) * -2300 */

    /* Convert the next three input vectors. */
    t3 = vec_mradds( tr1, r0, c0 ); /* (R8 .. R15) * 8432 */
    t4 = vec_mradds( tr1, r1, c0 ); /* (R8 .. R15) * -4818 */
    t5 = vec_mradds( tr1, r2, c0 ); /* (R8 .. R15) * 14345 */
    t3 = vec_mradds( tg1, g0, t3 ); /* += (G8 .. G15) * 16425 */
    t4 = vec_mradds( tg1, g1, t4 ); /* += (G8 .. G15) * -9527 */
    t5 = vec_mradds( tg1, g2, t5 ); /* += (G8 .. G15) * -12045 */
    t3 = vec_mradds( tb1, b0, t3 ); /* += (B8 .. B15) * 3176 */
    t4 = vec_mradds( tb1, b1, t4 ); /* += (B8 .. B15) * 14345 */
    t5 = vec_mradds( tb1, b2, t5 ); /* += (B8 .. B15) * -2300 */

    /* Add the offsets (luma +16, chroma +128). */
    t0 = vec_adds( t0, c16 );
    t3 = vec_adds( t3, c16 );
    t1 = vec_adds( t1, c128 );
    t4 = vec_adds( t4, c128 );
    t2 = vec_adds( t2, c128 );
    t5 = vec_adds( t5, c128 );

    /* 4:2:2 subsampling: average each horizontal chroma pair. */
    u1 = vec_perm( t1, t4, avgPerm1 ); /* rearrange U's for averaging */
    u2 = vec_perm( t1, t4, avgPerm2 );
    uAvg = vec_avg( u1, u2 );
    v1 = vec_perm( t2, t5, avgPerm1 ); /* rearrange V's for averaging */
    v2 = vec_perm( t2, t5, avgPerm2 );
    vAvg = vec_avg( v1, v2 );

    /* Interleave chroma with luma into U Y V Y byte order. */
    uv1 = vec_perm( uAvg, vAvg, Perm1 );
    uv2 = vec_perm( uAvg, vAvg, Perm2 );
    out1 = vec_perm( uv1, t0, Perm1 );
    out2 = vec_perm( uv1, t0, Perm2 );
    out3 = vec_perm( uv2, t3, Perm1 );
    out4 = vec_perm( uv2, t3, Perm2 );

    /* Saturating pack down to bytes and store. */
    *UYVY_ptr = vec_packsu( out1, out2 ); /* pack down to char's */
    UYVY_ptr++;
    *UYVY_ptr = vec_packsu( out3, out4 );
    UYVY_ptr++;
  }
}