Ejemplo n.º 1
0
/*
 * Compute the filtered p1 sample for every lane:
 *   newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1 - tc0, p1 + tc0)
 *
 * p0, p1, p2, q0: edge samples; tc0: clipping strength per lane.
 * Returns the clipped replacement for p1.
 */
static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
                                       register vec_u8_t p1,
                                       register vec_u8_t p2,
                                       register vec_u8_t q0,
                                       register vec_u8_t tc0) {
    /* (p0 + q0 + 1) >> 1 */
    const vec_u8_t pq_avg = vec_avg(p0, q0);

    /* vec_avg rounds up, but the outer shift must truncate: the low bit of
       (p2 ^ pq_avg) is exactly the rounding excess, so subtract it. */
    const vec_u8_t round_bit = vec_and(vec_xor(pq_avg, p2), vec_splat_u8(1));
    const vec_u8_t filtered  = vec_subs(vec_avg(pq_avg, p2), round_bit);

    /* Clip window around p1 (saturating add/subtract). */
    const vec_u8_t upper = vec_adds(p1, tc0);
    const vec_u8_t lower = vec_subs(p1, tc0);

    return vec_min(upper, vec_max(lower, filtered));
}
Ejemplo n.º 2
0
/*
 * H.264 deblocking helper: filtered value for p1, per byte lane.
 *   newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1 - tc0, p1 + tc0)
 */
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
                                                   register vector unsigned char p1,
                                                   register vector unsigned char p2,
                                                   register vector unsigned char q0,
                                                   register vector unsigned char tc0) {

    /* Two nested rounding averages: avg(p2, avg(p0, q0)). */
    register vector unsigned char avg_p0q0 = vec_avg(p0, q0);
    register vector unsigned char avg_all  = vec_avg(avg_p0q0, p2);

    /* (p2 ^ avg(p0, q0)) & 1 is set where the outer average rounded up;
       removing it yields (p2 + ((p0 + q0 + 1) >> 1)) >> 1. */
    register vector unsigned char lsb   = vec_and(vec_xor(avg_p0q0, p2),
                                                  vec_splat_u8(1));
    register vector unsigned char value = vec_subs(avg_all, lsb);

    /* Saturating bounds p1 - tc0 and p1 + tc0. */
    register vector unsigned char lo = vec_subs(p1, tc0);
    register vector unsigned char hi = vec_adds(p1, tc0);

    value = vec_max(lo, value);
    value = vec_min(hi, value);
    return value;
}
Ejemplo n.º 3
0
/*
 * Average an 8-pixel-wide source column into block, h rows.
 * Assumes (line_size % 8) == 0 and that block is 8-byte aligned.
 */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        /* An 8-byte aligned block sits either in the left half of its
           16-byte line (low address bits zero) or in the right half. */
        const int in_right_half = ((unsigned long)block & 0x0000000F) != 0;

        register vector unsigned char dstv = vec_ld(0, block);
        register vector unsigned char lo   = vec_ld( 0, pixels);
        register vector unsigned char hi   = vec_ld(16, pixels);
        /* Realign the (possibly unaligned) source row. */
        register vector unsigned char srcv = vec_perm(lo, hi, vec_lvsl(0, pixels));

        /* Place the 8 source bytes over the half owned by block; the other
           half keeps the destination's own bytes, so averaging leaves it
           unchanged. */
        if (!in_right_half) {
            srcv = vec_perm(dstv, srcv, vcprm(s0,s1,2,3));
        } else {
            srcv = vec_perm(dstv, srcv, vcprm(0,1,s0,s1));
        }

        vec_st(vec_avg(dstv, srcv), 0, block);

        block  += line_size;
        pixels += line_size;
    }
}
Ejemplo n.º 4
0
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
    
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        
        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
        
    }
    
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;    
}
Ejemplo n.º 5
0
/*
 * For each of h rows: dst[0..15] = avg(dst[0..15], avg(src1 row, src2 row)).
 *
 * src1 rows are src_stride1 bytes apart; src2 rows are read 16 bytes apart
 * (i * 16); dst rows are dst_stride bytes apart. None of the three pointers
 * needs to be 16-byte aligned: loads are realigned with vec_lvsl/vec_perm
 * and the store is a read-modify-write of the two aligned vectors that
 * overlap dst.
 */
static inline void avg_pixels16_l2_altivec( uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride,
        int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    /* src2 advances by exactly 16 bytes per row, so its alignment (and thus
       its realignment mask) is the same for every row. */
    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++)
    {

        /* a = 16 bytes of src1 row i; the mask is recomputed per row because
           src_stride1 is arbitrary. */
        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);

        a = vec_perm(tmp1, tmp2, mask);

        /* b = 16 bytes of src2 row i */
        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);

        b = vec_perm(tmp1, tmp2, mask_);

        /* Load the two aligned vectors covering dst[0..15] */
        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        /* d = avg(current dst contents, avg(a, b)) */
        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));

        /* edges = the bytes of tmp1/tmp2 that lie outside dst[0..15];
           they must be written back unchanged. */
        edges = vec_perm(tmp2, tmp1, mask);

        align = vec_lvsr(0, dst);

        /* Rotate d back into memory position and splice in the edges. */
        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        /* Order matters: when dst is 16-byte aligned both stores hit the
           same vector, and the second one (tmp1, which then equals d) must
           be the one that lands last. */
        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0 , dst);

        dst += dst_stride;
    }
}
Ejemplo n.º 6
0
/*
 * Self-check: each vector operation on the file-scope inputs must match
 * its precomputed reference in every element; abort() on any mismatch.
 * (k, h, a1, a2, f1, f2 and the add/avg reference vectors are defined
 * outside this function.)
 */
int main ()
{
  /* a1 + a2 against addi */
  k = vec_add (a1, a2);
  if (vec_all_eq (addi, k) == 0)
    {
      abort ();
    }

  /* avg (a1, a2) against avgi */
  k = vec_avg (a1, a2);
  if (vec_all_eq (k, avgi) == 0)
    {
      abort ();
    }

  /* f1 + f2 against addf */
  h = vec_add (f1, f2);
  if (vec_all_eq (h, addf) == 0)
    {
      abort ();
    }

  return 0;
}
Ejemplo n.º 7
0
/*
 * block[0..15] = avg(block[0..15], pixels[0..15]) for h rows, both pointers
 * advancing by line_size per row.
 *
 * block is accessed with aligned vec_ld/vec_st; pixels may be unaligned.
 * The realignment permute is derived once from the initial pixels pointer
 * and reused for every row.
 */
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register const vector unsigned char realign = vec_lvsl(0, pixels);
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        register vector unsigned char srcv =
            vec_perm(vec_ld( 0, pixels), vec_ld(16, pixels), realign);
        register vector unsigned char dstv = vec_ld(0, block);

        vec_st(vec_avg(dstv, srcv), 0, (unsigned char*)block);

        pixels += line_size;
        block  += line_size;
    }
}
Ejemplo n.º 8
0
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]	pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
        
        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        
        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Ejemplo n.º 9
0
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
        pix1v  = vec_ld( 0, pix1);
        pix2l  = vec_ld( 0, pix2);
        pix2r  = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Ejemplo n.º 10
0
/*
 * For each of h rows: dst[0..15] = avg(src1 row, src2 row).
 *
 * src1 rows are src_stride1 bytes apart, src2 rows are read 16 bytes apart
 * (i * 16), dst rows are dst_stride bytes apart.
 */
static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                    const uint8_t * src2, int dst_stride,
                                    int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, mask_;
#if HAVE_BIGENDIAN
    /* NOTE(review): tmp1/tmp2/mask/edges/align are never named in this
       function body; presumably unaligned_load / load_with_perm_vec /
       put_unligned_store are macros that use them as scratch variables on
       big-endian builds. Do not rename or remove without checking the
       macro definitions. */
    vec_u8 tmp1, tmp2, mask, edges, align;
    /* src2 advances by exactly 16 bytes per row, so one realignment mask
       serves every row. */
    mask_ = vec_lvsl(0, src2);
#endif

    /* NOTE(review): mask_ is only initialised when HAVE_BIGENDIAN is set;
       presumably load_with_perm_vec ignores its mask argument otherwise.
       Confirm. */
    for (i = 0; i < h; i++) {
        a = unaligned_load(i * src_stride1, src1);
        b = load_with_perm_vec(i * 16, src2, mask_);
        d = vec_avg(a, b);
        put_unligned_store(d, dst);
        dst += dst_stride;
    }
}
Ejemplo n.º 11
0
/*
 * For each of h rows, combine avg(src1 row, src2 row) into dst through
 * avg_unaligned_store_with_mask_align (by its name, an averaging store).
 *
 * src1 rows are src_stride1 bytes apart, src2 rows are read 16 bytes apart
 * (i * 16), dst rows are dst_stride bytes apart.
 */
static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                    const uint8_t * src2, int dst_stride,
                                    int src_stride1, int h)
{
    int i;
    vec_u8 a, b, d, mask1, mask2;

#if HAVE_BIGENDIAN
    /* NOTE(review): tmp1, tmp2 and edges are never named in this function
       body; presumably the load/store helper macros use them as scratch
       variables. Do not rename without checking the macro definitions. */
    vec_u8 tmp1, tmp2, mask, edges, align;
    /* All alignment vectors are derived once from the initial pointers and
       reused for every row. NOTE(review): for src1 and dst this is only
       valid if src_stride1 and dst_stride keep the 16-byte alignment phase
       constant (i.e. are multiples of 16). Confirm against callers. */
    mask1 = vec_lvsl(0, src1);
    mask2 = vec_lvsl(0, src2);
    mask  = vec_lvsl(0, dst);
    align = vec_lvsr(0, dst);
#endif

    /* NOTE(review): this repeats the mask1 assignment made above under
       HAVE_BIGENDIAN. Without HAVE_BIGENDIAN it is the only initialisation
       of mask1, while mask2 stays uninitialised and mask/align are not even
       declared, so the non-big-endian build relies on the helpers being
       macros that drop those arguments. Confirm. */
    mask1 = vec_lvsl(0, src1);
    for (i = 0; i < h; i++) {
        a = load_with_perm_vec(i * src_stride1, src1, mask1);
        b = load_with_perm_vec(i * 16, src2, mask2);
        d = vec_avg(a, b);
        avg_unaligned_store_with_mask_align(d, dst, mask, align);
        dst += dst_stride;
    }
}
Ejemplo n.º 12
0
/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
Ejemplo n.º 13
0
Archivo: merge.c Proyecto: CSRedRat/vlc
/**
 * Write the byte-wise average of two source buffers to a destination.
 *
 * @param _p_dest  destination buffer, i_bytes long (any alignment)
 * @param _p_s1    first source buffer, i_bytes long (any alignment)
 * @param _p_s2    second source buffer, i_bytes long (any alignment)
 * @param i_bytes  number of bytes to process; 0 is allowed
 *
 * Bytes up to the first 16-byte aligned destination address and the final
 * (< 16) bytes are processed in scalar C; everything in between is
 * processed 16 bytes at a time with AltiVec.
 *
 * NOTE(review): the scalar paths truncate ((a + b) >> 1) whereas vec_avg
 * rounds up ((a + b + 1) >> 1), so head/tail bytes can differ by one from
 * vector bytes. This is preserved from the original; confirm whether it is
 * intended.
 */
void MergeAltivec( void *_p_dest, const void *_p_s1,
                   const void *_p_s2, size_t i_bytes )
{
    uint8_t       *p_dest = (uint8_t *)_p_dest;
    const uint8_t *p_s1   = (const uint8_t *)_p_s1;
    const uint8_t *p_s2   = (const uint8_t *)_p_s2;
    uint8_t *const p_end  = p_dest + i_bytes;

    /* Use C until the first 16-bytes aligned destination pixel, but never
       run past the end of a buffer too short to reach that boundary. */
    while( p_dest < p_end && ( (uintptr_t)p_dest & 0xF ) )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }

    /* Vector section: only entered when at least one full 16-byte store
       remains, so no source vector is loaded needlessly. */
    if( (size_t)( p_end - p_dest ) >= 16 )
    {
        if( ( (uintptr_t)p_s1 & 0xF ) | ( (uintptr_t)p_s2 & 0xF ) )
        {
            /* Unaligned source: each iteration loads the next aligned
               vector of each source and combines it with the previous one
               (carried over in s1oldv/s2oldv) to extract 16 source bytes. */
            vector unsigned char s1v, s2v, destv;
            vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
            vector unsigned char perm1v, perm2v;

            perm1v = vec_lvsl( 0, p_s1 );
            perm2v = vec_lvsl( 0, p_s2 );
            s1oldv = vec_ld( 0, p_s1 );
            s2oldv = vec_ld( 0, p_s2 );

            while( (size_t)( p_end - p_dest ) >= 16 )
            {
                s1newv = vec_ld( 16, p_s1 );
                s2newv = vec_ld( 16, p_s2 );
                s1v    = vec_perm( s1oldv, s1newv, perm1v );
                s2v    = vec_perm( s2oldv, s2newv, perm2v );
                s1oldv = s1newv;
                s2oldv = s2newv;
                destv  = vec_avg( s1v, s2v );
                vec_st( destv, 0, p_dest );

                p_s1   += 16;
                p_s2   += 16;
                p_dest += 16;
            }
        }
        else
        {
            /* Aligned source */
            vector unsigned char s1v, s2v, destv;

            while( (size_t)( p_end - p_dest ) >= 16 )
            {
                s1v   = vec_ld( 0, p_s1 );
                s2v   = vec_ld( 0, p_s2 );
                destv = vec_avg( s1v, s2v );
                vec_st( destv, 0, p_dest );

                p_s1   += 16;
                p_s2   += 16;
                p_dest += 16;
            }
        }
    }

    /* Remaining (< 16) bytes in C */
    while( p_dest < p_end )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
    }
}
Ejemplo n.º 14
0
/* Test a bounding box for intersection with the view frustum.
 * Return value:  0 = reject
 *                1 = accept
 *                2 = trivially accept (entirely inside the frustum)
 */
int cliptest_bboxf(bboxf_t bv)
{
    /* For each of the eight box corners, the indices into bv that supply
     * its x, y and z coordinates. */
    static const int corner_index[8][3] =
    {
        {0, 1, 2}, {3, 1, 2}, {3, 4, 2}, {0, 4, 2},
        {0, 1, 5}, {3, 1, 5}, {3, 4, 5}, {0, 4, 5}
    };

    vec4_t corner[8];
    int code;
    int any_bits = 0;       /* OR of all corner clip codes  */
    int common_bits = 0xff; /* AND of all corner clip codes */
    int any_inside = 0;     /* set once a corner has clip code 0 */
    int c;

    /* A box containing the eye point is always accepted. */
    if (point_test_bboxf(bv, camera.position))
        return 1;

    for (c = 0; c < 8; c++)
    {
        const int *idx = corner_index[c];

        corner[c][0] = bv[idx[0]];
        corner[c][1] = bv[idx[1]];
        corner[c][2] = bv[idx[2]];
        corner[c][3] = 1.0;

        /* Transform the corner in place by clipmat and classify it. */
        mat4_vmult(clipmat, corner[c], corner[c]);
        code = cliptest_point(corner[c]);

        any_bits |= code;
        common_bits &= code;
        if (code == 0)
            any_inside = 1;
    }

    /* Trivial rejection: every corner shares at least one clip-code bit. */
    if (common_bits != 0)
        return 0;
    /* Trivial acceptance: no corner has any clip-code bit set. */
    if (any_bits == 0)
        return 2;
    /* At least one corner in the view frustum. */
    if (any_inside)
        return 1;

#if 0
    /* FIXME: need something better for this. */
    /* Maybe find maximum radius to each corner */
    {
        /* Normalize coordinates */
        vec3_t center, rad;
        float cw;

        cw = 1.0f/corner[0][3];
        vec_scale(corner[0], cw, corner[0]);
        corner[0][3] = 1.0;
        cw = 1.0f/corner[6][3];
        vec_scale(corner[6], cw, corner[6]);
        corner[6][3] = 1.0;

        /* Check for non-trivial acceptance */
        vec_avg(corner[0], corner[6], center);
        vec_sub(corner[0], center, rad);
        if (sqrt(vec_dot(center, center)) -
            sqrt(vec_dot(rad, rad)) <= 1.41421356)
            return 1;
    }

    return 0;
#endif
    /* Undecided boxes are conservatively accepted. */
    return 1;
}
Ejemplo n.º 15
0
/*
 * Convert interleaved 4-byte-per-pixel colour data to packed chroma/luma
 * output using AltiVec.
 *
 * Each loop iteration reads four input vectors (64 bytes = 16 pixels) from
 * bgradata and writes two output vectors (32 bytes) to pixels. Horizontally
 * adjacent chroma values are averaged pairwise, and the output interleaves
 * one averaged chroma value with each luma value (the output pointer is
 * named UYVY_ptr).
 *
 * NOTE(review): the loop runs BGRA_size / 16 times while consuming 64 input
 * bytes per iteration, so BGRA_size appears to be a pixel count rather than
 * a byte count. Confirm against callers.
 * NOTE(review): both pointers are dereferenced as vector pointers, so they
 * are presumably required to be 16-byte aligned. Confirm.
 * NOTE(review): static_cast<vector T>( a, b, ... ) is not the parenthesised
 * AltiVec vector-literal form; in standard C++ the argument is a comma
 * expression. The lane values documented below assume the lists are taken
 * as per-lane initialisers. Confirm with the target compiler.
 */
void BGRA_to_YCbCr_altivec(const unsigned char *bgradata, size_t BGRA_size,
                           unsigned char *pixels)
{
  vector signed short  r0, r1, r2, g0, g1, g2, b0, b1, b2, c0, c16, c128;
  vector unsigned char z0, tc0, tc1, tc2, tc3;
  vector signed short tr0, tr1, tg0, tg1, tb0, tb1;
  vector signed short t0, t1, t2, t3, t4, t5;
  vector signed short u1, u2, uAvg, v1, v2, vAvg, out1, out2, out3, out4, uv1, uv2;
  unsigned int i;

  const vector unsigned char	*BGRA_ptr = reinterpret_cast<const vector unsigned char*>( bgradata);
  vector unsigned char	*UYVY_ptr = reinterpret_cast<vector unsigned char*>( pixels);

  /* Permutation vectors used to extract the interleaved channels:
     vPerm1 gathers byte 3 of every pixel into lanes 0..7 and byte 2 into
     lanes 8..15; vPerm2 gathers byte 1 into lanes 0..7. */
  vector unsigned char vPerm1 =
    static_cast<vector unsigned char>( 3,  7, 11, 15, 19, 23, 27, 31, // B0..B7
                            2,  6, 10, 14, 18, 22, 26, 30  /* G0..G7    */);
  vector unsigned char vPerm2 =
    static_cast<vector unsigned char>( 1,  5,  9, 13, 17, 21, 25, 29, /* R0..R7    */
                            0,  0,  0,  0,  0,  0,  0,  0  /* dont care */);

  /* Load the equation constants (fixed-point coefficients; the products
     below are scaled down by 32768). */
  vector signed short vConst1 =
    static_cast<vector signed short>( 8432,  16425,  3176,
                           -4818,  -9527, 14345,
                           0,      0 );
  vector signed short vConst2 =
    static_cast<vector signed short>( 14345, -12045, -2300,
                           16, 128, 0, 0, 0 );

  /* avgPerm1 / avgPerm2 pick the even / odd 16-bit lanes of two vectors so
     that neighbouring chroma samples can be averaged pairwise. */
  vector unsigned char avgPerm1 =
    static_cast<vector unsigned char>(  0,  1,  4,  5,  8,  9, 12, 13,
                             16, 17, 20, 21, 24, 25, 28, 29 );
  vector unsigned char avgPerm2 =
    static_cast<vector unsigned char>(  2,  3,  6,  7, 10, 11, 14, 15,
                             18, 19, 22, 23, 26, 27, 30, 31 );
  /* Perm1 / Perm2 interleave the 16-bit lanes of two vectors: Perm1 takes
     lanes 0..3 of each operand, Perm2 takes lanes 4..7. */
  vector unsigned char Perm1 =
    static_cast<vector unsigned char>( 0, 1, 16, 17, 2, 3, 18, 19,
                            4, 5, 20, 21, 6, 7, 22, 23 );
  vector unsigned char Perm2 =
    static_cast<vector unsigned char>(  8,  9, 24, 25, 10, 11, 26, 27,
                             12, 13, 28, 29, 14, 15, 30, 31 );

  /* Broadcast each coefficient. The r/g/b prefix names the variable the
     coefficient is multiplied with (tr, tg, tb); the value shown is the one
     selected by the splat index from vConst1 / vConst2 above. */
  r0 = vec_splat( vConst1, 2 ); /*  3176 */
  g0 = vec_splat( vConst1, 1 ); /* 16425 */
  b0 = vec_splat( vConst1, 0 ); /*  8432 */
  r1 = vec_splat( vConst1, 5 ); /* 14345 */
  g1 = vec_splat( vConst1, 4 ); /* -9527 */
  b1 = vec_splat( vConst1, 3 ); /* -4818 */
  r2 = vec_splat( vConst2, 2 ); /* -2300 */
  g2 = vec_splat( vConst2, 1 ); /*-12045 */
  b2 = vec_splat( vConst2, 0 ); /* 14345 */
  c16  = vec_splat( vConst2, 3 ); /*  16 */
  c128 = vec_splat( vConst2, 4 ); /* 128 */
  c0 = static_cast<vector signed short> (0); /*   0 */
  z0 = static_cast<vector unsigned char> (0); /*  0 */

  for ( i = 0; i < (BGRA_size/sizeof(vector unsigned char)); i++ ) {

    /* Load the 4 input vectors and separate the three colour channels
       from the interleaved format. */
    const vector unsigned char *vec1 = BGRA_ptr++;
    const vector unsigned char *vec2 = BGRA_ptr++;
    const vector unsigned char *vec3 = BGRA_ptr++;
    const vector unsigned char *vec4 = BGRA_ptr++;

    tc0 = vec_perm( *vec1, *vec2, vPerm1 ); // B0..B7  G0..G7
    tc1 = vec_perm( *vec1, *vec2, vPerm2 ); // R0..R7
    tc2 = vec_perm( *vec3, *vec4, vPerm1 ); // B8..B15 G8..G15
    tc3 = vec_perm( *vec3, *vec4, vPerm2 ); // R8..R15

    /* Unpack to 16 bit arithmetic for conversion (merge with zero bytes).
       NOTE(review): despite their names, tr* receives the lanes that the
       permute comments above label B, and tb* those labelled R; the
       coefficient splats are swapped the same way, so each channel still
       meets its own coefficient. */
    tr0 = static_cast<vector signed short>(vec_mergeh( z0, tc0 ));  /* tr0 = lanes 0..7  of tc0 (byte 3 of pixels 0..7)  */
    tg0 = static_cast<vector signed short>(vec_mergel( z0, tc0 ));  /* tg0 = lanes 8..15 of tc0 (byte 2 of pixels 0..7)  */
    tb0 = static_cast<vector signed short>(vec_mergeh( z0, tc1 ));  /* tb0 = lanes 0..7  of tc1 (byte 1 of pixels 0..7)  */
    tr1 = static_cast<vector signed short>(vec_mergeh( z0, tc2 ));  /* tr1 = lanes 0..7  of tc2 (byte 3 of pixels 8..15) */
    tg1 = static_cast<vector signed short>(vec_mergel( z0, tc2 ));  /* tg1 = lanes 8..15 of tc2 (byte 2 of pixels 8..15) */
    tb1 = static_cast<vector signed short>(vec_mergeh( z0, tc3 ));  /* tb1 = lanes 0..7  of tc3 (byte 1 of pixels 8..15) */

    /* Convert the first eight pixels.  Note that
       only the top 17 bits of the 32 bit product are
       stored.  This is the same as doing the divide by 32768. */

    t0 = vec_mradds( tr0, r0, c0 ); /* tr0 * r0 (luma term)      */
    t1 = vec_mradds( tr0, r1, c0 ); /* tr0 * r1 (1st chroma term) */
    t2 = vec_mradds( tr0, r2, c0 ); /* tr0 * r2 (2nd chroma term) */

    t0 = vec_mradds( tg0, g0, t0 ); /* += tg0 * g0 */
    t1 = vec_mradds( tg0, g1, t1 ); /* += tg0 * g1 */
    t2 = vec_mradds( tg0, g2, t2 ); /* += tg0 * g2 */

    t0 = vec_mradds( tb0, b0, t0 ); /* += tb0 * b0 */
    t1 = vec_mradds( tb0, b1, t1 ); /* += tb0 * b1 */
    t2 = vec_mradds( tb0, b2, t2 ); /* += tb0 * b2 */

    /* Convert the next eight pixels. */
    t3 = vec_mradds( tr1, r0, c0 ); /* tr1 * r0 (luma term)      */
    t4 = vec_mradds( tr1, r1, c0 ); /* tr1 * r1 (1st chroma term) */
    t5 = vec_mradds( tr1, r2, c0 ); /* tr1 * r2 (2nd chroma term) */

    t3 = vec_mradds( tg1, g0, t3 ); /* += tg1 * g0 */
    t4 = vec_mradds( tg1, g1, t4 ); /* += tg1 * g1 */
    t5 = vec_mradds( tg1, g2, t5 ); /* += tg1 * g2 */

    t3 = vec_mradds( tb1, b0, t3 ); /* += tb1 * b0 */
    t4 = vec_mradds( tb1, b1, t4 ); /* += tb1 * b1 */
    t5 = vec_mradds( tb1, b2, t5 ); /* += tb1 * b2 */

    /* Add the constants: +16 on luma, +128 on both chroma planes. */
    t0 = vec_adds( t0, c16 );
    t3 = vec_adds( t3, c16 );
    t1 = vec_adds( t1, c128 );
    t4 = vec_adds( t4, c128 );
    t2 = vec_adds( t2, c128 );
    t5 = vec_adds( t5, c128 );

    u1 = vec_perm( t1, t4, avgPerm1 ); // rearrange U's for averaging
    u2 = vec_perm( t1, t4, avgPerm2 );
    uAvg = vec_avg( u1, u2 );
    v1 = vec_perm( t2, t5, avgPerm1 ); // rearrange V's for averaging
    v2 = vec_perm( t2, t5, avgPerm2 );
    vAvg = vec_avg( v1, v2 );

    /* Interleave averaged U and V, then interleave that with luma. */
    uv1 = vec_perm( uAvg, vAvg, Perm1 );
    uv2 = vec_perm( uAvg, vAvg, Perm2 );
    out1 = vec_perm( uv1, t0, Perm1 );
    out2 = vec_perm( uv1, t0, Perm2 );
    out3 = vec_perm( uv2, t3, Perm1 );
    out4 = vec_perm( uv2, t3, Perm2 );

    *UYVY_ptr = vec_packsu( out1, out2 );	// pack down to char's (unsigned saturation)
    UYVY_ptr++;
    *UYVY_ptr = vec_packsu( out3, out4 );
    UYVY_ptr++;
  }
}