Example #1
void x264_sub8x8_dct_dc_altivec( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
{
    vec_s16_t diff[2];
    vec_s32_t sum[2];
    vec_s32_t zero32 = vec_splat_s32(0);
    vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };

    pix_diff( &pix1[0], &pix2[0], diff, 0 );
    pix_diff( &pix1[4*FENC_STRIDE], &pix2[4*FDEC_STRIDE], diff, 1 );

    sum[0] = vec_sum4s( diff[0], zero32 );
    sum[1] = vec_sum4s( diff[1], zero32 );
    diff[0] = vec_packs( sum[0], sum[1] );
    sum[0] = vec_sum4s( diff[0], zero32 );
    diff[0] = vec_packs( sum[0], zero32 );

    diff[1] = vec_vsx_ld( 0, dct );
    diff[0] = vec_perm( diff[0], diff[1], mask );

    vec_vsx_st( diff[0], 0, dct );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    dct[0] = d0 + d1;
    dct[1] = d0 - d1;
    dct[2] = d2 + d3;
    dct[3] = d2 - d3;
}
Example #2
static int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
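
For comparison, here is a scalar rendering of the same computation. This is a sketch for reference, not FFmpeg's actual C fallback; it sums the same 16x16 block that the AltiVec routine accumulates four lanes at a time with vec_sum4s and folds with vec_sums.

static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0;

    for (int i = 0; i < 16; i++) {
        /* add one 16-pixel row to the running sum */
        for (int j = 0; j < 16; j++)
            s += pix[j];
        pix += line_size;
    }

    return s;
}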
Example #3
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read each
       time around the loop.
       Read unaligned pixels into our vector:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
    
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        
        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
        
    }
    
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;    
}
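
In scalar form the vertical half-pel averaging becomes explicit. This is a sketch for reference, not FFmpeg's C implementation; note that vec_avg rounds up, i.e. computes (a + b + 1) >> 1, which the scalar code mirrors (abs() as in <stdlib.h>).

static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s = 0;

    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++) {
            /* vertical half-pel average with round-up, like vec_avg */
            int avg = (pix2[j] + pix2[j + line_size] + 1) >> 1;
            s += abs(pix1[j] - avg);
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    return s;
}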
Example #4
inline int v_signmask(const v_int16x8& a)
{
    static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
    vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
    sv = vec_sl(sv, slm);
    vec_int4 svi = vec_int4_z;
    svi = vec_sums(vec_sum4s(sv, svi), svi);
    return vec_extract(svi, 3);
}
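
A scalar sketch of the same sign-mask gather (a hypothetical helper, not part of OpenCV's API): bit i of the result is the sign bit of lane i, which is exactly what the shift-by-lane-index and horizontal sum above compute.

static inline int signmask_s16_scalar(const short a[8])
{
    int mask = 0;

    for (int i = 0; i < 8; i++)
        mask |= ((a[i] >> 15) & 1) << i;  /* sign bit of lane i -> bit i */

    return mask;
}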
Example #5
uint32_t
quant_h263_inter_altivec_c(int16_t *coeff,
                            int16_t *data,
                            const uint32_t quant,
                            const uint16_t *mpeg_quant_matrices)
{
    vector unsigned char zerovec;
    vector unsigned short mult;
    vector unsigned short quant_m_2;
    vector unsigned short quant_d_2;
    vector unsigned short sum_short;
    vector signed short acLevel;
    
    vector unsigned int even;
    vector unsigned int odd;
    
    vector bool short m2_mask;
    vector bool short zero_mask;
    
    uint32_t result;

#ifdef DEBUG
    if(((unsigned)coeff) & 0xf) /* check 16-byte alignment */
        fprintf(stderr, "quant_h263_inter_altivec_c:incorrect align, coeff: %lx\n", (long)coeff);
#endif
    
    /* initialisation stuff */
    zerovec = vec_splat_u8(0);
    *((unsigned short*)&mult) = (unsigned short)multipliers[quant];
    mult = vec_splat(mult, 0);
    *((unsigned short*)&quant_m_2) = (unsigned short)quant;
    quant_m_2 = vec_splat(quant_m_2, 0);
    quant_m_2 = vec_sl(quant_m_2, vec_splat_u16(1));
    *((unsigned short*)&quant_d_2) = (unsigned short)quant;
    quant_d_2 = vec_splat(quant_d_2, 0);
    quant_d_2 = vec_sr(quant_d_2, vec_splat_u16(1));
    sum_short = (vector unsigned short)zerovec;
    
    /* Quantize */
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
        
    /* Calculate the return value */
    even = (vector unsigned int)vec_sum4s((vector signed short)sum_short, (vector signed int)zerovec);
    even = (vector unsigned int)vec_sums((vector signed int)even, (vector signed int)zerovec);
    even = vec_splat(even, 3);
    vec_ste(even, 0, &result);
    return result;
}
Example #6
/** Mask **/
inline int v_signmask(const v_uint8x16& a)
{
    vec_uchar16 sv  = vec_sr(a.val, vec_uchar16_sp(7));
    static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
    sv = vec_sl(sv, slm);
    vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
    static const vec_uint4 slm4 = {0, 0, 8, 8};
    sv4 = vec_sl(sv4, slm4);
    return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
}
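
The {0, 0, 8, 8} shift is needed because slm repeats 0-7 for both byte halves: after vec_sum4s, lanes 2 and 3 hold mask bits 8-15 shifted down by 8, so they are moved back up before the final vec_sums. A scalar sketch of the end result (a hypothetical helper, not part of OpenCV's API):

static inline int signmask_u8_scalar(const unsigned char a[16])
{
    int mask = 0;

    for (int i = 0; i < 16; i++)
        mask |= (a[i] >> 7) << i;  /* MSB of byte i -> bit i */

    return mask;
}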
Example #7
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]	pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
        
        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        
        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Example #8
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<8;i++) {
	/* Read potentially unaligned pixels into t1 and t2
	   Since we're reading 16 pixels, and actually only want 8,
	   mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

	/* Calculate a sum of abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

	/* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Example #9
static unsigned reg_sad_altivec(const kvz_pixel * const data1, const kvz_pixel * const data2,
                        const int width, const int height, const unsigned stride1, const unsigned stride2)
{
  vector unsigned int vsad = {0,0,0,0}, vzero = {0,0,0,0}; 
  vector signed int sumdiffs;
  int tmpsad, sad = 0;
  
  int y, x;
  
  for (y = 0; y < height; ++y) {
    vector unsigned char perm1, perm2;
    
    perm1 = vec_lvsl(0, &data1[y * stride1]);
    perm2 = vec_lvsl(0, &data2[y * stride2]);
    
    for (x = 0; x <= width-16; x+=16) {
      vector unsigned char t1, t2, t3, t4, t5;
      vector unsigned char *current, *previous;
      
      current = (vector unsigned char *) &data1[y * stride1 + x];
      previous = (vector unsigned char *) &data2[y * stride2 + x];
      
      t1  = vec_perm(current[0], current[1], perm1 );  /* align current vector  */ 
      t2  = vec_perm(previous[0], previous[1], perm2 );/* align previous vector */ 
      t3  = vec_max(t1, t2 );      /* find largest of two           */ 
      t4  = vec_min(t1, t2 );      /* find smaller of two           */ 
      t5  = vec_sub(t3, t4);       /* find absolute difference      */ 
      vsad = vec_sum4s(t5, vsad);    /* accumulate sum of differences */
    }

    for (; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }
  
  sumdiffs = vec_sums((vector signed int) vsad, (vector signed int) vzero);
  /* copy vector sum into unaligned result */
  sumdiffs = vec_splat( sumdiffs, 3);
  vec_ste( sumdiffs, 0, &tmpsad );
  sad += tmpsad;
  
  return sad;
}
Example #10
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
        pix1v  = vec_ld( 0, pix1);
        pix2l  = vec_ld( 0, pix2);
        pix2r  = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Example #11
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    
    sad = (vector unsigned int)vec_splat_u32(0);


    for(i=0;i<16;i++) {
	/* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);
       
	/* Calculate a sum of abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);
	
	/* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    
    return s;
}
Example #12
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);


    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Example #13
int bsad_altivec(BSAD_PDECL)
{
    int i;
    uint8_t *pfy, *pby;
    vector unsigned char l0, l1, lR;
    vector unsigned char permF0, permF1, permB0, permB1;
    vector unsigned char vf, vfa, vfb, vfc;
    vector unsigned char vb, vba, vbb, vbc;
    vector unsigned short tH, tL, fH, fL, bH, bL;
    vector unsigned char zero;
    vector unsigned short one, two;
    vector unsigned char max, min, dif;
    vector unsigned int sum;
    union {
	vector signed int v;
	struct {
	    signed int pad[3];
	    signed int sum;
	} s;
    } vo;



#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(p2))
	mjpeg_error_exit1("bsad: p2 %% 16 != 0, (0x%X)", p2);
    if (NOT_VECTOR_ALIGNED(rowstride))
	mjpeg_error_exit1("bsad: rowstride %% 16 != 0, (%d)", rowstride);

    if (hxf != 0 && hxf != 1)
	mjpeg_error_exit1("bsad: hxf != [0|1], (hxf=%d)", hxf);
    if (hyf != 0 && hyf != 1)
	mjpeg_error_exit1("bsad: hyf != [0|1], (hyf=%d)", hyf);
    if (hxb != 0 && hxb != 1)
	mjpeg_error_exit1("bsad: hxb != [0|1], (hxb=%d)", hxb);
    if (hyb != 0 && hyb != 1)
	mjpeg_error_exit1("bsad: hyb != [0|1], (hyb=%d)", hyb);
#endif

    if (h != 8 && h != 16)
	mjpeg_error_exit1("bsad: h != [8|16], (%d)", h);

    AMBER_START;


    /* start loading first set  */
    vfb = vec_ld(0, pf);	 /* use vfb & vfc as temp for vf & vfa */
    vfc = vec_ld(16, pf);

    pfy = pf + (rowstride * hyf);
    l0 = vec_ld(0, pfy);
    l1 = vec_ld(16, pfy);


    pby = pb + (rowstride * hyb);


    zero  = vec_splat_u8(0);
    one = vec_splat_u16(1);
    two = vec_splat_u16(2);

    sum = vec_splat_u32(0);


    permF0 = vec_lvsl(0, pf);
    permF1 = vec_lvsl(hxf, (unsigned char*)0);
    permF1 = vec_splat(permF1, 0);
    permF1 = vec_add(permF0, permF1);

    permB0 = vec_lvsl(0, pb);
    permB1 = vec_lvsl(hxb, (unsigned char*)0);
    permB1 = vec_splat(permB1, 0);
    permB1 = vec_add(permB0, permB1);


    i = h - 1;
    do { /* while (--i) */

	vf = vec_perm(vfb, vfc, permF0);
	vfa = vec_perm(vfb, vfc, permF1);
	vfb = vec_perm(l0, l1, permF0);
	vfc = vec_perm(l0, l1, permF1);

	vbb = vec_ld(0, pb);	 /* use vbb & vbc as temp for vb & vba */
	vbc = vec_ld(16, pb);
	l0 = vec_ld(0, pby);
	l1 = vec_ld(16, pby);

	pb += rowstride;
	pby += rowstride;

	/* (unsigned short[]) pf[0-7] */
	fH = vu16(vec_mergeh(zero, vf));

	/* (unsigned short[]) pf[8-15] */
	fL = vu16(vec_mergel(zero, vf));

	/* (unsigned short[]) pfa[0-7] */
	tH = vu16(vec_mergeh(zero, vfa));

	/* (unsigned short[]) pfa[8-15] */
	tL = vu16(vec_mergel(zero, vfa));

	/* pf[i] + pfa[i] */
	fH = vec_add(fH, tH);
	fL = vec_add(fL, tL);

	/* (unsigned short[]) pfb[0-7] */
	tH = vu16(vec_mergeh(zero, vfb));

	/* (unsigned short[]) pfb[8-15] */
	tL = vu16(vec_mergel(zero, vfb));

	/* (pf[i]+pfa[i]) + pfb[i] */
	fH = vec_add(fH, tH);
	fL = vec_add(fL, tL);

	/* (unsigned short[]) pfc[0-7] */
	tH = vu16(vec_mergeh(zero, vfc));

	/* (unsigned short[]) pfc[8-15] */
	tL = vu16(vec_mergel(zero, vfc));

	/* (pf[i]+pfa[i]+pfb[i]) + pfc[i] */
	fH = vec_add(fH, tH);
	fL = vec_add(fL, tL);


	/* (pf[i]+pfa[i]+pfb[i]+pfc[i]) + 2 */
	fH = vec_add(fH, two);
	fL = vec_add(fL, two);

	/* (pf[i]+pfa[i]+pfb[i]+pfc[i]+2) >> 2 */
	fH = vec_sra(fH, two);
	fL = vec_sra(fL, two);


	lR = vec_ld(0, p2);
	p2 += rowstride;

	vb = vec_perm(vbb, vbc, permB0);
	vba = vec_perm(vbb, vbc, permB1);
	vbb = vec_perm(l0, l1, permB0);
	vbc = vec_perm(l0, l1, permB1);


	pf += rowstride;
	vfb = vec_ld(0, pf);	 /* use vfb & vfc as temp for vf & vfa */
	vfc = vec_ld(16, pf);
	pfy += rowstride;
	l0 = vec_ld(0, pfy);
	l1 = vec_ld(16, pfy);


	/* (unsigned short[]) pb[0-7] */
	bH = vu16(vec_mergeh(zero, vb));

	/* (unsigned short[]) pb[8-15] */
	bL = vu16(vec_mergel(zero, vb));

	/* (unsigned short[]) pba[0-7] */
	tH = vu16(vec_mergeh(zero, vba));

	/* (unsigned short[]) pba[8-15] */
	tL = vu16(vec_mergel(zero, vba));

	/* pb[i] + pba[i] */
	bH = vec_add(bH, tH);
	bL = vec_add(bL, tL);

	/* (unsigned short[]) pbb[0-7] */
	tH = vu16(vec_mergeh(zero, vbb));

	/* (unsigned short[]) pbb[8-15] */
	tL = vu16(vec_mergel(zero, vbb));

	/* (pb[i]+pba[i]) + pbb[i] */
	bH = vec_add(bH, tH);
	bL = vec_add(bL, tL);

	/* (unsigned short[]) pbc[0-7] */
	tH = vu16(vec_mergeh(zero, vbc));

	/* (unsigned short[]) pbc[8-15] */
	tL = vu16(vec_mergel(zero, vbc));

	/* (pb[i]+pba[i]+pbb[i]) + pbc[i] */
	bH = vec_add(bH, tH);
	bL = vec_add(bL, tL);


	/* (pb[i]+pba[i]+pbb[i]+pbc[i]) + 2 */
	bH = vec_add(bH, two);
	bL = vec_add(bL, two);

	/* (pb[i]+pba[i]+pbb[i]+pbc[i]+2) >> 2 */
	bH = vec_sra(bH, two);
	bL = vec_sra(bL, two);



	/* ((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
	 * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)
         */
	tH = vec_add(fH, bH);
	tL = vec_add(fL, bL);


	/* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
	 *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)) + 1
         */
	tH = vec_add(tH, one);
	tL = vec_add(tL, one);


	/* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
	 *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1) >> 1
         */
	tH = vec_sra(tH, one);
	tL = vec_sra(tL, one);


	/* d = abs( ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
	 *            ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i] )
         */
	tH = vu16(vec_packsu(tH, tL));
	min = vec_min(vu8(tH), lR);
	max = vec_max(vu8(tH), lR);
	dif = vec_sub(max, min);

	/* sum += d */
	sum = vec_sum4s(dif, sum);

    } while (--i);


    vf = vec_perm(vfb, vfc, permF0);
    vfa = vec_perm(vfb, vfc, permF1);
    vfb = vec_perm(l0, l1, permF0);
    vfc = vec_perm(l0, l1, permF1);

    vbb = vec_ld(0, pb);	 /* use vbb & vbc as temp for vb & vba */
    vbc = vec_ld(16, pb);
    l0 = vec_ld(0, pby);
    l1 = vec_ld(16, pby);


    /* (unsigned short[]) pf[0-7] */
    fH = vu16(vec_mergeh(zero, vf));

    /* (unsigned short[]) pf[8-15] */
    fL = vu16(vec_mergel(zero, vf));

    /* (unsigned short[]) pfa[0-7] */
    tH = vu16(vec_mergeh(zero, vfa));

    /* (unsigned short[]) pfa[8-15] */
    tL = vu16(vec_mergel(zero, vfa));

    /* pf[i] + pfa[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (unsigned short[]) pfb[0-7] */
    tH = vu16(vec_mergeh(zero, vfb));

    /* (unsigned short[]) pfb[8-15] */
    tL = vu16(vec_mergel(zero, vfb));

    /* (pf[i]+pfa[i]) + pfb[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (unsigned short[]) pfc[0-7] */
    tH = vu16(vec_mergeh(zero, vfc));

    /* (unsigned short[]) pfc[8-15] */
    tL = vu16(vec_mergel(zero, vfc));

    /* (pf[i]+pfa[i]+pfb[i]) + pfc[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);


    /* (pf[i]+pfa[i]+pfb[i]+pfc[i]) + 2 */
    fH = vec_add(fH, two);
    fL = vec_add(fL, two);

    /* (pf[i]+pfa[i]+pfb[i]+pfc[i]+2) >> 2 */
    fH = vec_sra(fH, two);
    fL = vec_sra(fL, two);


    lR = vec_ld(0, p2);


    vb = vec_perm(vbb, vbc, permB0);
    vba = vec_perm(vbb, vbc, permB1);
    vbb = vec_perm(l0, l1, permB0);
    vbc = vec_perm(l0, l1, permB1);


    /* (unsigned short[]) pb[0-7] */
    bH = vu16(vec_mergeh(zero, vb));

    /* (unsigned short[]) pb[8-15] */
    bL = vu16(vec_mergel(zero, vb));

    /* (unsigned short[]) pba[0-7] */
    tH = vu16(vec_mergeh(zero, vba));

    /* (unsigned short[]) pba[8-15] */
    tL = vu16(vec_mergel(zero, vba));

    /* pb[i] + pba[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (unsigned short[]) pbb[0-7] */
    tH = vu16(vec_mergeh(zero, vbb));

    /* (unsigned short[]) pbb[8-15] */
    tL = vu16(vec_mergel(zero, vbb));

    /* (pb[i]+pba[i]) + pbb[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (unsigned short[]) pbc[0-7] */
    tH = vu16(vec_mergeh(zero, vbc));

    /* (unsigned short[]) pbc[8-15] */
    tL = vu16(vec_mergel(zero, vbc));

    /* (pb[i]+pba[i]+pbb[i]) + pbc[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);


    /* (pb[i]+pba[i]+pbb[i]+pbc[i]) + 2 */
    bH = vec_add(bH, two);
    bL = vec_add(bL, two);

    /* (pb[i]+pba[i]+pbb[i]+pbc[i]+2) >> 2 */
    bH = vec_sra(bH, two);
    bL = vec_sra(bL, two);



    /* ((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
     * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)
     */
    tH = vec_add(fH, bH);
    tL = vec_add(fL, bL);


    /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)) + 1
     */
    tH = vec_add(tH, one);
    tL = vec_add(tL, one);


    /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1) >> 1
     */
    tH = vec_sra(tH, one);
    tL = vec_sra(tL, one);


    /* d = abs( ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *            ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i] )
     */
    tH = vu16(vec_packsu(tH, tL));
    min = vec_min(vu8(tH), lR);
    max = vec_max(vu8(tH), lR);
    dif = vec_sub(max, min);

    /* sum += d */
    sum = vec_sum4s(dif, sum);



    /* sum all parts of difference into one 32 bit quantity */
    vo.v = vec_sums(vs32(sum), vs32(zero));

    AMBER_STOP;

    return vo.s.sum;
}
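
Stripped of the unrolling, software pipelining and alignment handling, the code above evaluates the per-pixel expression spelled out in its comments. A scalar sketch, assuming BSAD_PDECL supplies the pf, pb, p2, rowstride, hxf, hyf, hxb, hyb and h names used in the verification block (the exact signature is not shown here):

static int bsad_scalar_sketch(uint8_t *pf, uint8_t *pb, uint8_t *p2,
                              int rowstride, int hxf, int hyf,
                              int hxb, int hyb, int h)
{
    int sum = 0;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            /* forward prediction, half-pel interpolated */
            int f = (pf[x] + pf[x + hxf] + pf[x + hyf * rowstride]
                   + pf[x + hxf + hyf * rowstride] + 2) >> 2;
            /* backward prediction, half-pel interpolated */
            int b = (pb[x] + pb[x + hxb] + pb[x + hyb * rowstride]
                   + pb[x + hxb + hyb * rowstride] + 2) >> 2;
            /* average both predictions, accumulate absolute difference */
            sum += abs(((f + b + 1) >> 1) - p2[x]);
        }
        pf += rowstride;
        pb += rowstride;
        p2 += rowstride;
    }

    return sum;
}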
Example #14
void iquant_intra_m2_altivec(IQUANT_INTRA_PDECL)
{
    int i;
    vector signed short vsrc;
    uint16_t *qmat;
    vector unsigned short vqmat;
    vector unsigned short vmquant;
    vector bool short ltzero;
    vector signed short val, t0;
    vector signed short zero;
    vector unsigned int four;
    vector signed short min, max;
    vector signed int vsum;
    int sum;
    int offset, offset2;
    int16_t dst0;
    union {
	vector unsigned short vu16;
	unsigned short mquant;
	vector signed int vs32;
	struct {
	    signed int pad[3];
	    signed int sum;
	} s;
    } vu;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY /* {{{ */
    if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat))
	mjpeg_error_exit1("iquant_intra_m2: wsp->intra_q_mat %% 16 != 0, (%d)",
	    wsp->intra_q_mat);

    if (NOT_VECTOR_ALIGNED(src))
	mjpeg_error_exit1("iquant_intra_m2: src %% 16 != 0, (%d)", src);

    if (NOT_VECTOR_ALIGNED(dst))
	mjpeg_error_exit1("iquant_intra_m2: dst %% 16 != 0, (%d)", dst);

    for (i = 0; i < 64; i++)
	if (src[i] < -256 || src[i] > 255)
	    mjpeg_error_exit1("iquant_intra_m2: -256 > src[%i] > 255, (%d)",
		i, src[i]);
#endif /* }}} */

    AMBER_START;

    dst0 = src[0] << (3 - dc_prec);

    qmat = (uint16_t*)wsp->intra_q_mat;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(64/8,1,0);
    vec_dst(src, dsc.control, 0);
    vec_dst(qmat, dsc.control, 1);
#endif

    /* vmquant = (vector unsigned short)(mquant); */
    vu.mquant = (unsigned short)mquant;
    vmquant = vec_splat(vu.vu16, 0);

    vsum = vec_splat_s32(0);
    zero = vec_splat_s16(0);
    four = vec_splat_u32(4);
    /* max = (2047); min = (-2048); {{{ */
    vu8(max) = vec_splat_u8(0x7);
    t0 = vec_splat_s16(-1); /* 0xffff */
    vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */
    min = vec_sub(t0, max);
    /* }}} */
    offset = 0;

#if 1
    vsrc = vec_ld(offset, (signed short*)src);
    vqmat = vec_ld(offset, (unsigned short*)qmat);
    i = (64/8) - 1;
    do {
	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	offset2 = offset;
	offset += 8*sizeof(int16_t);
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vsum = vec_sum4s(val, vsum);

	vec_st(val, offset2, dst);
    } while (--i);
    /* intra_q[i] * mquant */
    vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

    /* save sign */
    ltzero = vec_cmplt(vsrc, zero);

    /* val = abs(src) */
    t0 = vec_sub(zero, vsrc);
    val = vec_max(t0, vsrc);

    /* val = (src * quant) >> 4 */
    vs32(t0) = vec_mule(val, vs16(vqmat));
    vs32(val) = vec_mulo(val, vs16(vqmat));
    vs32(t0) = vec_sra(vs32(t0), four);
    vs16(t0) = vec_pack(vs32(t0), vs32(t0));
    vs32(val) = vec_sra(vs32(val), four);
    vs16(val) = vec_pack(vs32(val), vs32(val));
    val = vec_mergeh(vs16(t0), vs16(val));

    /* restore sign */
    t0 = vec_sub(zero, val);
    val = vec_sel(val, t0, ltzero);

    /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
    val = vec_min(val, max);
    val = vec_max(val, min);

    vsum = vec_sum4s(val, vsum);

    vec_st(val, offset, dst);
#else
    /* {{{ */
    i = (64/8);
    do {
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vsum = vec_sum4s(val, vsum);

	vec_st(val, offset, dst);

	offset += 8*sizeof(int16_t);
    } while (--i);
    /* }}} */
#endif

    vu.vs32 = vec_sums(vsum, vs32(zero));
    sum = vu.s.sum;
    sum -= dst[0];
    sum += dst0;
    dst[0] = dst0;

    /* mismatch control */
#if 1
    dst[63] ^= !(sum & 1);
#else
    if ((sum & 1) == 0)
	dst[63] ^= 1;
#endif

    AMBER_STOP;
}
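
Per coefficient, the vectorized loop implements the arithmetic below. This is a hedged scalar rendering of the comments above, not mjpegtools' actual C fallback; src, dst, qmat, mquant and dc_prec are the names already used in this function.

static void iquant_intra_m2_sketch(int16_t *src, int16_t *dst,
                                   uint16_t *qmat, int mquant, int dc_prec)
{
    int i, sum;

    dst[0] = src[0] << (3 - dc_prec);        /* DC coefficient */
    sum = dst[0];

    for (i = 1; i < 64; i++) {
        int val = (abs(src[i]) * (qmat[i] * mquant)) >> 4;
        if (src[i] < 0)
            val = -val;                      /* restore sign */
        if (val > 2047)  val = 2047;         /* clamp to [-2048, 2047] */
        if (val < -2048) val = -2048;
        dst[i] = val;
        sum += val;
    }

    /* mismatch control: force the coefficient sum odd */
    if (!(sum & 1))
        dst[63] ^= 1;
}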
Example #15
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld( 0, pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
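
vec_msum(pixv, pixv, sv) multiplies each byte by itself and adds four products per 32-bit lane, so the routine computes a sum of squares. A scalar sketch for reference, not FFmpeg's C version verbatim:

static int pix_norm1_c(uint8_t *pix, int line_size)
{
    int s = 0;

    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++)
            s += pix[j] * pix[j];   /* square and accumulate */
        pix += line_size;
    }

    return s;
}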
Example #16
static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts */
    pix2l  = vec_ld( 0, pix2);
    pix2r  = vec_ld(16, pix2);
    pix2v  = vec_perm(pix2l, pix2r, perm1);
    pix2iv = vec_perm(pix2l, pix2r, perm2);

    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16] */
        pix1v = vec_ld(0, pix1);

        pix2l  = vec_ld( 0, pix3);
        pix2r  = vec_ld(16, pix3);
        pix3v  = vec_perm(pix2l, pix2r, perm1);
        pix3iv = vec_perm(pix2l, pix2r, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand. */

        /* Split the pixel vectors into shorts */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
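
In scalar form the rounding issue the comment describes collapses into one expression. A reference sketch, not FFmpeg's C version verbatim:

static int sad16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;

    for (int i = 0; i < h; i++) {
        for (int x = 0; x < 16; x++) {
            /* single rounded 2x2 average: avg(3,0,0,1) = (3+0+0+1+2)>>2 = 1,
               as intended, with no intermediate rounding */
            int avg = (pix2[x] + pix2[x + 1]
                     + pix2[x + line_size] + pix2[x + line_size + 1] + 2) >> 2;
            s += abs(pix1[x] - avg);
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    return s;
}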
Example #17
int field_dct_best_altivec(FIELD_DCT_BEST_PDECL)
{
    /*
     * calculate prediction error (cur-pred) for top (blk0)
     * and bottom field (blk1)
     */
    double r, d;
    int sumtop, sumbot, sumsqtop, sumsqbot, sumbottop;
    int topvar, botvar;
    int whichdct;

    int i;
    vector unsigned char ct, pt, cb, pb;
    vector unsigned char *ctp, *ptp, *cbp, *pbp;
    unsigned int offset, stride2;
    vector signed short cur, pred;
    vector signed short dift, difb;
    vector signed int vsumtop, vsumbot, vsumsqtop, vsumsqbot, vsumbottop;
    vector signed int t0, t1, t2, t3;
    vector signed int zero;
    union {
	vector signed int v;
	struct {
	    signed int top;
	    signed int bot;
	    signed int sqtop;
	    signed int sqbot;
	} sum;
	struct {
	    signed int pad[3];
	    signed int sum;
	} bottop;
    } vo;


    AMBER_START;


#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(cur_lum_mb))
	mjpeg_error_exit1("field_dct_best: cur_lum_mb %% 16 != 0, (%d)\n",
	    cur_lum_mb);

    if (NOT_VECTOR_ALIGNED(pred_lum_mb))
	mjpeg_error_exit1("field_dct_best: pred_lum_mb %% 16 != 0, (%d)\n",
	    pred_lum_mb);

    if (NOT_VECTOR_ALIGNED(stride))
	mjpeg_error_exit1("field_dct_best: stride %% 16 != 0, (%d)\n", stride);
#endif


    zero = vec_splat_s32(0);
    vsumtop = vec_splat_s32(0);
    vsumbot = vec_splat_s32(0);
    vsumsqtop = vec_splat_s32(0);
    vsumsqbot = vec_splat_s32(0);
    vsumbottop = vec_splat_s32(0);

    ctp = (vector unsigned char*) cur_lum_mb;
    ptp = (vector unsigned char*) pred_lum_mb;
    cbp = (vector unsigned char*)(cur_lum_mb + stride);
    pbp = (vector unsigned char*)(pred_lum_mb + stride);
    offset = 0;
    stride2 = stride << 1;

#if 1
    ct = vec_ld(offset, ctp);
    pt = vec_ld(offset, ptp);
    cb = vec_ld(offset, cbp);
    pb = vec_ld(offset, pbp);

    i = 16/2 - 1;
    do {
	cur = (vector signed short)vec_mergeh(vu8(zero), ct);
	pred = (vector signed short)vec_mergeh(vu8(zero), pt);
	dift = vec_sub(cur, pred);

	cur = (vector signed short)vec_mergeh(vu8(zero), cb);
	pred = (vector signed short)vec_mergeh(vu8(zero), pb);
	difb = vec_sub(cur, pred);

	vsumtop = vec_sum4s(dift, vsumtop);
	vsumbot = vec_sum4s(difb, vsumbot);

	vsumsqtop = vec_msum(dift, dift, vsumsqtop);
	vsumsqbot = vec_msum(difb, difb, vsumsqbot);

	vsumbottop = vec_msum(dift, difb, vsumbottop);

	cur = (vector signed short)vec_mergel(vu8(zero), ct);
	pred = (vector signed short)vec_mergel(vu8(zero), pt);
	dift = vec_sub(cur, pred);

	cur = (vector signed short)vec_mergel(vu8(zero), cb);
	pred = (vector signed short)vec_mergel(vu8(zero), pb);
	difb = vec_sub(cur, pred);

	offset += stride2;
	ct = vec_ld(offset, ctp);
	pt = vec_ld(offset, ptp);
	cb = vec_ld(offset, cbp);
	pb = vec_ld(offset, pbp);

	vsumtop = vec_sum4s(dift, vsumtop);
	vsumbot = vec_sum4s(difb, vsumbot);

	vsumsqtop = vec_msum(dift, dift, vsumsqtop);
	vsumsqbot = vec_msum(difb, difb, vsumsqbot);

	vsumbottop = vec_msum(dift, difb, vsumbottop);
    } while (--i);
    cur = (vector signed short)vec_mergeh(vu8(zero), ct);
    pred = (vector signed short)vec_mergeh(vu8(zero), pt);
    dift = vec_sub(cur, pred);

    cur = (vector signed short)vec_mergeh(vu8(zero), cb);
    pred = (vector signed short)vec_mergeh(vu8(zero), pb);
    difb = vec_sub(cur, pred);

    vsumtop = vec_sum4s(dift, vsumtop);
    vsumbot = vec_sum4s(difb, vsumbot);

    vsumsqtop = vec_msum(dift, dift, vsumsqtop);
    vsumsqbot = vec_msum(difb, difb, vsumsqbot);

    vsumbottop = vec_msum(dift, difb, vsumbottop);

    cur = (vector signed short)vec_mergel(vu8(zero), ct);
    pred = (vector signed short)vec_mergel(vu8(zero), pt);
    dift = vec_sub(cur, pred);

    cur = (vector signed short)vec_mergel(vu8(zero), cb);
    pred = (vector signed short)vec_mergel(vu8(zero), pb);
    difb = vec_sub(cur, pred);

    vsumtop = vec_sum4s(dift, vsumtop);
    vsumbot = vec_sum4s(difb, vsumbot);

    vsumsqtop = vec_msum(dift, dift, vsumsqtop);
    vsumsqbot = vec_msum(difb, difb, vsumsqbot);

    vsumbottop = vec_msum(dift, difb, vsumbottop);
#else
    for (i = 0; i < 16/2; i++) { /* {{{ */
	ct = vec_ld(offset, ctp);
	pt = vec_ld(offset, ptp);
	cb = vec_ld(offset, cbp);
	pb = vec_ld(offset, pbp);

	cur = (vector signed short)vec_mergeh(vu8(zero), ct);
	pred = (vector signed short)vec_mergeh(vu8(zero), pt);
	dift = vec_sub(cur, pred);

	cur = (vector signed short)vec_mergeh(vu8(zero), cb);
	pred = (vector signed short)vec_mergeh(vu8(zero), pb);
	difb = vec_sub(cur, pred);

	vsumtop = vec_sum4s(dift, vsumtop);
	vsumbot = vec_sum4s(difb, vsumbot);

	vsumsqtop = vec_msum(dift, dift, vsumsqtop);
	vsumsqbot = vec_msum(difb, difb, vsumsqbot);

	vsumbottop = vec_msum(dift, difb, vsumbottop);

	cur = (vector signed short)vec_mergel(vu8(zero), ct);
	pred = (vector signed short)vec_mergel(vu8(zero), pt);
	dift = vec_sub(cur, pred);

	cur = (vector signed short)vec_mergel(vu8(zero), cb);
	pred = (vector signed short)vec_mergel(vu8(zero), pb);
	difb = vec_sub(cur, pred);

	vsumtop = vec_sum4s(dift, vsumtop);
	vsumbot = vec_sum4s(difb, vsumbot);

	vsumsqtop = vec_msum(dift, dift, vsumsqtop);
	vsumsqbot = vec_msum(difb, difb, vsumsqbot);

	vsumbottop = vec_msum(dift, difb, vsumbottop);

	offset += stride2;
    } /* }}} */
#endif

    /* transpose [sumtop, sumbot, sumsqtop, sumsqbot] {{{ */
    t0 = vec_mergel(vsumtop, vsumsqtop);
    t1 = vec_mergeh(vsumtop, vsumsqtop);
    t2 = vec_mergel(vsumbot, vsumsqbot);
    t3 = vec_mergeh(vsumbot, vsumsqbot);
    vsumtop = vec_mergeh(t1, t3);
    vsumbot = vec_mergel(t1, t3);
    vsumsqtop = vec_mergeh(t0, t2);
    vsumsqbot = vec_mergel(t0, t2);
    /* }}} */

    /* sum final values for sumtop, sumbot, sumsqtop, sumsqbot */
    vsumtop = vec_add(vsumtop, vsumbot);
    vsumsqtop = vec_add(vsumsqtop, vsumsqbot);
    vo.v = vec_add(vsumtop, vsumsqtop);

    sumtop = vo.sum.top;
    sumbot = vo.sum.bot;
    sumsqtop = vo.sum.sqtop;
    sumsqbot = vo.sum.sqbot;

    vsumbottop = vec_sums(vsumbottop, zero);

    vo.v = vsumbottop;


    /* Calculate variances for the top and bottom fields.  If they are
       of similar sign, estimate the correlation; if it is good, use
       frame DCT, otherwise use field DCT.
     */
    whichdct = FIELD_DCT;
    r = 0.0;
    topvar = sumsqtop-sumtop*sumtop/128;
    botvar = sumsqbot-sumbot*sumbot/128;
    if (!((topvar > 0) ^ (botvar > 0)))
    {
	sumbottop = vo.bottop.sum;

	d = ((double) topvar) * ((double)botvar);
	r = (sumbottop-(sumtop*sumbot)/128);
	if (r > (0.5 * sqrt(d)))
	    whichdct = FRAME_DCT;
    }

    AMBER_STOP;

    return whichdct;
}
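
The acceptance test above compares the raw covariance against half the geometric mean of the two variances, i.e. it checks whether the normalized correlation exceeds 0.5. A scalar sketch of just the decision (a hypothetical helper mirroring the code above, with 128 = 16x8 samples per field; sqrt() from <math.h>):

static int frame_dct_better_sketch(int sumtop, int sumbot, int sumsqtop,
                                   int sumsqbot, int sumbottop)
{
    double topvar = sumsqtop - (double)sumtop * sumtop / 128;
    double botvar = sumsqbot - (double)sumbot * sumbot / 128;
    double cov    = sumbottop - (double)sumtop * sumbot / 128;

    /* variances of like sign and correlation above 0.5 favour frame DCT */
    return (topvar > 0) == (botvar > 0) && cov > 0.5 * sqrt(topvar * botvar);
}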
Example #18
/** Reduce **/
inline short v_reduce_sum(const v_int16x8& a)
{
    const vec_int4 zero = vec_int4_z;
    return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
Example #19
void subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)
{
    int i, ii, j, stride1, stride2, stride3, stride4, halfstride;
    unsigned char *pB, *pB2, *pB4;
    vector unsigned char l0, l1, l2, l3;
    vector unsigned short s0, s1, s2, s3;
    vector unsigned short s22_0, s22_1, s22_2, s22_3;
    vector unsigned short s44, s44_0, s44_1;
    vector unsigned short zero, two;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "image", 16, image);
    if (NOT_VECTOR_ALIGNED(sub22_image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "sub22_image", 16, sub22_image);
    if (NOT_VECTOR_ALIGNED(sub44_image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "sub44_image", 16, sub44_image);

    if ((rowstride & 63) != 0)
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "rowstride", 64, rowstride);
#endif

    AMBER_START;

    pB = image;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(6,4,0);
    dsc.block.stride = rowstride;

    vec_dst(pB, dsc.control, 0);
#endif

    pB2 = sub22_image;
    pB4 = sub44_image;

    j = ((unsigned long)(pB2 - pB) / rowstride) >> 2; /* height/4 */

    stride1 = rowstride;
    stride2 = stride1 + stride1;
    stride3 = stride2 + stride1;
    stride4 = stride2 + stride2;
    halfstride = stride1 >> 1; /* /2 */

    ii = rowstride >> 6; /* rowstride/16/4 */

    zero = vec_splat_u16(0);
    two = vec_splat_u16(2);

    do {
	i = ii;
	do {
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;
#ifdef ALTIVEC_DST
	    vec_dst(pB + (16 * 3), dsc.control, 0);
#endif

	    /* l0 = 0x[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F] */
	    /* l1 = 0x[10,11,12,13,14,15,16,17,18,19,1A,1B,1C,1D,1E,1F] */
	    /* l2 = 0x[20,21,22,23,24,25,26,27,28,29,2A,2B,2C,2D,2E,2F] */
	    /* l3 = 0x[30,31,32,33,34,35,36,37,38,39,3A,3B,3C,3D,3E,3F] */

	    /* s0 = 0x[00,01,      02,03,      04,05,      06,07,     ] */
	    /*        [      10,11,      12,13,      14,15,      16,17] */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    /* s0 = 0x[00+01+10+11,02+03+12+13,04+05+14+15,06+07+16+17] */
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));

	    /* s1 = 0x[08,09,      0A,0B,      0C,0D,      0E,0F,     ] */
	    /*        [      18,19,      1A,1B,      1C,1D,      1E,1F] */
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    /* s1 = 0x[08+09+18+19,0A+0B+1A+1B,0C+0D+1C+1D,0E+0F+1E+1F] */
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));

	    /* s2 = 0x[20,21,      22,23,      24,25,      26,27,     ] */
	    /*        [      30,31,      32,33,      34,35,      36,37] */
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    /* s2 = 0x[20+21+30+31,22+23+32+33,24+25+34+35,26+27+36+37] */
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));

	    /* s3 = 0x[28,29,      2A,2B,      2C,2D,      2E,2F,     ] */
	    /*        [      38,39,      3A,3B,      3C,3D,      3E,3F] */
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    /* s3 = 0x[28+29+38+39,2A+2B+3A+3B,2C+2D+3C+3D,2E+2F+3E+3F] */
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next block */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;

	    /* s0 = 0x[00+01+10+11, 02+03+12+13, 04+05+14+15, 06+07+16+17] */
	    /* s1 = 0x[08+09+18+19, 0A+0B+1A+1B, 0C+0D+1C+1D, 0E+0F+1E+1F] */
	    /* s2 = 0x[20+21+30+31, 22+23+32+33, 24+25+34+35, 26+27+36+37] */
	    /* s3 = 0x[28+29+38+39, 2A+2B+3A+3B, 2C+2D+3C+3D, 2E+2F+3E+3F] */

	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
	    s22_1 = vec_packsu(vu32(s2), vu32(s3));

	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]) + 2 */
	    s22_0 = vec_add(s22_0, two);
	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]) + 2 */
	    s22_1 = vec_add(s22_1, two);

	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]+2) >> 2 */
	    s22_0 = vec_sra(s22_0, two);
	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]+2) >> 2 */
	    s22_1 = vec_sra(s22_1, two);

	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
	    /* s44_0 = 0x[00+20,02+22,04+24,06+26,08+28,0A+2A,0C+2C,0E+2E] */
	    s44_0 = vec_add(s22_0, s22_1);

	    /* s44_0 = 0x[00+20+02+22, 04+24+06+26, 08+28+0A+2A, 0C+2C+0E+2E] */
	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

	    /* - - - - - - - - - - - - - - - - - - - */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next l[0-3] */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;


	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
	    s22_3 = vec_packsu(vu32(s2), vu32(s3));

	    s22_2 = vec_add(s22_2, two);
	    s22_3 = vec_add(s22_3, two);

	    s22_2 = vec_sra(s22_2, two);
	    s22_3 = vec_sra(s22_3, two);


	    s44_1 = vec_add(s22_2, s22_3);
	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

	    /* store s22 block */
	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
	    vec_st(vu8(s22_0), 0, pB2);
	    vec_st(vu8(s22_1), halfstride, pB2);
	    pB2 += 16;

	    /* - - - - - - - - - - - - - - - - - - - */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next l[0-3] */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;


	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
	    s22_1 = vec_packsu(vu32(s2), vu32(s3));

	    s22_0 = vec_add(s22_0, two);
	    s22_1 = vec_add(s22_1, two);

	    s22_0 = vec_sra(s22_0, two);
	    s22_1 = vec_sra(s22_1, two);


	    s44 = vec_packsu(vu32(s44_0), vu32(s44_1));
	    s44 = vec_add(s44, two);
	    s44 = vec_sra(s44, two);

	    s44_0 = vec_add(s22_0, s22_1);
	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

	    /* - - - - - - - - - - - - - - - - - - - */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
	    s22_3 = vec_packsu(vu32(s2), vu32(s3));

	    s22_2 = vec_add(s22_2, two);
	    s22_3 = vec_add(s22_3, two);

	    s22_2 = vec_sra(s22_2, two);
	    s22_3 = vec_sra(s22_3, two);

	    s44_1 = vec_add(s22_2, s22_3);
	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

	    /* store s22 block */
	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
	    vec_st(vu8(s22_0), 0, pB2);
	    vec_st(vu8(s22_1), halfstride, pB2);
	    pB2 += 16;

	    /* pack all four s44 chunks */
	    s44_0 = vec_packsu(vu32(s44_0), vu32(s44_1));
	    s44_0 = vec_add(s44_0, two);
	    s44_0 = vec_sra(s44_0, two);
	    s44 = vu16(vec_packsu(s44, s44_0));

	    vec_st(vu8(s44), 0, pB4);
	    pB4 += 16;

	} while (--i);

	pB += stride3;
	pB2 += halfstride;

    } while (--j);

#ifdef ALTIVEC_DST
    vec_dss(0);
#endif

    AMBER_STOP;
}
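
The inner loop amounts to repeated 2x2 box averaging with round-to-nearest: once from image to sub22_image, and once more on that result for sub44_image. A scalar sketch of a single decimation pass (a hypothetical helper following the (a+b+c+d+2)>>2 expressions in the comments):

static void decimate2x2_sketch(unsigned char *in, unsigned char *out,
                               int in_stride, int out_stride,
                               int out_w, int out_h)
{
    for (int y = 0; y < out_h; y++) {
        for (int x = 0; x < out_w; x++)
            out[x] = (in[2*x] + in[2*x + 1]
                    + in[2*x + in_stride] + in[2*x + in_stride + 1] + 2) >> 2;
        in  += 2 * in_stride;
        out += out_stride;
    }
}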