Example #1
static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
                           int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 15 - output_bits;
    const int add = (1 << (shift - 1));
    const int clip = (1 << output_bits) - 1;
    const vector uint16_t vadd = (vector uint16_t) {add, add, add, add, add, add, add, add};
    const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
    const vector uint16_t vshift = (vector uint16_t) vec_splat_u16(shift);
    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
    vector uint16_t v;
    int i;

    yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0);
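    /* The call above covers the unaligned leading samples.  The vector loop
     * below then handles 8 samples per iteration: add the rounding bias,
     * shift down to output_bits, clamp to the valid range and, for big-endian
     * output, byte-swap by rotating each halfword by 8.  The trailing scalar
     * call takes care of any remaining tail. */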

    for (i = dst_u; i < dstW - 7; i += 8) {
        v = vec_vsx_ld(0, (const uint16_t *) &src[i]);
        v = vec_add(v, vadd);
        v = vec_sr(v, vshift);
        v = vec_min(v, vlargest);
        v = vec_rl(v, vswap);
        vec_st(v, 0, &dest[i]);
    }

    yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
}
Example #2
void
transfer8x8_copy_altivec_c( uint8_t * dst,
                            uint8_t * src,
                            uint32_t stride)
{
    register vector unsigned char tmp;
    register vector unsigned char mask;
    register vector unsigned char t0, t1;
    
#ifdef DEBUG
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "transfer8x8_copy_altivec:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "transfer8x8_copy_altivec:incorrect stride, stride: %u\n", stride);
#endif
    mask = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
    
    COPY8TO8();
    COPY8TO8();
    COPY8TO8();
    COPY8TO8();
    
    COPY8TO8();
    COPY8TO8();
    COPY8TO8();
    COPY8TO8();
}
Example #3
void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[16] )
{
    vec_u16_t onev = vec_splat_u16(1);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3;

    s0 = vec_ld( 0x00, dct );
    s1 = vec_sld( s0, s0, 8 );
    s2 = vec_ld( 0x10, dct );
    s3 = vec_sld( s2, s2, 8 );

    vec_s16_t d0, d1, d2, d3;
    IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 );

    vec_s16_t tr0, tr1, tr2, tr3;

    VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 );

    vec_s16_t idct0, idct1, idct2, idct3;
    IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );

    vec_u8_t perm_ldv = vec_lvsl( 0, dst );
    vec_u16_t sixv = vec_splat_u16(6);
    LOAD_ZERO;

    ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
}
Example #4
void transfer_16to8copy_altivec_c(uint8_t *dst,
                            vector signed short *src,
                            uint32_t stride)
{
    register vector signed short s;
    register vector unsigned char packed;
    register vector unsigned char mask_stencil;
    register vector unsigned char mask;
    register vector unsigned char load_src_perm;
    
#ifdef DEBUG
    /* if this is on, print alignment errors */
    if(((unsigned long) dst) & 0x7)
        fprintf(stderr, "transfer_16to8copy_altivec:incorrect align, dst %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "transfer_16to8copy_altivec:incorrect align, stride %u\n", stride);
#endif
    /* Initialisation stuff */
    load_src_perm = vec_lvsl(0, (unsigned char*)src);
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
    
    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
    
    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
}
Example #5
uint32_t
sad8_altivec_c(const uint8_t * cur,
	   const uint8_t *ref,
	   const uint32_t stride)
{
	uint32_t result = 0;
	
	register vector unsigned int sad;
	register vector unsigned char c;
	register vector unsigned char r;
	
	/* initialize */
	sad = vec_splat_u32(0);
	
	/* Perform sad operations */
	SAD8();
	SAD8();
	SAD8();
	SAD8();
	
	SAD8();
	SAD8();
	SAD8();
	SAD8();
	
	/* finish addition, add the first 2 together */
	sad = vec_and(sad, (vector unsigned int)vec_pack(vec_splat_u16(-1),vec_splat_u16(0)));
	sad = (vector unsigned int)vec_sums((vector signed int)sad, vec_splat_s32(0));
	sad = vec_splat(sad,3);
	vec_ste(sad, 0, &result);
		
	return result;
}
Example #6
/* next one assumes that ((line_size % 8) == 0) */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
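    /* When pixels is 15 bytes past a 16-byte boundary, pixels+1 is aligned,
       so temp2 already contains exactly the bytes needed and the second
       permute can be skipped. */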
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
Example #7
uint32_t
dequant_mpeg_intra_altivec_c(int16_t * data,
					 const int16_t * coeff,
					 const uint32_t quant,
					 const uint32_t dcscalar,
					 const uint16_t * mpeg_quant_matrices)
{
	register const uint16_t *intra_matrix = get_intra_matrix(mpeg_quant_matrices);
	register const int16_t *coeff_ptr = coeff;
	register int16_t *data_ptr = data;
	
	register vec_sint16_t ox00;
	register vec_sint16_t level;
	register vec_sint16_t vec_2048;
	register vec_uint16_t vintra;
	register vec_uint32_t swap;
	register vec_uint32_t even,odd;
	register vec_uint32_t et,ot,t;
	
	vec_uint32_t vquant;
	vector bool short zero_less;
	vector bool short overflow;
	
#ifdef DEBUG
	if((long)data & 0xf)
		fprintf(stderr, "xvidcore: error in dequant_mpeg_intra_altivec_c, incorrect align: %lx\n", (long)data);
#endif

	/* Initialize */
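	/* quant is broadcast to all lanes via a scalar store plus vec_splat;
	   'swap' holds quant with the two halfwords of each word exchanged
	   (rotate by 16), and vec_2048 is 8 rotated left by 8 bits, i.e. 2048,
	   since splat immediates are limited to -16..15. */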
	ox00 = vec_splat_s16(0);
	*((uint32_t*)&vquant) = quant;
	vquant = vec_splat(vquant,0);
	
	swap = vec_rl(vquant, vec_splat_u32(-16));
	vec_2048 = (vec_sint16_t)vec_rl(vec_splat_u16(8),vec_splat_u16(8));
	
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	
	/* Process the first */
	data[0] = coeff[0] * dcscalar;
	if (data[0] < -2048) {
		data[0] = -2048;
	} else if (data[0] > 2047) {
		data[0] = 2047;
	}
		
	return 0;
}
Example #8
uint32_t
quant_h263_inter_altivec_c(int16_t *coeff,
                            int16_t *data,
                            const uint32_t quant,
                            const uint16_t *mpeg_quant_matrices)
{
    vector unsigned char zerovec;
    vector unsigned short mult;
    vector unsigned short quant_m_2;
    vector unsigned short quant_d_2;
    vector unsigned short sum_short;
    vector signed short acLevel;
    
    vector unsigned int even;
    vector unsigned int odd;
    
    vector bool short m2_mask;
    vector bool short zero_mask;
    
    uint32_t result;

#ifdef DEBUG
    if(((unsigned)coeff) & 15)
        fprintf(stderr, "quant_h263_inter_altivec_c:incorrect align, coeff: %lx\n", (long)coeff);
#endif
    
    /* initialisation stuff */
    zerovec = vec_splat_u8(0);
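    /* Broadcast the scalar constants: write each value into element 0, then
       vec_splat it across the vector.  quant_m_2 ends up as 2*quant and
       quant_d_2 as quant/2; mult comes from the multipliers[] table. */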
    *((unsigned short*)&mult) = (unsigned short)multipliers[quant];
    mult = vec_splat(mult, 0);
    *((unsigned short*)&quant_m_2) = (unsigned short)quant;
    quant_m_2 = vec_splat(quant_m_2, 0);
    quant_m_2 = vec_sl(quant_m_2, vec_splat_u16(1));
    *((unsigned short*)&quant_d_2) = (unsigned short)quant;
    quant_d_2 = vec_splat(quant_d_2, 0);
    quant_d_2 = vec_sr(quant_d_2, vec_splat_u16(1));
    sum_short = (vector unsigned short)zerovec;
    
    /* Quantize */
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
        
    /* Calculate the return value */
    even = (vector unsigned int)vec_sum4s((vector signed short)sum_short, (vector signed int)zerovec);
    even = (vector unsigned int)vec_sums((vector signed int)even, (vector signed int)zerovec);
    even = vec_splat(even, 3);
    vec_ste(even, 0, &result);
    return result;
}
Example #9
/* ************************************************************************* 

   NAME:  test_mladd

   USAGE: 

   test_mladd();

   returns: void

   DESCRIPTION:
                   see how the combined multiply/add operation works;
                   the result keeps the low-order 16 bits of each product

   REFERENCES:

   Ian Ollmann's Altivec Tutorial
   
   LIMITATIONS:

   GLOBAL VARIABLES:

      accessed: none

      modified: none

   FUNCTIONS CALLED:
   
   fprintf
   vec_mladd - multiply two short vectors and add a third short vector,
               all in one operation
 
   
   REVISION HISTORY:
        STR                  Description of Revision                 Author
     06-Mar-11               initial coding                           kaj

 ************************************************************************* */
void test_mladd(void)
{
  vector unsigned short shortVector1 =
       { 0, 2, 4, 8, 16, 32, 64, 128 };
  vector unsigned short addVector;
  vector unsigned short coeffVector;
  vector unsigned short resultVector;
  vector short shortVector2 =
       { -128, -64, -32, -16, 0, 16, 32, 64};
  vector short addVector2 = 
       { -10, -10, -10, -10, 0, 10, 10, 10};
  vector short coeffVector2;
  vector short resultVector2;
  short printshort[SHORT_ARRAYSIZE] __attribute__ ((aligned (16)));

  coeffVector = vec_splat_u16(2);
  addVector = vec_splat_u16(0);

  /* print vectors performing mladd on */
  fprintf(stderr,"-----------------------------------------------------------"
                 "\n\n");
  printVecUShorts("vec_mladd unsigned input vector 1", shortVector1,
                   SHORT_ARRAYSIZE);
  printVecUShorts("vec_mladd unsigned input vector to add", addVector,
                   SHORT_ARRAYSIZE);  
  printVecUShorts("vec_mladd unsigned coeffvector to multiply", coeffVector,
                   SHORT_ARRAYSIZE); 
  
  /* calculate */
  resultVector = vec_mladd(shortVector1,coeffVector,addVector);
  printVecUShorts("vec_mladd vector (Input*2+0)", resultVector,
                   SHORT_ARRAYSIZE);

  /* signed shorts */
  coeffVector2 = vec_splat_s16(2);
    
  /* print signed short vectors performing mladd on */
  fprintf(stderr,"----------------------------------------------------------"
                 "\n\n");
  printVecShorts("vec_mladd signed input vector 1", shortVector2,
                  SHORT_ARRAYSIZE);
  printVecShorts("vec_mladd signed input vector to add", addVector2,
                  SHORT_ARRAYSIZE);  
  printVecShorts("vec_mladd signed coeffvector to multiply", coeffVector2,
                  SHORT_ARRAYSIZE); 
  
  /* calculate */
  resultVector2 = vec_mladd(shortVector2,coeffVector2,addVector2);
  printVecShorts("vec_mladd vector (Input*2 + addVector2, increment pos & neg by 10)",
                  resultVector2,SHORT_ARRAYSIZE);

} /* test_mladd */
Example #10
static void predict_16x16_p_altivec( uint8_t *src )
{
    int16_t a, b, c, i;
    int H = 0;
    int V = 0;
    int16_t i00;

    for( i = 1; i <= 8; i++ )
    {
        H += i * ( src[7+i - FDEC_STRIDE ]  - src[7-i - FDEC_STRIDE ] );
        V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );
    }

    a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
    b = ( 5 * H + 32 ) >> 6;
    c = ( 5 * V + 32 ) >> 6;
    i00 = a - b * 7 - c * 7 + 16;

    vect_sshort_u i00_u, b_u, c_u;
    i00_u.s[0] = i00;
    b_u.s[0]   = b;
    c_u.s[0]   = c;

    vec_u16_t val5_v = vec_splat_u16(5);
    vec_s16_t i00_v, b_v, c_v;
    i00_v = vec_splat(i00_u.v, 0);
    b_v = vec_splat(b_u.v, 0);
    c_v = vec_splat(c_u.v, 0);
    vec_s16_t induc_v  = (vec_s16_t) CV(0,  1,  2,  3,  4,  5,  6,  7);
    vec_s16_t b8_v = vec_sl(b_v, vec_splat_u16(3));
    vec_s32_t mule_b_v = vec_mule(induc_v, b_v);
    vec_s32_t mulo_b_v = vec_mulo(induc_v, b_v);
    vec_s16_t mul_b_induc0_v = vec_pack(vec_mergeh(mule_b_v, mulo_b_v), vec_mergel(mule_b_v, mulo_b_v));
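    /* vec_mule/vec_mulo give the even/odd 32-bit products of induc_v and b_v;
       merging and packing them back yields b*x as 16-bit values for x = 0..7. */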
    vec_s16_t add_i0_b_0v = vec_adds(i00_v, mul_b_induc0_v);
    vec_s16_t add_i0_b_8v = vec_adds(b8_v, add_i0_b_0v);

    int y;

    for( y = 0; y < 16; y++ )
    {
        vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
        vec_s16_t shift_8_v = vec_sra(add_i0_b_8v, val5_v);
        vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_8_v);
        vec_st( com_sat_v, 0, &src[0]);
        src += FDEC_STRIDE;
        i00 += c;
        add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
        add_i0_b_8v = vec_adds(add_i0_b_8v, c_v);
    }
}
Example #11
/* ************************************************************************* 

   NAME:  test_add_subtract

   USAGE: 

   test_add_subtract();

   returns: void

   DESCRIPTION:
                   see how add and subtract work on vectors

   REFERENCES:

   Ian Ollmann's Altivec Tutorial
   
   LIMITATIONS:

   GLOBAL VARIABLES:

      accessed: none

      modified: none

   FUNCTIONS CALLED:
   
   fprintf
   vec_add - add two vectors
   vec_adds - add two vectors, saturation
   vec_sub - subtract two vectors
   
   REVISION HISTORY:
        STR                  Description of Revision                 Author
     27-Feb-11               initial coding                           kaj

 ************************************************************************* */
void test_add_subtract(void)
{
  vector unsigned short addVector1 = 
       { 0, 1000, 5000, 10000, 15000, 20000, 50000, 65535};
  vector signed short addSVector1 = 
       { -32768, -10000, -5000, 0, 10, 5000, 10000, 32767};
  vector signed short addSVector2 = 
       { -10, -10, -10, 0, 10, 10, 10, 10};
  vector unsigned short sumVector;
  vector signed short sumSVector;
  short printshort[SHORT_ARRAYSIZE] __attribute__ ((aligned (16)));


  /* vec_add wraps on overflow, vec_adds saturates at max/min */

  fprintf(stderr,"-----------------------------------------------------------\n");
  
  /* add 10 to each element - unsigned short */
  printVecUShorts("vec_add unsigned short input vector", addVector1,SHORT_ARRAYSIZE); 
  sumVector = vec_add(addVector1, vec_splat_u16(10));
  printVecUShorts("vec_add sum vector (Input+10) ", sumVector,SHORT_ARRAYSIZE);

  /* add 10 to each element using saturation add - unsigned short */
  sumVector = vec_adds(addVector1, vec_splat_u16(10));
  printVecUShorts("vec_adds sum vector (Input+10)", sumVector,SHORT_ARRAYSIZE);

  /* subtract 10 from each element - unsigned short */
  sumVector = vec_sub(addVector1, vec_splat_u16(10));
  printVecUShorts("vec_sub sum vector (Input-10) ", sumVector,SHORT_ARRAYSIZE);

  fprintf(stderr,"-----------------------------------------------------------\n\n");

   /* add 10 to each element - signed short */
  printVecShorts("vec_add signed short input vector", addSVector1,SHORT_ARRAYSIZE); 
  sumSVector = vec_add(addSVector1,addSVector2);
  printVecShorts("vec_add sum vector (increment pos & neg by 10) ",
                      sumSVector,SHORT_ARRAYSIZE);

   /* add 10 to each element using saturation add - signed short */
  sumSVector = vec_adds(addSVector1,addSVector2);
  printVecShorts("vec_adds sum vector (increment pos & neg by 10)",
                     sumSVector,SHORT_ARRAYSIZE);

  /* subtract 10 from each element - signed short */
  sumSVector = vec_sub(addSVector1,addSVector2);
  printVecShorts("vec_sub vector (decrement pos & neg by 10) ",
                      sumSVector,SHORT_ARRAYSIZE);


} /* test_add_subtract */
Example #12
void x264_add8x8_idct_dc_altivec( uint8_t *p_dst, int16_t dct[4] )
{
    vec_s16_t dcv;
    vec_s16_t v32 = vec_sl( vec_splat_s16( 8 ), vec_splat_u16( 2 ) );
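    /* v32 = 8 << 2 = 32, the rounding term for the >>6 below; splat
       immediates only cover -16..15, hence the extra shift. */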
    vec_u16_t v6 = vec_splat_u16( 6 );
    vec_s16_t dctv = vec_vsx_ld( 0, dct );

    dctv = vec_sra( vec_add( dctv, v32 ), v6 );
    dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 0 ), (vec_s32_t)vec_splat( dctv, 1 ) );
    dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv );
    idct8_dc_altivec( &p_dst[0], dcv );
    dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 2 ), (vec_s32_t)vec_splat( dctv, 3 ) );
    dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv );
    idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dcv );
}
Example #13
static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
                           int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 3;
    const int add = (1 << (shift - 1));
    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
    const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
    const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
    vector uint32_t v, v2;
    vector uint16_t vd;
    int i;

    yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);
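    /* The vector loop handles 8 samples per iteration: round and shift two
       groups of four 32-bit samples, pack them down to 16 bits with
       saturation, then byte-swap (rotate by 8) for big-endian output. */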

    for (i = dst_u; i < dstW - 7; i += 8) {
        v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
        v = vec_add(v, vadd);
        v = vec_sr(v, vshift);

        v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
        v2 = vec_add(v2, vadd);
        v2 = vec_sr(v2, vshift);

        vd = vec_packsu(v, v2);
        vd = vec_rl(vd, vswap);

        vec_st(vd, 0, &dest[i]);
    }

    yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
}
Example #14
void
transfer_8to16sub_altivec_c(int16_t * dct,
							uint8_t * cur,
							uint8_t * ref,
							const uint32_t stride)
{
	register vector unsigned char c,r;
	register vector unsigned char ox00;
	register vector unsigned char mask_00ff;
	register vector unsigned char mask;
	register vector signed short cs,rs;
	
#ifdef DEBUG
	if((long)dct & 0xf)
		fprintf(stderr, "transfer_8to16sub_altivec_c:incorrect align, dct: %lx\n", (long)dct);
	if((long)cur & 0x7)
		fprintf(stderr, "transfer_8to16sub_altivec_c:incorrect align, cur: %lx\n", (long)cur);
	if(stride & 0x7)
		fprintf(stderr, "transfer_8to16sub_altivec_c:incorrect stride, stride: %lu\n", (long)stride);
#endif
	/* initialize */
	ox00 = vec_splat_u8(0);
	mask_00ff = vec_pack((vector unsigned short)ox00,vec_splat_u16(-1));
	
	SUB8TO16();
	SUB8TO16();
	SUB8TO16();
	SUB8TO16();
	
	SUB8TO16();
	SUB8TO16();
	SUB8TO16();
	SUB8TO16();
}
Example #15
void ff_vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64])
{
    vec_u8 t;
    IDCT_START

    // pixels are signed; so add 128*16 in addition to the normal 8
    vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
    eight = vec_add(eight, v2048);

    IDCT_1D(NOP, NOP)
    TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
    IDCT_1D(ADD8, SHIFT4)

#define PUT(a)\
    t = vec_packsu(a, a);\
    vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
    vec_ste((vec_u32)t, 4, (unsigned int *)dst);

    PUT(b0)     dst += stride;
    PUT(b1)     dst += stride;
    PUT(b2)     dst += stride;
    PUT(b3)     dst += stride;
    PUT(b4)     dst += stride;
    PUT(b5)     dst += stride;
    PUT(b6)     dst += stride;
    PUT(b7)
}
Example #16
static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride )
{
    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    vec_u8 perm_ldv = vec_lvsl(0, dst);
    vec_u8 perm_stv = vec_lvsr(8, dst);

    const vec_u16 onev = vec_splat_u16(1);
    const vec_u16 twov = vec_splat_u16(2);
    const vec_u16 sixv = vec_splat_u16(6);

    const vec_u8 sel = (vec_u8)
    {
        0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1
    };
    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = vec_ld(0x00, (int16_t *)dct);
    s1 = vec_ld(0x10, (int16_t *)dct);
    s2 = vec_ld(0x20, (int16_t *)dct);
    s3 = vec_ld(0x30, (int16_t *)dct);
    s4 = vec_ld(0x40, (int16_t *)dct);
    s5 = vec_ld(0x50, (int16_t *)dct);
    s6 = vec_ld(0x60, (int16_t *)dct);
    s7 = vec_ld(0x70, (int16_t *)dct);

    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7 );

    IDCT8_1D_ALTIVEC(d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}
Example #17
void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] )
{
    vec_u16_t onev = vec_splat_u16(1);
    vec_u16_t twov = vec_splat_u16(2);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;

    s0 = vec_ld(0x00, dct);
    s1 = vec_ld(0x10, dct);
    s2 = vec_ld(0x20, dct);
    s3 = vec_ld(0x30, dct);
    s4 = vec_ld(0x40, dct);
    s5 = vec_ld(0x50, dct);
    s6 = vec_ld(0x60, dct);
    s7 = vec_ld(0x70, dct);

    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7);

    vec_s16_t tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7;

    VEC_TRANSPOSE_8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
                    tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7);

    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
    IDCT8_1D_ALTIVEC(tr0,     tr1,   tr2,   tr3,   tr4,   tr5,   tr6,   tr7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);
    vec_u16_t sixv = vec_splat_u16(6);
    const vec_u8_t sel = (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    ALTIVEC_STORE_SUM_CLIP(&dst[0*FDEC_STRIDE], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*FDEC_STRIDE], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*FDEC_STRIDE], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*FDEC_STRIDE], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*FDEC_STRIDE], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*FDEC_STRIDE], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*FDEC_STRIDE], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*FDEC_STRIDE], idct7, perm_ldv, perm_stv, sel);
}
Example #18
void foo (void) 
{
  vector bool int boolVec1 = (vector bool int) vec_splat_u32(3);
  vector bool short boolVec2 = (vector bool short) vec_splat_u16(3);
  vector bool char boolVec3 = (vector bool char) vec_splat_u8(3);

  boolVec1 = vec_sld( boolVec1, boolVec1, 4 );
  boolVec2 = vec_sld( boolVec2, boolVec2, 2 );
  boolVec3 = vec_sld( boolVec3, boolVec3, 1 );
}
Example #19
void
jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
                               JDIMENSION v_samp_factor,
                               JDIMENSION width_blocks,
                               JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  int outrow, outcol;
  JDIMENSION output_cols = width_blocks * DCTSIZE;
  JSAMPROW inptr, outptr;

  __vector unsigned char this0, next0, out;
  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;

  /* Constants */
  __vector unsigned short pw_bias = { __4X2(0, 1) },
    pw_one = { __8X(1) };
  __vector unsigned char even_odd_index =
    {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
    pb_zero = { __16X(0) };

  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    outptr = output_data[outrow];
    inptr = input_data[outrow];

    for (outcol = output_cols; outcol > 0;
         outcol -= 16, inptr += 32, outptr += 16) {

      this0 = vec_ld(0, inptr);
      this0 = vec_perm(this0, this0, even_odd_index);
      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
      outl = vec_add(this0e, this0o);
      outl = vec_add(outl, pw_bias);
      outl = vec_sr(outl, pw_one);
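      /* outl now holds (even + odd + bias) >> 1 for the first 16 input
         columns; pw_bias alternates 0,1 so neighbouring output pixels round
         in opposite directions. */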

      if (outcol > 8) {
        next0 = vec_ld(16, inptr);
        next0 = vec_perm(next0, next0, even_odd_index);
        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
        outh = vec_add(next0e, next0o);
        outh = vec_add(outh, pw_bias);
        outh = vec_sr(outh, pw_one);
      } else
        outh = vec_splat_u16(0);

      out = vec_pack(outl, outh);
      vec_st(out, 0, outptr);
    }
  }
}
Example #20
static void predict_16x16_p_altivec( uint8_t *src )
{
    int H = 0, V = 0;

    for( int i = 1; i <= 8; i++ )
    {
        H += i * ( src[7+i - FDEC_STRIDE ]  - src[7-i - FDEC_STRIDE ] );
        V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] );
    }

    int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );
    int b = ( 5 * H + 32 ) >> 6;
    int c = ( 5 * V + 32 ) >> 6;
    int i00 = a - b * 7 - c * 7 + 16;

    vec_s16_u i00_u, b_u, c_u;
    i00_u.s[0] = i00;
    b_u.s[0]   = b;
    c_u.s[0]   = c;

    vec_u16_t val5_v = vec_splat_u16(5);
    vec_s16_t i00_v, b_v, c_v;
    i00_v = vec_splat(i00_u.v, 0);
    b_v = vec_splat(b_u.v, 0);
    c_v = vec_splat(c_u.v, 0);
    vec_s16_t induc_v  = (vec_s16_t) CV(0,  1,  2,  3,  4,  5,  6,  7);
    vec_s16_t b8_v = vec_sl(b_v, vec_splat_u16(3));
    vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);
    vec_s16_t add_i0_b_8v = vec_adds(b8_v, add_i0_b_0v);
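    /* Each iteration stores one row: pixel x is clip((i00 + b*x + c*y) >> 5).
       add_i0_b_0v holds i00 + b*x for x = 0..7, add_i0_b_8v adds 8*b for
       x = 8..15, and both accumulators are advanced by c per row. */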

    for( int y = 0; y < 16; y++ )
    {
        vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
        vec_s16_t shift_8_v = vec_sra(add_i0_b_8v, val5_v);
        vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_8_v);
        vec_st( com_sat_v, 0, &src[0]);
        src += FDEC_STRIDE;
        add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);
        add_i0_b_8v = vec_adds(add_i0_b_8v, c_v);
    }
}
Example #21
uint32_t
dequant_h263_inter_altivec_c(int16_t *data,
                                int16_t *coeff,
                                const uint32_t quant,
                                const uint16_t *mpeg_quant_matrices)
{
    vector signed short acLevel;
    vector signed short vec_2048;
    
    vector unsigned short quant_m_2;
    vector unsigned short quant_add;
    vector unsigned short t;
    
    register vector unsigned int even;
    register vector unsigned int odd;
    register vector unsigned int high;
    register vector unsigned int low;
    
    register vector unsigned char zerovec;
    
    vector bool short equal_zero;
    vector bool short less_zero;
    vector bool short overflow;
    
#ifdef DEBUG
    /* print alignment errors if this is on */
    if(((unsigned)data) & 15)
        fprintf(stderr, "dequant_h263_inter_altivec_c:incorrect align, data: %lx\n", (long)data);
#endif
    
    /* initialize */
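    /* quant_m_2 = 2*quant and quant_add = (quant odd ? quant : quant - 1),
       the usual H.263 inverse-quantisation constants, each broadcast to all
       lanes via a scalar store plus vec_splat. */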
    *((unsigned short*)&quant_m_2) = (unsigned short)(quant << 1);
    quant_m_2 = vec_splat(quant_m_2,0);
    
    *((unsigned short*)&quant_add) = (unsigned short)(quant & 1 ? quant : quant - 1);
    quant_add = vec_splat(quant_add,0);
    
    vec_2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
    zerovec = vec_splat_u8(0);
    
    /* dequant */
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    DEQUANT_H263_INTER_ALTIVEC();
    
    return 0;
}
Example #22
static force_inline vector unsigned int
pix_multiply (vector unsigned int p, vector unsigned int a)
{
    vector unsigned short hi, lo, mod;

    /* unpack to short */
    hi = (vector unsigned short)
	vec_mergeh ((vector unsigned char)AVV (0),
		    (vector unsigned char)p);

    mod = (vector unsigned short)
	vec_mergeh ((vector unsigned char)AVV (0),
		    (vector unsigned char)a);

    hi = vec_mladd (hi, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));

    hi = vec_sr (hi, vec_splat_u16 (8));
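    /* The two steps above compute (t + (t >> 8)) >> 8 with t = p*a + 0x80,
       i.e. a correctly rounded division of the product by 255. */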

    /* unpack to short */
    lo = (vector unsigned short)
	vec_mergel ((vector unsigned char)AVV (0),
		    (vector unsigned char)p);
    mod = (vector unsigned short)
	vec_mergel ((vector unsigned char)AVV (0),
		    (vector unsigned char)a);

    lo = vec_mladd (lo, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));

    lo = vec_sr (lo, vec_splat_u16 (8));

    return (vector unsigned int)vec_packsu (hi, lo);
}
Example #23
void
transfer_8to16sub2_altivec_c(vector signed short *dct,
                             uint8_t *cur,
                             uint8_t *ref1,
                             uint8_t *ref2,
                             const uint32_t stride)
{
    vector unsigned char r1;
    vector unsigned char r2;
    vector unsigned char r;
    vector unsigned char c;
    vector unsigned char mask;
    vector signed short cs;
    vector signed short rs;
    
#ifdef DEBUG
    /* Dump alignment errors if DEBUG is set */
    if(((unsigned long)dct) & 0xf)
        fprintf(stderr, "transfer_8to16sub2_altivec_c:incorrect align, dct: %lx\n", (long)dct);
    if(((unsigned long)cur) & 0x7)
        fprintf(stderr, "transfer_8to16sub2_altivec_c:incorrect align, cur: %lx\n", (long)cur);
    if(stride & 0x7)
        fprintf(stderr, "transfer_8to16sub2_altivec_c:incorrect stride, stride: %u\n", stride);
#endif
    
    /* Initialisation */
    mask = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
    
    SUB28TO16();
    SUB28TO16();
    SUB28TO16();
    SUB28TO16();
    
    SUB28TO16();
    SUB28TO16();
    SUB28TO16();
    SUB28TO16();
}
Example #24
uint32_t
quant_h263_intra_altivec_c(int16_t *coeff,
                                    int16_t *data,
                                    const uint32_t quant,
                                    const uint32_t dcscalar,
                                    const uint16_t *mpeg_quant_matrices)
{
    vector unsigned char zerovec;
    vector unsigned short mult;
    vector unsigned short quant_m_2;
    vector signed short acLevel;
    
    register vector unsigned int even;
    register vector unsigned int odd;
    
    vector bool short zero_mask;
    vector bool short m2_mask;
    
    register int16_t *origin_coeff = coeff;
    register int16_t *origin_data = data;

#ifdef DEBUG
    if(((unsigned)coeff) & 15)
        fprintf(stderr, "quant_h263_intra_altivec_c:incorrect align, coeff: %lx\n", (long)coeff);
#endif
    
    zerovec = vec_splat_u8(0);
    
    *((unsigned short*)&mult) = (unsigned short)multipliers[quant];
    mult = vec_splat(mult, 0);
    
    *((unsigned short*)&quant_m_2) = (unsigned short)quant;
    quant_m_2 = vec_splat(quant_m_2, 0);
    quant_m_2 = vec_sl(quant_m_2, vec_splat_u16(1));
    
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    QUANT_H263_INTRA_ALTIVEC();
    
    // still need to set the first (DC) coefficient
    origin_coeff[0] = DIV_DIV(origin_data[0], (int32_t)dcscalar);
    
    return 0;
}
Example #25
static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
{
    LOAD_H_SUBPEL_FILTER(mx-1);
    vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 a, b, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;

    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
    vec_u16 c7  = vec_splat_u16(7);

    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
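    /* The permute vectors pre-gather the filter taps: permh0/perml0 cover the
       inner taps for the first 8 output pixels (permh8/perml8 for the next 8),
       while perm_6tap0/perm_6tap8 collect the two outer taps used only by the
       6-tap filter. */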

    while (h --> 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        }
        src += src_stride;
        dst += dst_stride;
    }
}
Example #26
void x264_sub8x8_dct8_altivec( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
{
    vec_u16_t onev = vec_splat_u16(1);
    vec_u16_t twov = vec_add( onev, onev );

    PREP_DIFF_8BYTEALIGNED;

    vec_s16_t dct0v, dct1v, dct2v, dct3v,
              dct4v, dct5v, dct6v, dct7v;

    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );

    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
    VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );

    DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v,
                     dct4v, dct5v, dct6v, dct7v );

    vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
        dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v;

    VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v,
                    dct4v, dct5v, dct6v, dct7v,
                    dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
                    dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );

    DCT8_1D_ALTIVEC( dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
                     dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );

    vec_st( dct_tr0v,  0,  dct );
    vec_st( dct_tr1v, 16,  dct );
    vec_st( dct_tr2v, 32,  dct );
    vec_st( dct_tr3v, 48,  dct );

    vec_st( dct_tr4v, 64,  dct );
    vec_st( dct_tr5v, 80,  dct );
    vec_st( dct_tr6v, 96,  dct );
    vec_st( dct_tr7v, 112, dct );
}
Example #27
static void predict_8x8c_p_altivec( uint8_t *src )
{
    int H = 0, V = 0;

    for( int i = 0; i < 4; i++ )
    {
        H += ( i + 1 ) * ( src[4+i - FDEC_STRIDE] - src[2 - i -FDEC_STRIDE] );
        V += ( i + 1 ) * ( src[-1 +(i+4)*FDEC_STRIDE] - src[-1+(2-i)*FDEC_STRIDE] );
    }

    int a = 16 * ( src[-1+7*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
    int b = ( 17 * H + 16 ) >> 5;
    int c = ( 17 * V + 16 ) >> 5;
    int i00 = a -3*b -3*c + 16;

    vec_s16_u i00_u, b_u, c_u;
    i00_u.s[0] = i00;
    b_u.s[0]   = b;
    c_u.s[0]   = c;

    vec_u16_t val5_v = vec_splat_u16(5);
    vec_s16_t i00_v, b_v, c_v;
    i00_v = vec_splat(i00_u.v, 0);
    b_v = vec_splat(b_u.v, 0);
    c_v = vec_splat(c_u.v, 0);

    vec_s16_t induc_v  = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
    vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);

    PREP_STORE8;

    for( int i = 0; i < 8; ++i )
    {
        vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
        vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_0_v);
        VEC_STORE8(com_sat_v, &src[0]);
        src += FDEC_STRIDE;
        add_i0_b_0v = vec_adds(add_i0_b_0v, c_v);

    }
}
Example #28
static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
{
    vec_s16 va0, va1, va2, va3;
    vec_s16 vz0, vz1, vz2, vz3;
    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
    vec_u8 va_u8;
    vec_u32 va_u32;
    vec_s16 vdst_ss;
    const vec_u16 v6us = vec_splat_u16(6);
    vec_u8 vdst, vdst_orig;
    vec_u8 vdst_mask = vec_lvsl(0, dst);
    int element = ((unsigned long)dst & 0xf) >> 2;
    LOAD_ZERO;

    block[0] += 32;  /* add 32 as a DC-level for rounding */

    vtmp0 = vec_ld(0,block);
    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
    vtmp2 = vec_ld(16,block);
    vtmp3 = vec_sld(vtmp2, vtmp2, 8);
    memset(block, 0, 16 * sizeof(int16_t));

    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);

    va0 = vec_sra(va0,v6us);
    va1 = vec_sra(va1,v6us);
    va2 = vec_sra(va2,v6us);
    va3 = vec_sra(va3,v6us);

    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
}
Example #29
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;
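    /* Per row: the 6-tap horizontal lowpass
       out = clip(((P0+P1)*20 - (M1+P2)*5 + (M2+P3) + 16) >> 5). */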

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
Example #30
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;
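    /* First pass: horizontal 6-tap filter over 16+5 rows, keeping the
       unrounded, unshifted 16-bit intermediates in tmp for the vertical pass. */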

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
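    /* Second pass: vertical 6-tap filter on the intermediates, carried out at
       32-bit precision (even and odd lanes separately), rounded with +512 and
       shifted right by 10 before packing back to pixels. */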

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}