Example #1
File: h264dsp.c Project: AVbin/libav
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        dc16 = vec_sld(dc16, zero_s16v, 8);
    dcplus = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
    dcplus = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst+0*stride);
        v1 = vec_ld(0, dst+1*stride);
        v2 = vec_ld(0, dst+2*stride);
        v3 = vec_ld(0, dst+3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst+0*stride);
        vec_st(v1, 0, dst+1*stride);
        vec_st(v2, 0, dst+2*stride);
        vec_st(v3, 0, dst+3*stride);

        dst += 4*stride;
    }
}
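
For comparison, here is a scalar sketch of what the routine above computes (illustrative only, not libav's code): the rounded DC term is added to every pixel of the size x size block and the result is clamped to 0..255. The vector version gets the clamp for free from the saturating vec_adds/vec_subs pair, using dcplus when the DC term is positive and dcminus when it is negative.

static void idct_dc_add_scalar(uint8_t *dst, int16_t *block, int stride, int size)
{
    int dc = (block[0] + 32) >> 6;   /* same rounding as the vector code */
    block[0] = 0;
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            int v = dst[x] + dc;
            dst[x] = v < 0 ? 0 : (v > 255 ? 255 : v);   /* clamp to 8 bits */
        }
        dst += stride;
    }
}
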
Example #2
File: altivec-20.c Project: Alexpux/GCC
void foo( float scalar)
{
    unsigned long width;
    unsigned long x;
    vector float vColor;
    vector unsigned int selectMask;
    vColor = vec_perm( vec_ld( 0, &scalar), vec_ld( 3, &scalar), vec_lvsl( 0, &scalar) );

    float *destRow;
    vector float store, load0;

    for( ; x < width; x++)
    {
            load0 = vec_sel( vColor, load0, selectMask );
            vec_st( store, 0, destRow );
            store = load0;
    }
}
Example #3
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
        pix1v  = vec_ld( 0, pix1);
        pix2l  = vec_ld( 0, pix2);
        pix2r  = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
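
A scalar sketch of the metric being computed (illustrative, not the FFmpeg C fallback verbatim): the sum of absolute differences between pix1 and a half-pel interpolated pix2, where each interpolated sample is the rounded average of two horizontal neighbours, matching vec_avg's (a + b + 1) >> 1 rounding.

static int sad16_x2_scalar(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    for (int i = 0; i < h; i++) {
        for (int x = 0; x < 16; x++) {
            int avg = (pix2[x] + pix2[x + 1] + 1) >> 1;  /* same rounding as vec_avg */
            int d   = pix1[x] - avg;
            s += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
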
Example #4
File: predict.c Project: 0x0B501E7E/x264
static void predict_16x16_h_altivec( uint8_t *src )
{
    for( int i = 0; i < 16; i++ )
    {
        vec_u8_t v = vec_ld(-1, src);
        vec_u8_t v_v = vec_splat(v, 15);
        vec_st(v_v, 0, src);

        src += FDEC_STRIDE;
    }
}
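
The trick above relies on src being 16-byte aligned inside x264's reconstruction buffer: vec_ld(-1, src) fetches the aligned 16-byte block whose last byte is src - 1, so the left neighbour sits in lane 15 and vec_splat(v, 15) broadcasts it across the row. A scalar sketch of the same horizontal prediction (FDEC_STRIDE as in x264):

static void predict_16x16_h_scalar(uint8_t *src)
{
    for (int i = 0; i < 16; i++) {
        uint8_t left = src[-1];          /* pixel immediately to the left of the row */
        for (int x = 0; x < 16; x++)
            src[x] = left;
        src += FDEC_STRIDE;
    }
}
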
Example #5
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
{
    register int i, j;
    LOCAL_ALIGNED(16, int, val, [16]);
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        unsigned int joffset=j<<1;
        unsigned int xoffset=x<<1;
        vector unsigned char perm;
        vector signed short l1,vLumFilter;
        LOAD_FILTER(vLumFilter,filter);
        vLumFilter = vec_splat(vLumFilter, 0);
        LOAD_L1(l1,src[j],perm);
        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
    }

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    VEC_ST(vf, 0, dest);
}
Example #6
File: utils.c Project: Jalle19/RetroArch
void audio_convert_float_to_s16_altivec(int16_t *out,
      const float *in, size_t samples)
{
   // Unaligned loads/stores are a bit expensive, so we optimize for the good path (very likely).
   if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0)
   {
      size_t i;
      for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
      {
         vector float input0 = vec_ld( 0, in);
         vector float input1 = vec_ld(16, in);
         vector signed int result0 = vec_cts(input0, 15);
         vector signed int result1 = vec_cts(input1, 15);
         vec_st(vec_packs(result0, result1), 0, out);
      }

      audio_convert_float_to_s16_C(out, in, samples - i);
   }
   else
      audio_convert_float_to_s16_C(out, in, samples);
}
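
Per sample, the vector path above scales by 2^15 with vec_cts(x, 15), which truncates toward zero, and then saturates to int16 with vec_packs. A rough scalar equivalent, shown only for illustration (this is not RetroArch's audio_convert_float_to_s16_C):

static void float_to_s16_scalar(int16_t *out, const float *in, size_t samples)
{
    for (size_t i = 0; i < samples; i++)
    {
        int v  = (int)(in[i] * 0x8000);                          /* truncates like vec_cts(x, 15) */
        out[i] = v > 32767 ? 32767 : (v < -32768 ? -32768 : v);  /* saturate like vec_packs */
    }
}
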
Example #7
static inline void
altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW)
{
    register int i;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));
    if ((uintptr_t)dest % 16) {
        /* badly aligned store, we force store alignment */
        /* and will handle load misalignment on val w/ vec_perm */
        vector unsigned char perm1;
        vector signed int v1;
        for (i = 0 ; (i < dstW) &&
            (((uintptr_t)dest + i) % 16) ; i++) {
                int t = val[i] >> 19;
                dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
        }
        perm1 = vec_lvsl(i << 2, val);
        v1 = vec_ld(i << 2, val);
        for ( ; i < (dstW - 15); i+=16) {
            int offset = i << 2;
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_ld(offset + 64, val);
            vector signed int v12 = vec_perm(v1, v2, perm1);
            vector signed int v23 = vec_perm(v2, v3, perm1);
            vector signed int v34 = vec_perm(v3, v4, perm1);
            vector signed int v45 = vec_perm(v4, v5, perm1);

            vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
            vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
            vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
            vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
            vector unsigned short vs1 = vec_packsu(vA, vB);
            vector unsigned short vs2 = vec_packsu(vC, vD);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
            v1 = v5;
        }
    } else { // dest is properly aligned, great
        for (i = 0; i < dstW - 15; i += 16) {
            int offset = i << 2;
            vector signed int v1 = vec_ld(offset, val);
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19);
            vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19);
            vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19);
            vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19);
            vector unsigned short vs1 = vec_packsu(v5, v6);
            vector unsigned short vs2 = vec_packsu(v7, v8);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
        }
    }
    /* scalar tail for the last few values that do not fill a full vector store */
    for ( ; i < dstW; i++) {
        int t = val[i] >> 19;
        dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
    }
}
Example #8
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16*)v1;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1, i4;
    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;
    order >>= 4;
    do {
        i1 = vec_ld(16, v2);
        t0 = vec_perm(i2, i1, align);
        i2 = vec_ld(32, v2);
        t1 = vec_perm(i1, i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        i4 = vec_ld(16, v3);
        t0 = vec_perm(i3, i4, align);
        i3 = vec_ld(32, v3);
        t1 = vec_perm(i4, i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        v2  += 16;
        v3  += 16;
    } while(--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}
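
What the loop computes, written out in scalar form (a sketch along the lines of the C fallback): a running dot product of v1 and v2 while v1 is simultaneously updated with mul * v3. vec_msum handles the dot-product halves and vec_mladd the multiply-accumulate back into v1, sixteen samples per iteration (hence order >>= 4).

static int32_t scalarproduct_and_madd_int16_scalar(int16_t *v1, const int16_t *v2,
                                                   const int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;   /* dot-product term (vec_msum) */
        *v1++ += mul * *v3++;   /* multiply-add back into v1 (vec_mladd) */
    }
    return res;
}
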
Example #9
void float_to_int16_altivec(int16_t *dst, const float *src, int len)
{
    int i;
    vector float s0, s1;
    vector signed int t0, t1;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst)&15) //FIXME
    for(i=0; i<len-7; i+=8) {
        s0 = vec_ld(0, src+i);
        s1 = vec_ld(16, src+i);
        t0 = vec_cts(s0, 0);                       /* float -> int32, truncating toward zero */
        d0 = vec_ld(0, dst+i);                     /* aligned block holding the start of the span */
        t1 = vec_cts(s1, 0);
        d1 = vec_ld(15, dst+i);                    /* aligned block holding the end of the span */
        d = vec_packs(t0,t1);                      /* eight saturated int16 results */
        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));  /* gather the surrounding bytes that must be preserved */
        align = vec_lvsr(0, dst+i);
        d0 = vec_perm(d1, d, align);               /* preserved leading bytes + head of the result */
        d1 = vec_perm(d, d1, align);               /* tail of the result + preserved trailing bytes */
        vec_st(d0, 0, dst+i);                      /* two aligned stores cover the unaligned span */
        vec_st(d1,15, dst+i);
    }
    else
    for(i=0; i<len-7; i+=8) {
        s0 = vec_ld(0, src+i);
        s1 = vec_ld(16, src+i);
        t0 = vec_cts(s0, 0);
        t1 = vec_cts(s1, 0);
        d = vec_packs(t0,t1);
        vec_st(d, 0, dst+i);
    }
}
Example #10
static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
{
    vec_s16 va0, va1, va2, va3;
    vec_s16 vz0, vz1, vz2, vz3;
    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
    vec_u8 va_u8;
    vec_u32 va_u32;
    vec_s16 vdst_ss;
    const vec_u16 v6us = vec_splat_u16(6);
    vec_u8 vdst, vdst_orig;
    vec_u8 vdst_mask = vec_lvsl(0, dst);
    int element = ((unsigned long)dst & 0xf) >> 2;
    LOAD_ZERO;

    block[0] += 32;  /* add 32 as a DC-level for rounding */

    vtmp0 = vec_ld(0,block);
    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
    vtmp2 = vec_ld(16,block);
    vtmp3 = vec_sld(vtmp2, vtmp2, 8);
    memset(block, 0, 16 * sizeof(int16_t));

    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);

    va0 = vec_sra(va0,v6us);
    va1 = vec_sra(va1,v6us);
    va2 = vec_sra(va2,v6us);
    va3 = vec_sra(va3,v6us);

    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
}
Example #11
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
{
    union {
        vector float v;
        float s[4];
    } mul_u;
    int i;
    vector float src1, src2, dst1, dst2, mul_v, zero;

    zero = (vector float)vec_splat_u32(0);
    mul_u.s[0] = mul;
    mul_v = vec_splat(mul_u.v, 0);

    for(i=0; i<len; i+=8) {
        src1 = vec_ctf(vec_ld(0,  src+i), 0);
        src2 = vec_ctf(vec_ld(16, src+i), 0);
        dst1 = vec_madd(src1, mul_v, zero);
        dst2 = vec_madd(src2, mul_v, zero);
        vec_st(dst1,  0, dst+i);
        vec_st(dst2, 16, dst+i);
    }
}
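
The scalar meaning of the loop is simply dst[i] = src[i] * mul: vec_ctf(..., 0) converts the 32-bit integers to float with no extra scaling, and vec_madd multiplies by the splatted scalar while adding a zero vector. A sketch:

static void int32_to_float_fmul_scalar_ref(float *dst, const int *src, float mul, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
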
Example #12
static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
    int i;
    register vec_s16_t vec, *pv;

    for(i = 0; i < order; i += 8){
        pv = (vec_s16_t*)v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}
Example #13
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            intptr_t blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}
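
The branch-free vector code above implements Vorbis inverse channel coupling. A scalar sketch of the rule it encodes (sign handling done here with explicit branches instead of the mask arithmetic used above):

static void vorbis_inverse_coupling_scalar(float *mag, float *ang, intptr_t blocksize)
{
    for (intptr_t i = 0; i < blocksize; i++) {
        float m = mag[i], a = ang[i];
        if (m > 0.0f) {
            if (a > 0.0f) { ang[i] = m - a; }              /* mag kept, ang folded down */
            else          { ang[i] = m;  mag[i] = m + a; }
        } else {
            if (a > 0.0f) { ang[i] = m + a; }
            else          { ang[i] = m;  mag[i] = m - a; }
        }
    }
}
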
Example #14
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);


    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Example #15
static void vector_fmul_add_altivec(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int len)
{
    int i;
    vector float d, s0, s1, s2, t0, t1, edges;
    vector unsigned char align = vec_lvsr(0,dst),
                         mask = vec_lvsl(0, dst);

    for (i=0; i<len-3; i+=4) {
        t0 = vec_ld(0, dst+i);
        t1 = vec_ld(15, dst+i);
        s0 = vec_ld(0, src0+i);
        s1 = vec_ld(0, src1+i);
        s2 = vec_ld(0, src2+i);
        edges = vec_perm(t1 ,t0, mask);
        d = vec_madd(s0,s1,s2);
        t1 = vec_perm(d, edges, align);
        t0 = vec_perm(edges, d, align);
        vec_st(t1, 15, dst+i);
        vec_st(t0, 0, dst+i);
    }
}
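
Per element this is just a multiply-add, dst[i] = src0[i] * src1[i] + src2[i]; the extra vec_ld/vec_perm traffic in the loop above exists only to merge the results into a possibly unaligned dst without disturbing neighbouring bytes. A scalar sketch:

static void vector_fmul_add_ref(float *dst, const float *src0, const float *src1,
                                const float *src2, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
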
Example #16
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst)&15) //FIXME
    for(i=0; i<len-7; i+=8) {
        d0 = vec_ld(0, dst+i);
        d = float_to_int16_one_altivec(src+i);
        d1 = vec_ld(15, dst+i);
        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
        align = vec_lvsr(0, dst+i);
        d0 = vec_perm(d1, d, align);
        d1 = vec_perm(d, d1, align);
        vec_st(d0, 0, dst+i);
        vec_st(d1,15, dst+i);
    }
    else
    for(i=0; i<len-7; i+=8) {
        d = float_to_int16_one_altivec(src+i);
        vec_st(d, 0, dst+i);
    }
}
Example #17
int main (int argc, const char * argv[])
{
  int i;
  const float cf = 1.0;
  vector float v;
  const vector float cv = (vector float){1.0, 2.0, 3.0, 4.0};

  vec_dst(&cv, i, 0);
  v = vec_ld(0, &cv);	
  v = vec_lde(0, &cf);
  vec_lvsl(0, &cf);
  
  return 0;
}
Example #18
static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride )
{
    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    vec_u8 perm_ldv = vec_lvsl(0, dst);
    vec_u8 perm_stv = vec_lvsr(8, dst);

    const vec_u16 onev = vec_splat_u16(1);
    const vec_u16 twov = vec_splat_u16(2);
    const vec_u16 sixv = vec_splat_u16(6);

    const vec_u8 sel = (vec_u8)
    {
        0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1
    };
    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = vec_ld(0x00, (int16_t *)dct);
    s1 = vec_ld(0x10, (int16_t *)dct);
    s2 = vec_ld(0x20, (int16_t *)dct);
    s3 = vec_ld(0x30, (int16_t *)dct);
    s4 = vec_ld(0x40, (int16_t *)dct);
    s5 = vec_ld(0x50, (int16_t *)dct);
    s6 = vec_ld(0x60, (int16_t *)dct);
    s7 = vec_ld(0x70, (int16_t *)dct);

    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7 );

    IDCT8_1D_ALTIVEC(d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}
Example #19
static int a52_resample_STEREO_to_2_altivec(float * _f, int16_t * s16){
#if 0
  int i;
  int32_t * f = (int32_t *) _f;
  for (i = 0; i < 256; i++) {
    s16[2*i] = convert (f[i]);
    s16[2*i+1] = convert (f[i+256]);
  }
  return 2*256;
#else
  int i = 0;
  int32_t * f = (int32_t *) _f;
  register vector signed int f0, f4, f256, f260;
  register vector signed short reven, rodd, r0, r1;

  for (i = 0; i < 256; i+= 8) {
    f0 = vec_ld(0, f);
    f4 = vec_ld(16, f);

    f256 = vec_ld(1024, f);
    f260 = vec_ld(1040, f);

    reven = convert16_altivec(f0, f4);
    rodd = convert16_altivec(f256, f260);

    r0 = vec_mergeh(reven, rodd);
    r1 = vec_mergel(reven, rodd);
    // FIXME can be merged to spare some I/O
    unaligned_store(r0, 0, s16);
    unaligned_store(r1, 16, s16);

    f += 8;
    s16 += 16;
  }
  return(2*256);
#endif
}
Example #20
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
{
    union {
        vector float v;
        float s[4];
    } vadd;
    vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i,j;

    dst += len;
    win += len;
    src0+= len;

    vadd.s[0] = add_bias;
    vadd_bias = vec_splat(vadd.v, 0);
    zero = (vector float)vec_splat_u32(0);

    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, vadd_bias);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, vadd_bias);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}
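
The loop walks the window from both ends at once: s0 and wi move forward while s1 and wj move backward (hence the vcprm(3,2,1,0) reversal), producing dst[i] = s0*wj - s1*wi + bias on the way up and dst[j] = s0*wi + s1*wj + bias on the way down. A scalar sketch of the same windowed overlap-add (mirroring the C fallback of that FFmpeg era; names illustrative):

static void vector_fmul_window_ref(float *dst, const float *src0, const float *src1,
                                   const float *win, float add_bias, int len)
{
    dst  += len;
    win  += len;
    src0 += len;
    for (int i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i], s1 = src1[j];
        float wi = win[i],  wj = win[j];
        dst[i] = s0 * wj - s1 * wi + add_bias;
        dst[j] = s0 * wi + s1 * wj + add_bias;
    }
}
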
Example #21
File: logf.c Project: K1773R/libfreevec
float expf(float x) {
#else
float vexpf(float x) {
#endif
  vector float vexpa, va;//, vx, vn, va, vb,
//               v0, vlog2, vln2;
  
  register float exp, a;//, b, b2, b3, b4;//, b6, b8, b10, R0, R;
  float __attribute__((aligned(16))) xa[4];
  xa[0] = x;

/*  // set up a few constants
  vlog2 = vec_ld(0, &C_EXPF[0]);
  v0    = (vector float) vec_splat_u32(0);
  vln2  = vec_splat(vlog2, 1);
  vlog2 = vec_splat(vlog2, 0);

  // Load x into a vector float
  vx = vec_ld(0, xa);
  vx = vec_splat(vx, 0);

  // Split x = n*log2e + b
  vn = vec_madd(vx, vlog2e, v0);
  vn = vec_floor(vn);*/
    
  
  xa[0] = truncf(x*M_LOG2E);
  va = vec_ld(0, xa);
  vexpa = vec_expte(va);
  a = xa[0] * M_LN2;
  vec_st(vexpa, 0, xa);
/*
  b = x - a;
  b2 = b*b;
  b3 = b2*b;
  b4 = b2*b2;
  b6 = b4*b2;
  b8 = b6*b2;
  b10 = b8*b2;
  R0 =       0.1666666666666666019037   *b2 - 0.00277777777770155933842  *b4
           + 6.61375632143793436117e-05 *b6 - 1.65339022054652515390e-06 *b8
           + 4.13813679705723846039e-08 *b10;
  R = b - R0;
  //exp = 1.0 + 2.0*b/(2.0 - R);
  exp = (1680.0 + 840*b + 180*b2 + 20*b3 + b4)/(1680 - 840*b + 180*b2 - 20*b3 + b4);
*/
  exp = xa[0];
  return exp;
}
Example #22
void imageFilterSubFrom_Altivec(unsigned char *dst, unsigned char *src, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while( (((long)dst & 0xF) > 0) && (n > 0) ) {
        SUBFROM_PIXEL();
        --n; ++dst; ++src;
    }

    // Do bulk of processing using Altivec (sub 16 8-bit unsigned integers, with saturation)
    while(n >= 16) {
        vector unsigned char s = vec_ld(0,src);
        vector unsigned char d = vec_ld(0,dst);
        vector unsigned char r = vec_subs(d, s);
        vec_st(r,0,dst);

        n -= 16; src += 16; dst += 16;
    }

    // If any bytes are left over, deal with them individually
    ++n;
    BASIC_SUBFROM();
}
Example #23
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *) v1;
    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
    register vec_s32 res = zero_s32v;
#if HAVE_BIGENDIAN
    register vec_u8 align = vec_lvsl(0, v2);
    i2 = vec_ld(0, v2);
    i3 = vec_ld(0, v3);
#endif
    int32_t ires;

    order >>= 4;
    do {
        GET_T(t0,t1,v2,i1,i2);
        i0     = pv1[0];
        i1     = pv1[1];
        res    = vec_msum(t0, i0, res);
        res    = vec_msum(t1, i1, res);
        GET_T(t0,t1,v3,i4,i3);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1   += 2;
        v2    += 16;
        v3    += 16;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);

    return ires;
}
Example #24
/* Place the content of the array of structures
   in vectors x_vec, y_vec, z_vec, and t_vec */
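/* Assumed declaration for this snippet (not shown in the original listing):
 * an array of structures with four floats per element, 16-byte aligned so
 * that each vec_ld below pulls in exactly one structure. */
typedef struct { float x, y, z, t; } motion_t;
static motion_t p_motion[4] __attribute__((aligned(16)));
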
int main(int argc, char **argv) {
   vector float x_vec, y_vec, z_vec, 
      t_vec, hold[4], tmp[4];
   
   /* Load structures into vectors */
   hold[0] = vec_ld(0, (float*)p_motion);
   hold[1] = vec_ld(0, (float*)&p_motion[1]);   
   hold[2] = vec_ld(0, (float*)&p_motion[2]);   
   hold[3] = vec_ld(0, (float*)&p_motion[3]);
   
   /* Perform first step of the swizzle */
   tmp[0] = vec_mergeh(hold[0], hold[2]);
   tmp[1] = vec_mergeh(hold[1], hold[3]);
   tmp[2] = vec_mergel(hold[0], hold[2]);
   tmp[3] = vec_mergel(hold[1], hold[3]);
   
   /* Perform second step of the swizzle */
   x_vec = vec_mergeh(tmp[0], tmp[1]);
   y_vec = vec_mergel(tmp[0], tmp[1]);
   z_vec = vec_mergeh(tmp[2], tmp[3]);
   t_vec = vec_mergel(tmp[2], tmp[3]);

   return 0;
}
Example #25
File: dct.c Project: 0x0B501E7E/x264
void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] )
{
    vec_u16_t onev = vec_splat_u16(1);
    vec_u16_t twov = vec_splat_u16(2);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;

    s0 = vec_ld(0x00, dct);
    s1 = vec_ld(0x10, dct);
    s2 = vec_ld(0x20, dct);
    s3 = vec_ld(0x30, dct);
    s4 = vec_ld(0x40, dct);
    s5 = vec_ld(0x50, dct);
    s6 = vec_ld(0x60, dct);
    s7 = vec_ld(0x70, dct);

    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7);

    vec_s16_t tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7;

    VEC_TRANSPOSE_8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
                    tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7);

    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
    IDCT8_1D_ALTIVEC(tr0,     tr1,   tr2,   tr3,   tr4,   tr5,   tr6,   tr7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);
    vec_u16_t sixv = vec_splat_u16(6);
    const vec_u8_t sel = (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    ALTIVEC_STORE_SUM_CLIP(&dst[0*FDEC_STRIDE], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*FDEC_STRIDE], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*FDEC_STRIDE], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*FDEC_STRIDE], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*FDEC_STRIDE], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*FDEC_STRIDE], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*FDEC_STRIDE], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*FDEC_STRIDE], idct7, perm_ldv, perm_stv, sel);
}
Example #26
void
b()
{
  z = vec_add (x, y);

  /* Make sure the predicates accept correct argument types.  */

  int1 = vec_all_in (f, g);
  int1 = vec_all_ge (f, g);
  int1 = vec_all_eq (c, d);
  int1 = vec_all_ne (s, t);
  int1 = vec_any_eq (i, j);
  int1 = vec_any_ge (f, g);
  int1 = vec_all_ngt (f, g);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);

  vec_mtvscr (i);
  vec_dssall ();
  s = (vector signed short) vec_mfvscr ();
  vec_dss (3);

  vec_dst (pi, int1 + int2, 3);
  vec_dstst (pi, int1 + int2, 3);
  vec_dststt (pi, int1 + int2, 3);
  vec_dstt (pi, int1 + int2, 3);

  uc = (vector unsigned char) vec_lvsl (int1 + 69, (signed int *) pi);
  uc = (vector unsigned char) vec_lvsr (int1 + 69, (signed int *) pi);

  c = vec_lde (int1, (signed char *) pi);
  s = vec_lde (int1, (signed short *) pi);
  i = vec_lde (int1, (signed int *) pi);
  i = vec_ldl (int1, pi);
  i = vec_ld (int1, pi);

  vec_st (i, int2, pi);
  vec_ste (c, int2, (signed char *) pi);
  vec_ste (s, int2, (signed short *) pi);
  vec_ste (i, int2, (signed int *) pi);
  vec_stl (i, int2, pi);
}
Example #27
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for(i = 0; i < order; i += 8){
        vec1 = vec_unaligned_load(v1);
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
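
In scalar terms this is a plain dot product accumulated in 32 bits: vec_msum multiplies eight int16 pairs and folds them into four 32-bit partial sums, and vec_sums collapses those into a single lane. A sketch:

static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2, int order)
{
    int32_t sum = 0;
    while (order--)
        sum += *v1++ * *v2++;
    return sum;
}
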
Example #28
/* next one assumes that ((line_size % 16) == 0) */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
}
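
Functionally this is a straight 16-byte-per-row copy; the AltiVec version earns its keep by letting vec_perm realign an arbitrarily aligned source, while block is assumed 16-byte aligned (line_size % 16 == 0, as the comment above the function notes). A scalar sketch:

static void put_pixels16_ref(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 16; j++)
            block[j] = pixels[j];
        pixels += line_size;
        block  += line_size;
    }
}
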
Example #29
static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, src);
    int i;
    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;

// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, src);
        pixelsv2  = vec_ld(15, src);
        pixelsv1B = vec_ld(sstride, src);
        pixelsv2B = vec_ld(15 + sstride, src);
        pixelsv1C = vec_ld(sstride2, src);
        pixelsv2C = vec_ld(15 + sstride2, src);
        pixelsv1D = vec_ld(sstride3, src);
        pixelsv2D = vec_ld(15 + sstride3, src);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)dst);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               dstride, (unsigned char*)dst);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               dstride2, (unsigned char*)dst);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               dstride3, (unsigned char*)dst);
        src += sstride4;
        dst += dstride4;
    }
}
Example #30
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

    if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
        register vector unsigned char p2 = vec_ld(-3*stride, pix);
        register vector unsigned char p1 = vec_ld(-2*stride, pix);
        register vector unsigned char p0 = vec_ld(-1*stride, pix);
        register vector unsigned char q0 = vec_ld(0, pix);
        register vector unsigned char q1 = vec_ld(stride, pix);
        register vector unsigned char q2 = vec_ld(2*stride, pix);
        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
        vec_st(p1, -2*stride, pix);
        vec_st(p0, -1*stride, pix);
        vec_st(q0, 0, pix);
        vec_st(q1, stride, pix);
    }
}