예제 #1
파일: idct_altivec.c 프로젝트: 9aa5/FFmpeg
/*
 * Add eight rows of 16-bit values (vx0..vx7) to the 8x8 pixel block at
 * dest and store the sums back, saturated to unsigned 8-bit.
 *
 * dest:   destination pixels, updated in place; stepped by stride per row
 * stride: byte offset from one destination row to the next
 * blk:    input coefficients, viewed here as an array of vec_s16
 *
 * NOTE(review): vx0..vx7 and `zero` are not defined in this excerpt;
 * presumably the code that derives them from `block` was lost -- confirm
 * against the full source.
 */
void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
    vec_s16 *block = (vec_s16*)blk;
    vec_u8 tmp;
    vec_s16 tmp2, tmp3;
    vec_u8 perm0;
    vec_u8 perm1;
    vec_u8 p0, p1, p;


    /* Alignment permutes for the rows at dest (p0) and dest + stride (p1). */
    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    /* Interleave index 0xFF with the alignment indices.  In vec_perm the
       0xFF index picks a byte of the second operand (`zero` in ADD), so the
       merged permute spreads destination bytes into 16-bit lanes. */
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

/* ADD: load the aligned vector containing dest, widen the row to 16-bit
   lanes, add src with signed saturation, pack back to unsigned bytes and
   store two 32-bit words, at dest and dest + 4. */
#define ADD(dest,src,perm)                                              \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
    tmp = vec_ld (0, dest);                                             \
    tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm);       \
    tmp3 = vec_adds (tmp2, src);                                        \
    tmp = vec_packsu (tmp3, tmp3);                                      \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);               \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    /* perm0 serves rows 0, 2, 4, 6 and perm1 rows 1, 3, 5, 7. */
    ADD (dest, vx0, perm0)      dest += stride;
    ADD (dest, vx1, perm1)      dest += stride;
    ADD (dest, vx2, perm0)      dest += stride;
    ADD (dest, vx3, perm1)      dest += stride;
    ADD (dest, vx4, perm0)      dest += stride;
    ADD (dest, vx5, perm1)      dest += stride;
    ADD (dest, vx6, perm0)      dest += stride;
    ADD (dest, vx7, perm1)
예제 #2
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
       Split the pixel vectors into shorts
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
    for(i=0;i<16;i++) {
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;    
예제 #3
/* next one assumes that ((line_size % 8) == 0) */
/*
 * For each of h rows, write 8 bytes into block computed from the 2x2
 * neighbourhood of the source: (a + b + c + d + 1) >> 2, where a, b are
 * pixels[x] and pixels[x + 1] on the current row and c, d the same columns
 * on the following row.
 *
 * block:     destination; the aligned 16-byte vector containing it is
 *            loaded, 8 bytes are replaced and the vector is stored back
 * pixels:    source, may be unaligned
 * line_size: byte stride applied to both block and pixels
 * h:         number of rows to produce
 *
 * NOTE(review): vcprm() is defined outside this excerpt; it presumably
 * builds a word-granular permute in which s0/s1 select words of the second
 * operand -- confirm.
 */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* First row: pixelsv1 = pixels[0..], pixelsv2 = pixels[1..]. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* When pixels + 1 is 16-byte aligned, the shifted row is exactly temp2;
       vec_lvsl(1, pixels) would yield the identity permute and pick temp1. */
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    /* Merge with zero to get 16-bit lanes, then form the horizontal pair
       sum plus the bias of 1. */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        /* Non-zero when block sits in the upper 8 bytes of its aligned vector. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Same unaligned load as above, for the row at pixels + line_size. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (previous row pair sum + 1 + this row pair sum) >> 2 */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* This row's pair sum (plus bias) is reused for the next output row. */
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Replace only the half of the aligned vector that belongs to block. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
예제 #4
/*
 * Visible part of a routine taking a current block and two references
 * (cur, ref1, ref2) plus a stride; it returns a 32-bit value taken from a
 * vector sum.
 *
 * NOTE(review): the return type, the #endif matching #ifdef DEBUG and the
 * loop that accumulates into `sad`/`sum` are not present in this excerpt;
 * as shown, `sum` is still zero when it is reduced.  t1, t2, mask1 and
 * mask2 are presumably consumed by the missing loop -- confirm against the
 * full source.
 */
sad16bi_altivec_c(vector unsigned char *cur,
                        vector unsigned char *ref1,
                        vector unsigned char *ref2,
                        uint32_t stride)
    vector unsigned char t1, t2;
    vector unsigned char mask1, mask2;
    vector unsigned char sad;
    vector unsigned int sum;
    uint32_t result;
#ifdef DEBUG
    /* print alignment errors if this is on */
    if((long)cur & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (long)cur);
    /* NOTE(review): this message reports stride but is labelled "cur", and
       %lu does not match a uint32_t argument on targets where long is
       64-bit. */
    if(stride & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lu\n", stride);
    /* Initialisation stuff */
    /* Convert the byte stride into a count of 16-byte vectors. */
    stride >>= 4;
    /* Alignment permutes for the two (possibly unaligned) references. */
    mask1 = vec_lvsl(0, (unsigned char*)ref1);
    mask2 = vec_lvsl(0, (unsigned char*)ref2);
    sad = vec_splat_u8(0);
    sum = (vector unsigned int)sad;
    /* Reduce the four lanes into lane 3, broadcast it and store one word. */
    sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0));
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, (uint32_t*)&result);
    return result;
예제 #5
 * Sum of Squared Errors for a 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;
    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<8;i++) {
	/* Read potentially unaligned pixels into t1 and t2
	   Since we're reading 16 pixels, and actually only want 8,
	   mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
	/* Calculate abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);
        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);
        pix1 += line_size;
        pix2 += line_size;
    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);
    return s;
예제 #6
int pix_norm1_altivec(uint8_t *pix, int line_size)
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;
    sv = (vector unsigned int)vec_splat_u32(0);
    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
/*
 * Copy a horizontal span of 32-bit pixels into the framebuffer row `top`,
 * starting at column `left` and covering right - left pixels.
 *
 * left, right: span bounds in pixels; width is right - left
 * top:         destination row, scaled by tf->fb_fix.line_length
 * pixels:      source pixels, may be unaligned
 * closure:     the twin_fbdev_t for the target framebuffer
 *
 * NOTE(review): the statement controlled by the `if` below and the loop
 * header for the final scalar copy are missing from this excerpt;
 * presumably an early return and a loop over the remaining width --
 * confirm against the full source.
 */
static void _twin_fbdev_vec_put_span (twin_coord_t    left,
				      twin_coord_t    top,
				      twin_coord_t    right,
				      twin_argb32_t   *pixels,
				      void     	      *closure)
	twin_fbdev_t    	*tf = closure;
	twin_coord_t    	width = right - left;
	unsigned int		*dest;
	vector unsigned char 	edgeperm;
	vector unsigned char	src0v, src1v, srcv;

	if (!tf->active || tf->fb_base == MAP_FAILED)

	dest = (unsigned int *)(tf->fb_ptr + top * tf->fb_fix.line_length);
	dest += left;

	/* Scalar copy until dest reaches a 16-byte boundary (or width runs out). */
	while((((unsigned long)dest) & 0xf) && width--)
		*(dest++) = *(pixels++);

	/* Vector copy, 4 pixels per iteration: dest is aligned, the source is
	   realigned with vec_perm, carrying the previous load in src0v. */
	edgeperm = vec_lvsl (0, pixels);
	src0v = vec_ld (0, pixels);
	while(width >= 4) {
		src1v = vec_ld (16, pixels);
		srcv = vec_perm (src0v, src1v, edgeperm);
		vec_st ((vector unsigned int)srcv, 0, dest);
		src0v = src1v;
		dest += 4;
		pixels += 4;
		width -= 4;
		*(dest++) = *(pixels++);
예제 #8
/*
 * Add rows b0.. of a transformed 8x8 block to the pixels at dst, saturating
 * to unsigned 8-bit, 8 bytes per row.
 *
 * dst:    destination pixels, updated in place; stepped by stride per row
 * stride: byte offset between destination rows
 * block:  input coefficients
 *
 * NOTE(review): b0..b7, zero_u8v and TRANSPOSE8 are defined outside this
 * excerpt, and only ADD(b0)..ADD(b6) are visible; presumably the transform
 * and a final ADD(b7) were dropped -- confirm against the full source.
 */
void ff_vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
    vec_u8 t, vdst;
    vec_s16 vdst_16;
    /* Permute that interleaves index 0xFF with dst's alignment indices;
       with zero_u8v as second vec_perm operand it spreads destination
       bytes into 16-bit lanes.  Computed once from the initial dst. */
    vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst));


    TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);

/* ADD: load the aligned vector containing dst, widen to 16-bit lanes, add
   the row with signed saturation, pack to unsigned bytes and store two
   32-bit words, at dst and dst + 4. */
#define ADD(a)\
    vdst = vec_ld(0, dst);\
    vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\
    vdst_16 = vec_adds(a, vdst_16);\
    t = vec_packsu(vdst_16, vdst_16);\
    vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
    vec_ste((vec_u32)t, 4, (unsigned int *)dst);

    ADD(b0)     dst += stride;
    ADD(b1)     dst += stride;
    ADD(b2)     dst += stride;
    ADD(b3)     dst += stride;
    ADD(b4)     dst += stride;
    ADD(b5)     dst += stride;
    ADD(b6)     dst += stride;
예제 #9
/*
 * Convert floats from src to int16 in dst, 8 elements per iteration.
 * Elements past the last full group of 8 are not written (loop bound is
 * len - 7).
 *
 * dst: output, any alignment (unaligned path below)
 * src: input floats
 * len: number of elements
 *
 * NOTE(review): float_to_int16_one_altivec() is defined outside this
 * excerpt; presumably it converts 8 floats at src + i into one vector.
 * The `else` separating the two loops is not visible here; the second loop
 * is presumably the aligned path -- confirm.
 */
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst) & 15) //FIXME
        /* Unaligned dst: read-modify-write the two aligned vectors that
           straddle dst + i, so bytes outside the 16-byte window are kept. */
        for(i = 0; i < len - 7; i += 8)
            d0 = vec_ld(0, dst + i);
            d = float_to_int16_one_altivec(src + i);
            d1 = vec_ld(15, dst + i);
            /* d1 := the existing bytes surrounding the destination window. */
            d1 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
            align = vec_lvsr(0, dst + i);
            d0 = vec_perm(d1, d, align);
            d1 = vec_perm(d, d1, align);
            vec_st(d0, 0, dst + i);
            vec_st(d1, 15, dst + i);
        /* Direct aligned stores. */
        for(i = 0; i < len - 7; i += 8)
            d = float_to_int16_one_altivec(src + i);
            vec_st(d, 0, dst + i);
예제 #10
파일: dct.c 프로젝트: 0x0B501E7E/x264
/*
 * Inverse-transform a 4x4 coefficient block and add the result to four
 * destination rows spaced FDEC_STRIDE apart.
 *
 * dst: destination pixels, updated in place
 * dct: 16 coefficients; dct[0] is modified (rounding bias added)
 *
 * NOTE(review): IDCT_1D_ALTIVEC, VEC_TRANSPOSE_4 and
 * ALTIVEC_STORE4_SUM_CLIP are macros defined outside this excerpt; onev and
 * sixv are not referenced directly here and are presumably used inside
 * those macros -- confirm.
 */
void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[16] )
    vec_u16_t onev = vec_splat_u16(1);

    dct[0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3;

    /* Two 16-byte loads cover the 16 coefficients; vec_sld by 8 bytes
       rotates each vector so its second half comes first. */
    s0 = vec_ld( 0x00, dct );
    s1 = vec_sld( s0, s0, 8 );
    s2 = vec_ld( 0x10, dct );
    s3 = vec_sld( s2, s2, 8 );

    /* First 1-D pass, transpose, second 1-D pass. */
    vec_s16_t d0, d1, d2, d3;
    IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 );

    vec_s16_t tr0, tr1, tr2, tr3;

    VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 );

    vec_s16_t idct0, idct1, idct2, idct3;
    IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );

    /* One alignment permute, taken from dst, is shared by all four rows. */
    vec_u8_t perm_ldv = vec_lvsl( 0, dst );
    vec_u16_t sixv = vec_splat_u16(6);

    ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
예제 #11
/*
 * Set-up portion of a routine that takes 16-bit source vectors (src) and a
 * byte destination (dst) with a row stride.
 *
 * NOTE(review): only declarations, the DEBUG alignment checks and two
 * initialisations are visible; the #endif and the code that uses s, packed
 * and mask are missing from this excerpt -- confirm against the full
 * source.
 */
void transfer_16to8copy_altivec_c(uint8_t *dst,
                            vector signed short *src,
                            uint32_t stride)
    register vector signed short s;
    register vector unsigned char packed;
    register vector unsigned char mask_stencil;
    register vector unsigned char mask;
    register vector unsigned char load_src_perm;
#ifdef DEBUG
    /* if this is on, print alignment errors */
    /* Both checks test for 8-byte alignment. */
    if(((unsigned long) dst) & 0x7)
        fprintf(stderr, "transfer_16to8copy_altivec:incorrect align, dst %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "transfer_16to8copy_altivec:incorrect align, stride %u\n", stride);
    /* Initialisation stuff */
    /* Alignment permute for the (possibly unaligned) source. */
    load_src_perm = vec_lvsl(0, (unsigned char*)src);
    /* Byte mask: first 8 bytes 0x00, last 8 bytes 0xFF. */
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
예제 #12
/*
 * Sum of the squares of all pixels in a 16x16 block.
 *
 * pix:       top-left pixel of the block, any alignment
 * line_size: byte stride between rows
 * returns:   sum over the block of pix[x]^2
 *
 * NOTE(review): the alignment permute is computed once from the first row,
 * so every row must share that alignment (presumably line_size is a
 * multiple of 16) -- confirm with callers.
 */
static int pix_norm1_altivec(uint8_t *pix, int line_size)
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        /* Offset 15 (not 16) keeps the second load inside the row when pix
           is already aligned. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
예제 #13
/*
 * Convert floats from src to int16 in dst, 8 elements per iteration:
 * vec_cts with scale 0 converts to 32-bit integers and vec_packs packs two
 * such vectors into eight 16-bit values with saturation.  Elements past
 * the last full group of 8 are not written (loop bound is len - 7).
 *
 * dst: output, any alignment (unaligned path below)
 * src: input floats; loaded with vec_ld, which ignores the low 4 address
 *      bits, so src is presumably expected to be 16-byte aligned
 * len: number of elements
 *
 * NOTE(review): the `else` and closing braces separating the two loops are
 * not visible in this excerpt; the second loop is presumably the aligned
 * path -- confirm.
 */
void float_to_int16_altivec(int16_t *dst, const float *src, int len)
    int i;
    vector float s0, s1;
    vector signed int t0, t1;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst)&15) //FIXME
    /* Unaligned dst: read-modify-write the two aligned vectors that
       straddle dst + i, so bytes outside the 16-byte window are kept. */
    for(i=0; i<len-7; i+=8) {
        s0 = vec_ld(0, src+i);
        s1 = vec_ld(16, src+i);
        t0 = vec_cts(s0, 0);
        d0 = vec_ld(0, dst+i);
        t1 = vec_cts(s1, 0);
        d1 = vec_ld(15, dst+i);
        d = vec_packs(t0,t1);
        /* d1 := the existing bytes surrounding the destination window. */
        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
        align = vec_lvsr(0, dst+i);
        d0 = vec_perm(d1, d, align);
        d1 = vec_perm(d, d1, align);
        vec_st(d0, 0, dst+i);
        vec_st(d1,15, dst+i);
    /* Direct aligned stores. */
    for(i=0; i<len-7; i+=8) {
        s0 = vec_ld(0, src+i);
        s1 = vec_ld(16, src+i);
        t0 = vec_cts(s0, 0);
        t1 = vec_cts(s1, 0);
        d = vec_packs(t0,t1);
        vec_st(d, 0, dst+i);
예제 #14
/*
 * Compute the dot product of v1 and v2 while updating v1 in place as
 * v1[i] += mul * v3[i].  The dot product uses the values of v1 from before
 * the update.
 *
 * v1:    read and written through aligned vector accesses
 * v2:    read-only, may be unaligned
 * v3:    read-only; realigned with the permute derived from v2, so it
 *        must have the same misalignment as v2
 * order: element count; 16 elements are handled per iteration and the
 *        do/while body runs at least once
 * mul:   multiplier, replicated into all eight 16-bit lanes
 *
 * NOTE(review): zero_s32v is defined outside this excerpt; presumably an
 * all-zero vec_s32 -- confirm.
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
    vec_s16 *pv1 = (vec_s16*)v1;
    vec_s16 *pv2 = (vec_s16*)v2;
    vec_s16 *pv3 = (vec_s16*)v3;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1;
    /* i2/i3 carry the last-loaded v2/v3 vector into the next iteration. */
    register vec_s16 i2 = pv2[0], i3 = pv3[0];
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;
    /* Element count -> number of 16-element iterations. */
    order >>= 4;
    do {
        /* Realign 16 elements of v2. */
        t0 = vec_perm(i2, pv2[1], align);
        i2 = pv2[2];
        t1 = vec_perm(pv2[1], i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        /* Accumulate v1 * v2 into the four 32-bit partial sums. */
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        /* Realign 16 elements of v3 and apply v1 += mul * v3. */
        t0 = vec_perm(i3, pv3[1], align);
        i3 = pv3[2];
        t1 = vec_perm(pv3[1], i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        pv2 += 2;
        pv3 += 2;
    } while(--order);
    /* Reduce the partial sums into lane 3, broadcast and store one word. */
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
예제 #15
/*
 * Sum of all pixel values in a 16x16 block.
 *
 * pix:       top-left pixel of the block, any alignment
 * line_size: byte stride between rows
 * returns:   the sum of the 256 pixel values
 *
 * NOTE(review): the alignment permute is computed once from the first row,
 * so every row must share that alignment (presumably line_size is a
 * multiple of 16) -- confirm with callers.
 */
static int pix_sum_altivec(uint8_t * pix, int line_size)
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        /* Offset 15 (not 16) keeps the second load inside the row when pix
           is already aligned. */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
예제 #16
static int32_t scalarproduct_int16_altivec(const int16_t * v1, const int16_t * v2, int order, const int shift)
    int i;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    shifts = zero_u32v;
    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t = vec_sr(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
예제 #17
/* next one assumes that ((line_size % 8) == 0) */
/*
 * For each of h rows, average 8 source pixels with the 8 pixels already in
 * block (vec_avg) and store the result back into block.
 *
 * block:     destination, 8-byte aligned; the aligned 16-byte vector that
 *            contains it is loaded and stored whole
 * pixels:    source, may be unaligned
 * line_size: byte stride applied to both block and pixels
 * h:         number of rows
 *
 * NOTE(review): vcprm() is defined outside this excerpt; it presumably
 * builds a word-granular permute in which s0/s1 select words of the second
 * operand -- confirm.
 */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

   for (i = 0; i < h; i++) {
       /* block is 8 bytes-aligned, so we're either in the
          left block (16 bytes-aligned) or in the right block (not) */
       int rightside = ((unsigned long)block & 0x0000000F);

       blockv = vec_ld(0, block);
       pixelsv1 = vec_ld( 0, pixels);
       pixelsv2 = vec_ld(16, pixels);
       pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

       /* Build a vector that equals blockv in the half that must not
          change, so averaging it with blockv leaves that half as is. */
       if (rightside) {
           pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
       } else {
           pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));

       blockv = vec_avg(blockv, pixelsv);

       vec_st(blockv, 0, block);

       pixels += line_size;
       block += line_size;
예제 #18
/*
 * Horizontal filter core: for each of h rows, run FILTER_H on src and
 * store w output bytes to dst (w of 16, 8, or otherwise 4 are handled).
 *
 * dst, dst_stride: destination and its row stride; the w == 16 path uses
 *                  vec_st, which stores to a 16-byte aligned address
 * src, src_stride: source and its row stride
 * h:               number of rows
 * mx:              not referenced directly in the visible code
 * w:               output width in pixels
 * is6tap:          selects the 6-tap permute layout and moves the load
 *                  window one byte further left
 *
 * NOTE(review): FILTER_H is defined outside this excerpt; a, b, pixh, pixl,
 * outer, filth, filtl, c64, c7, mx and the perm* vectors are presumably
 * consumed inside it -- confirm.
 */
static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
    vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 a, b, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;

    /* Each group of four indices is a window of consecutive source bytes,
       sliding by one byte per group. */
    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    /* 64 = 1 << 6 and 7, built from splat immediates. */
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
    vec_u16 c7  = vec_splat_u16(7);

    /* Alignment vectors for the windows starting is6tap + 1 bytes before
       src and before src + 8. */
    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    /* Compose the window patterns with the alignment so one vec_perm does
       both; the "l" variants are the same windows shifted by 4 bytes. */
    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);

    while (h --> 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            /* Narrow case: store one 32-bit word, plus a second for w == 8. */
            filt = vec_packsu(f16h, f16h);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        src += src_stride;
        dst += dst_stride;
예제 #19
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<16;i++) {
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]	pix2iv: pix2[1]-pix2[16]
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        pix1 += line_size;
        pix2 += line_size;
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
예제 #20
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<8;i++) {
	/* Read potentially unaligned pixels into t1 and t2
	   Since we're reading 16 pixels, and actually only want 8,
	   mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

	/* Calculate a sum of abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

	/* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
예제 #21
/*
 * Convert `channels` planar float arrays to interleaved int16 in dst.
 *
 * dst:      interleaved output
 * src:      array of per-channel float pointers
 * len:      samples per channel
 * channels: 1 is forwarded to float_to_int16_altivec(); 2 has a vector
 *           path; any other count converts each channel into a temporary
 *           buffer and scatters it with a scalar loop
 *
 * NOTE(review): float_to_int16_one_altivec() and DECLARE_ALIGNED are
 * defined outside this excerpt, and the `else` keywords / braces that
 * separate the branches are not visible; the structure described here
 * follows the indentation -- confirm against the full source.
 */
static void
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                  long len, int channels)
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;
    if(channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else if (channels == 2)
        if(((long)dst) & 15)
            /* Unaligned dst: 32 output bytes span three aligned vectors;
               the surrounding bytes are read first and merged back. */
            for(i = 0; i < len - 7; i += 8)
                d0 = vec_ld(0, dst + i);
                t0 = float_to_int16_one_altivec(src[0] + i);
                d1 = vec_ld(31, dst + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                /* Interleave the two channels: c0 = first 4 pairs,
                   c1 = last 4 pairs. */
                c0 = vec_mergeh(t0, t1);
                c1 = vec_mergel(t0, t1);
                d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
                align = vec_lvsr(0, dst + i);
                d0 = vec_perm(d2, c0, align);
                d1 = vec_perm(c0, c1, align);
                vec_st(d0,  0, dst + i);
                d0 = vec_perm(c1, d2, align);
                vec_st(d1, 15, dst + i);
                vec_st(d0, 31, dst + i);
                /* Each iteration writes 16 int16; i supplies 8 of the
                   advance and this the other 8. */
                dst += 8;
            /* Aligned dst: two direct stores per iteration. */
            for(i = 0; i < len - 7; i += 8)
                t0 = float_to_int16_one_altivec(src[0] + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                d0 = vec_mergeh(t0, t1);
                d1 = vec_mergel(t0, t1);
                vec_st(d0,  0, dst + i);
                vec_st(d1, 16, dst + i);
                dst += 8;
        /* Generic channel count: convert one channel at a time into tmp
           (sized by len), then write it out with a stride of `channels`. */
        DECLARE_ALIGNED(16, int16_t, tmp)[len];
        int c, j;
        for (c = 0; c < channels; c++)
            float_to_int16_altivec(tmp, src[c], len);
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = tmp[i];
예제 #22
/*
 * Visible part of a routine taking a current block (cur), a reference
 * (ref), a stride and a best_sad value; it returns a 32-bit value taken
 * from a vector sum.
 *
 * NOTE(review): the return type, the #endif matching #ifdef DEBUG and the
 * code that accumulates into sad/sumdiffs are not present in this excerpt;
 * as shown, sumdiffs is still zero when it is stored.  t1, t2, perm and
 * best_vec are presumably consumed by the missing code -- confirm against
 * the full source.
 */
sad16_altivec_c(vector unsigned char *cur,
			  vector unsigned char *ref,
			  uint32_t stride,
			  const uint32_t best_sad)
	vector unsigned char perm;
	vector unsigned char t1, t2;
	vector unsigned int sad;
	vector unsigned int sumdiffs;
	vector unsigned int best_vec;
	uint32_t result;

#ifdef DEBUG
        /* print alignment errors if DEBUG is on */
	if (((unsigned long) cur) & 0xf)
		fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (long)cur);
	/* NOTE(review): %lu does not match a uint32_t argument on targets
	   where long is 64-bit. */
	if (stride & 0xf)
		fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", stride);
	/* initialization */
	sad = vec_splat_u32(0);
	sumdiffs = sad;
	/* Convert the byte stride into a count of 16-byte vectors. */
	stride >>= 4;
	/* Alignment permute for the (possibly unaligned) reference. */
	perm = vec_lvsl(0, (unsigned char *) ref);
	/* Write best_sad into the first word of best_vec, then broadcast it. */
	*((uint32_t*)&best_vec) = best_sad;
	best_vec = vec_splat(best_vec, 0);

	/* perform sum of differences between current and previous */



	/* copy vector sum into unaligned result */
	sumdiffs = vec_splat(sumdiffs, 3);
	vec_ste(sumdiffs, 0, (uint32_t*) &result);
	return result;
예제 #23
static unsigned reg_sad_altivec(const kvz_pixel * const data1, const kvz_pixel * const data2,
                        const int width, const int height, const unsigned stride1, const unsigned stride2)
  vector unsigned int vsad = {0,0,0,0}, vzero = {0,0,0,0}; 
  vector signed int sumdiffs;
  int tmpsad, sad = 0;
  int y, x;
  for (y = 0; y < height; ++y) {
    vector unsigned char perm1, perm2;
    perm1 = vec_lvsl(0, &data1[y * stride1]);
    perm2 = vec_lvsl(0, &data2[y * stride2]);
    for (x = 0; x <= width-16; x+=16) {
      vector unsigned char t1, t2, t3, t4, t5;
      vector unsigned char *current, *previous;
      current = (vector unsigned char *) &data1[y * stride1 + x];
      previous = (vector unsigned char *) &data2[y * stride2 + x];
      t1  = vec_perm(current[0], current[1], perm1 );  /* align current vector  */ 
      t2  = vec_perm(previous[0], previous[1], perm2 );/* align previous vector */ 
      t3  = vec_max(t1, t2 );      /* find largest of two           */ 
      t4  = vec_min(t1, t2 );      /* find smaller of two           */ 
      t5  = vec_sub(t3, t4);       /* find absolute difference      */ 
      vsad = vec_sum4s(t5, vsad);    /* accumulate sum of differences */

    for (; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
  sumdiffs = vec_sums((vector signed int) vsad, (vector signed int) vzero);
  /* copy vector sum into unaligned result */
  sumdiffs = vec_splat( sumdiffs, 3);
  vec_ste( sumdiffs, 0, &tmpsad );
  sad += tmpsad;
  return sad;
예제 #24
/*
 * For each of h rows, compute vec_avg(dst, vec_avg(src1, src2)) over 16
 * bytes and write it back to dst.  All three pointers may be unaligned.
 *
 * dst:         destination, read and written; stepped by dst_stride
 * src1:        first source; row i is at src1 + i * src_stride1
 * src2:        second source; row i is at src2 + i * 16
 * dst_stride:  byte stride of dst
 * src_stride1: byte stride of src1
 * h:           number of rows
 */
static inline void avg_pixels16_l2_altivec( uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride,
        int src_stride1, int h)
    int i;
    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    /* src2 rows are 16 bytes apart, so one alignment permute fits all rows. */
    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++)

        /* a = 16 bytes of src1, row i. */
        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);

        a = vec_perm(tmp1, tmp2, mask);

        /* b = 16 bytes of src2, row i. */
        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);

        b = vec_perm(tmp1, tmp2, mask_);

        /* Load the two aligned vectors that straddle dst. */
        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));

        /* edges = the existing bytes outside the 16-byte destination window. */
        edges = vec_perm(tmp2, tmp1, mask);

        align = vec_lvsr(0, dst);

        /* Merge the result with the preserved edge bytes and store both
           aligned vectors back. */
        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0 , dst);

        dst += dst_stride;
예제 #25
파일: swscale_altivec.c 프로젝트: 1c0n/xbmc
/*
 * Produce 16 output bytes at dest: start four 32-bit accumulators from the
 * dither values shifted left by 12, accumulate filterSize filter taps via
 * yuv2planeX_8(), shift right by 19 and pack to unsigned bytes with
 * saturation.
 *
 * filter, filterSize: filter coefficients and their count
 * src:                one source line pointer per tap
 * dest:               output; written with vec_st, which stores to a
 *                     16-byte aligned address
 * dither, offset:     dither table, indexed by (x + i + offset) & 7
 * x:                  horizontal position of the first output sample
 *
 * NOTE(review): yuv2planeX_8 is defined outside this excerpt; presumably
 * it multiplies 8 source samples by the coefficient and adds them into the
 * two accumulators passed first -- confirm.
 */
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
    register int i, j;
    DECLARE_ALIGNED(16, int, val)[16];
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
    /* Shift count 19, formed as 10 + 9 from two splat immediates. */
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    /* Four vectors of four ints cover the 16 accumulators. */
    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        /* Load the aligned vector holding filter[j], rotate filter[j] into
           lane 0 and broadcast it. */
        vector signed short l1, vLumFilter = vec_ld(j << 1, filter);
        vector unsigned char perm, perm0 = vec_lvsl(j << 1, filter);
        vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
        vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter

        /* Alignment permute and first aligned vector for src[j] + x. */
        perm = vec_lvsl(x << 1, src[j]);
        l1   = vec_ld(x << 1, src[j]);

        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    /* int32 -> uint16 -> uint8, saturating at each step. */
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    vec_st(vf, 0, dest);
예제 #26
/*
 * Add eight rows of 16-bit values (vx0..vx7) to the 8x8 pixel block at
 * dest and store the sums back, saturated to unsigned 8-bit.  The work is
 * bracketed by POWERPC_PERF counter macros.
 *
 * dest:   destination pixels, updated in place; stepped by stride per row
 * stride: byte offset from one destination row to the next
 * blk:    input coefficients, viewed here as an array of vec_s16
 *
 * NOTE(review): vx0..vx7 and `zero` are not defined in this excerpt;
 * presumably the code that derives them from `block` was lost -- confirm
 * against the full source.
 */
void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
    vec_s16 *block = (vec_s16*)blk;
    vec_u8 tmp;
    vec_s16 tmp2, tmp3;
    vec_u8 perm0;
    vec_u8 perm1;
    vec_u8 p0, p1, p;

POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);


    /* Alignment permutes for the rows at dest (p0) and dest + stride (p1). */
    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    /* Interleave index 0xFF with the alignment indices.  In vec_perm the
       0xFF index picks a byte of the second operand (`zero` in ADD), so the
       merged permute spreads destination bytes into 16-bit lanes. */
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

/* ADD: load the aligned vector containing dest, widen the row to 16-bit
   lanes, add src with signed saturation, pack back to unsigned bytes and
   store two 32-bit words, at dest and dest + 4. */
#define ADD(dest,src,perm)                                              \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
    tmp = vec_ld (0, dest);                                             \
    tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm);       \
    tmp3 = vec_adds (tmp2, src);                                        \
    tmp = vec_packsu (tmp3, tmp3);                                      \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);               \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    /* perm0 serves rows 0, 2, 4, 6 and perm1 rows 1, 3, 5, 7. */
    ADD (dest, vx0, perm0)      dest += stride;
    ADD (dest, vx1, perm1)      dest += stride;
    ADD (dest, vx2, perm0)      dest += stride;
    ADD (dest, vx3, perm1)      dest += stride;
    ADD (dest, vx4, perm0)      dest += stride;
    ADD (dest, vx5, perm1)      dest += stride;
    ADD (dest, vx6, perm0)      dest += stride;
    ADD (dest, vx7, perm1)

POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
예제 #27
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<16;i++) {
	/* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);
	/* Calculate a sum of abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);
	/* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
예제 #28
파일: lvsl.c 프로젝트: Artem-B/test-suite
/*
 * Check that vec_lvsl() returns the byte sequence 4..19 for an address 4
 * bytes into each array, for every element type: index 4 of the char
 * arrays, index 2 of the short arrays, index 1 of the int and float arrays.
 *
 * NOTE(review): sc, uc, ss, us, si, ui, f and check() are defined outside
 * this excerpt; the expected value implies the arrays start on a 16-byte
 * boundary -- confirm.
 */
static void test() {
  vector unsigned char expected = {4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19};
  check (vec_all_eq(vec_lvsl(0, &sc[4]), expected), "sc");
  check (vec_all_eq(vec_lvsl(0, &uc[4]), expected), "uc");
  check (vec_all_eq(vec_lvsl(0, &ss[2]), expected), "ss");
  check (vec_all_eq(vec_lvsl(0, &us[2]), expected), "us");
  check (vec_all_eq(vec_lvsl(0, &si[1]), expected), "si");
  check (vec_all_eq(vec_lvsl(0, &ui[1]), expected), "ui");
  check (vec_all_eq(vec_lvsl(0, & f[1]), expected), "f");
예제 #29
/* Load a vector from an unaligned location in memory */
/*
 * v:       address of the 16 bytes to load, any alignment
 * returns: the 16 bytes starting at v
 *
 * When v is not 16-byte aligned, the two aligned vectors that straddle it
 * are loaded and combined with the vec_lvsl permute.  When v is aligned, a
 * single load is used so the following 16 bytes are never read.
 */
static inline vector unsigned char
LoadUnaligned(const guchar *v)
  if ((long)v & 0x0f)
      vector unsigned char permuteVector = vec_lvsl(0, v);
      vector unsigned char low = vec_ld(0, v);
      vector unsigned char high = vec_ld(16, v);
      return vec_perm(low, high, permuteVector);
    return vec_ld(0, v); /* don't want overflow */
예제 #30
/*
 * Element-wise v1[i] -= v2[i] over int16 arrays, 8 elements per iteration.
 *
 * v1:    minuend and destination; accessed with vec_ld/vec_st, which
 *        ignore the low 4 address bits, so it is presumably expected to be
 *        16-byte aligned
 * v2:    subtrahend, may be unaligned
 * order: number of elements; the loop steps by 8
 *
 * NOTE(review): the unaligned read of v2 dereferences pv[1], which touches
 * the 16 bytes after the current group even when v2 is already aligned.
 */
static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
    int i;
    register vec_s16_t vec, *pv;

    for(i = 0; i < order; i += 8){
        /* Realign 8 elements of v2 from the two aligned vectors around it. */
        pv = (vec_s16_t*)v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;