Example #1
static void FUNC(ff_hevc_idct_4x4, BIT_DEPTH)(int16_t *coeffs, int col_limit)
{
    const int shift = 7;
    const int shift2 = 20 - BIT_DEPTH;
    vec_s16 src_01, src_23;
    vec_s32 res[4];
    vec_s16 res_packed[2];

    src_01 = vec_ld(0, coeffs);
    src_23 = vec_ld(16, coeffs);

    transform4x4(src_01, src_23, res, shift, coeffs);
    scale(res, res_packed, shift);
    // transpose
    src_01 = vec_perm(res_packed[0], res_packed[1], mask[0]);
    src_23 = vec_perm(res_packed[0], res_packed[1], mask[1]);

    transform4x4(src_01, src_23, res, shift2, coeffs);
    scale(res, res_packed, shift2);
    // transpose
    src_01 = vec_perm(res_packed[0], res_packed[1], mask[0]);
    src_23 = vec_perm(res_packed[0], res_packed[1], mask[1]);

    vec_st(src_01, 0, coeffs);
    vec_st(src_23, 16, coeffs);
}
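The transpose masks are defined elsewhere in FFmpeg's hevcdsp; as a sketch (assumed values, big-endian layout), with two rows packed per vector they would gather column pairs like this:

/* Assumed transpose masks (a sketch, not shown in the example above):
   res_packed[0] = {row0, row1}, res_packed[1] = {row2, row3};
   mask[0] gathers columns 0 and 1, mask[1] gathers columns 2 and 3. */
static const vec_u8 mask[2] = {
    { 0x00, 0x01, 0x08, 0x09, 0x10, 0x11, 0x18, 0x19,
      0x02, 0x03, 0x0A, 0x0B, 0x12, 0x13, 0x1A, 0x1B },
    { 0x04, 0x05, 0x0C, 0x0D, 0x14, 0x15, 0x1C, 0x1D,
      0x06, 0x07, 0x0E, 0x0F, 0x16, 0x17, 0x1E, 0x1F },
};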
Example #2
/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }
}
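For reference, a scalar sketch of what the loop computes; vec_avg is the rounding average, hence the +1:

/* Scalar reference (a sketch, not part of the original example) */
static void avg_pixels8_c(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            block[j] = (block[j] + pixels[j] + 1) >> 1; /* rounding average, like vec_avg */
        pixels += line_size;
        block  += line_size;
    }
}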
Example #3
static inline void do_recursion(w128_t *r, w128_t *a, w128_t * b,
                                w128_t * lung) {
  const vector unsigned char sl1 = ALTI_SL1;
  const vector unsigned char sl1_perm = ALTI_SL1_PERM;
  const vector unsigned int sl1_msk = ALTI_SL1_MSK;
  const vector unsigned char sr1 = ALTI_SR;
  const vector unsigned char sr1_perm = ALTI_SR_PERM;
  const vector unsigned int sr1_msk = ALTI_SR_MSK;
  const vector unsigned char perm = ALTI_PERM;
  const vector unsigned int msk1 = ALTI_MSK;

  vector unsigned int z = a->s;
  vector unsigned int w = lung->s;
  vector unsigned int x = vec_perm(w, (vector unsigned int)perm, perm);
  vector unsigned int y = vec_perm(z, (vector unsigned int)sl1_perm, sl1_perm);
  y = vec_sll(y, sl1);
  y = vec_and(y, sl1_msk);
  w = vec_xor(x, b->s);
  w = vec_xor(w, y);
  x = vec_perm(w, (vector unsigned int)sr1_perm, sr1_perm);
  x = vec_srl(x, sr1);
  x = vec_and(x, sr1_msk);
  y = vec_and(w, msk1);
  z = vec_xor(z, y);
  r->s = vec_xor(z, x);
  lung->s = w;
}
Example #4
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16*)v1;
    vec_s16 *pv2 = (vec_s16*)v2;
    vec_s16 *pv3 = (vec_s16*)v3;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1;
    register vec_s16 i2 = pv2[0], i3 = pv3[0];
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;
    order >>= 4;
    do {
        t0 = vec_perm(i2, pv2[1], align);
        i2 = pv2[2];
        t1 = vec_perm(pv2[1], i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        t0 = vec_perm(i3, pv3[1], align);
        i3 = pv3[2];
        t1 = vec_perm(pv3[1], i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        pv2 += 2;
        pv3 += 2;
    } while(--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}
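The contract is easier to read in scalar form: return the dot product of v1 and v2 while rewriting v1 as v1 + mul*v3 in place. A sketch:

/* Scalar reference (a sketch) */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}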
Example #5
/* Store a vector to an unaligned location in memory */
static inline void
StoreUnaligned (vector unsigned char v,
                 const guchar *where)
{
  if ((unsigned long)where & 0x0f)
    {
      /* Load the surrounding area */
      vector unsigned char low = vec_ld(0, where);
      vector unsigned char high = vec_ld(16, where);
      /* Prepare the constants that we need */
      vector unsigned char permuteVector = vec_lvsr(0, where);
      vector signed char oxFF = vec_splat_s8(-1);
      vector signed char ox00 = vec_splat_s8(0);
      /* Make a mask for which parts of the vectors to swap out */
      vector unsigned char mask = (vector unsigned char)vec_perm(ox00, oxFF, permuteVector);
      v = vec_perm(v, v, permuteVector);
      /* Insert our data into the low and high vectors */
      low = vec_sel(low, v, mask);
      high = vec_sel(v, high, mask);
      /* Store the two aligned result vectors */
      vec_st(low, 0, CONST_BUFFER(where));
      vec_st(high, 16, CONST_BUFFER(where));
    }
  else
    { /* prevent overflow */
      vec_st(v, 0, CONST_BUFFER(where));
    }
}
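Example #10 below calls a matching LoadUnaligned; the classic AltiVec counterpart of the store above would look roughly like this (a sketch, not taken from the original source):

/* Load a vector from a potentially unaligned location in memory (sketch) */
static inline vector unsigned char
LoadUnaligned (const guchar *where)
{
  if ((unsigned long)where & 0x0f)
    {
      /* Combine the two quadwords that straddle the data */
      vector unsigned char low  = vec_ld( 0, where);
      vector unsigned char high = vec_ld(16, where);
      return vec_perm(low, high, vec_lvsl(0, where));
    }
  else
    {
      /* Aligned case: a single load, and no read past the buffer */
      return vec_ld(0, where);
    }
}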
Example #6
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst) & 15) //FIXME
        for(i = 0; i < len - 7; i += 8)
        {
            d0 = vec_ld(0, dst + i);
            d = float_to_int16_one_altivec(src + i);
            d1 = vec_ld(15, dst + i);
            d1 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
            align = vec_lvsr(0, dst + i);
            d0 = vec_perm(d1, d, align);
            d1 = vec_perm(d, d1, align);
            vec_st(d0, 0, dst + i);
            vec_st(d1, 15, dst + i);
        }
    else
        for(i = 0; i < len - 7; i += 8)
        {
            d = float_to_int16_one_altivec(src + i);
            vec_st(d, 0, dst + i);
        }
}
Example #7
void float_to_int16_altivec(int16_t *dst, const float *src, int len)
{
    int i;
    vector float s0, s1;
    vector signed int t0, t1;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst)&15) //FIXME
    for(i=0; i<len-7; i+=8) {
        s0 = vec_ld(0, src+i);
        s1 = vec_ld(16, src+i);
        t0 = vec_cts(s0, 0);
        d0 = vec_ld(0, dst+i);
        t1 = vec_cts(s1, 0);
        d1 = vec_ld(15, dst+i);
        d = vec_packs(t0,t1);
        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
        align = vec_lvsr(0, dst+i);
        d0 = vec_perm(d1, d, align);
        d1 = vec_perm(d, d1, align);
        vec_st(d0, 0, dst+i);
        vec_st(d1,15, dst+i);
    }
    else
    for(i=0; i<len-7; i+=8) {
        s0 = vec_ld(0, src+i);
        s1 = vec_ld(16, src+i);
        t0 = vec_cts(s0, 0);
        t1 = vec_cts(s1, 0);
        d = vec_packs(t0,t1);
        vec_st(d, 0, dst+i);
    }
}
Example #8
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len)
{
    vector float zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i,j;

    dst += len;
    win += len;
    src0+= len;

    zero = (vector float)vec_splat_u32(0);

    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, zero);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, zero);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}
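The reversed permute lets the loop walk src1 and win from both ends at once; in scalar form the window overlap it computes is:

/* Scalar reference (a sketch) */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i, j;
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i], s1 = src1[j];
        float wi = win[i],  wj = win[j];
        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}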
Example #9
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read the unaligned pixels of pix2 once, before the loop:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
    
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        
        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
        
    }
    
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;    
}
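In scalar terms the metric is the SAD between pix1 and the rounded vertical half-pel average of pix2 (a sketch; abs() from <stdlib.h>):

/* Scalar reference (a sketch) */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i, j, s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += abs(pix1[j] - ((pix2[j] + pix2[j + line_size] + 1) >> 1));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}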
Example #10
void
gimp_composite_multiply_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar *D = ctx->D;
  guint length = ctx->n_pixels;
  vector unsigned char a,b,d,alpha_a,alpha_b,alpha;
  vector unsigned short al,ah;

  while (length >= 4)
    {
      a=LoadUnaligned(A);
      b=LoadUnaligned(B);

      al=vec_mule(a,b);
      al=vec_add(al,ox0080);
      ah=vec_mulo(a,b);
      ah=vec_add(ah,ox0080);
      al=vec_add(al,vec_sr(al,ox0008));
      ah=vec_add(ah,vec_sr(ah,ox0008));
      d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes);

      alpha_a=vec_and(a, alphamask);
      alpha_b=vec_and(b, alphamask);
      alpha=vec_min(alpha_a, alpha_b);

      d=vec_andc(d, alphamask);
      d=vec_or(d, alpha);

      StoreUnaligned(d, D);

      A+=16;
      B+=16;
      D+=16;
      length-=4;
    }
  /* process last pixels */
  length = length*4;
  a=LoadUnalignedLess(A, length);
  b=LoadUnalignedLess(B, length);

  al=vec_mule(a,b);
  al=vec_add(al,ox0080);
  ah=vec_mulo(a,b);
  ah=vec_add(ah,ox0080);
  al=vec_add(al,vec_sr(al,ox0008));
  ah=vec_add(ah,vec_sr(ah,ox0008));
  d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes);

  alpha_a=vec_and(a, alphamask);
  alpha_b=vec_and(b, alphamask);
  alpha=vec_min(alpha_a, alpha_b);

  d=vec_andc(d, alphamask);
  d=vec_or(d, alpha);

  StoreUnalignedLess(d, D, length);
}
Example #11
        SIMD_INLINE void DeinterleavedUv(const Loader<align> & uv, Storer<align> & u, Storer<align> & v)
        {
            v128_u8 _uv0 = Load<align, first>(uv);
            v128_u8 _uv1 = Load<align, false>(uv);

            Store<align, first>(u, vec_perm(_uv0, _uv1, K8_PERM_U));
            Store<align, first>(v, vec_perm(_uv0, _uv1, K8_PERM_V));
        }
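The K8_PERM_U/K8_PERM_V constants are defined elsewhere in the Simd library; assuming the usual interleaved UV layout, they would select the even and odd bytes of the two loaded vectors, e.g.:

/* Assumed values (a sketch): U bytes sit at even offsets, V at odd ones */
const v128_u8 K8_PERM_U = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 };
const v128_u8 K8_PERM_V = { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 };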
Example #12
        SIMD_INLINE void InterleavedUv(const Loader<align> & u, const Loader<align> & v, Storer<align> & uv)
        {
            v128_u8 _u = Load<align, first>(u);
            v128_u8 _v = Load<align, first>(v);

            Store<align, first>(uv, vec_perm(_u, _v, K8_PERM_UV0));
            Store<align, false>(uv, vec_perm(_u, _v, K8_PERM_UV1));
        }
Example #13
void abcd2cbad_double( ILdouble *tdata, ILuint length, ILdouble *tnewdata ) {
	register ILubyte *data = (ILubyte*)tdata;
	register ILubyte *newdata = (ILubyte*)tnewdata;
	const vector unsigned char p = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F);
	register vector unsigned char d0,d1,d2,d3,t0,t1,t2,t3;
	
	length = eround16(length);
	
	if( length >= 4 ) {
		length -= 4;
		
		d3 = vec_ld(48,data);
		d2 = vec_ld(32,data);
		d1 = vec_ld(16,data);
		d0 = vec_ld(0,data);
		
		while( length >= 4 ) {
			t0 = vec_perm(d0,d1,p);
			t1 = vec_perm(d1,d0,p);
			t2 = vec_perm(d2,d3,p);
			t3 = vec_perm(d3,d2,p);
			
			vec_st(t0,0,newdata);
			vec_st(t1,16,newdata);
			vec_st(t2,32,newdata);
			vec_st(t3,48,newdata);
			
			length -= 4;
			data += 16*4;
			newdata += 16*4;
			
			d3 = vec_ld(48,data);
			d2 = vec_ld(32,data);
			d1 = vec_ld(16,data);
			d0 = vec_ld(0,data);
		}
		t0 = vec_perm(d0,d1,p);
		t1 = vec_perm(d1,d0,p);
		t2 = vec_perm(d2,d3,p);
		t3 = vec_perm(d3,d2,p);
		
		vec_st(t0,0,newdata);
		vec_st(t1,16,newdata);
		vec_st(t2,32,newdata);
		vec_st(t3,48,newdata);
		
		data += 16*4;
		newdata += 16*4;
	}
	
	if( length == 2 ) {
		d0 = vec_ld(0,data);
		d1 = vec_ld(16,data);
		
		t0 = vec_perm(d0,d1,p);
		t1 = vec_perm(d1,d0,p);
		
		vec_st(t0,0,newdata);
		vec_st(t1,16,newdata);
	}
}
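Decoding the permute vector p: each vec_perm takes the first double of one source and the second double of the other, which swaps channels a and c in every abcd quadruple. A scalar sketch:

/* Scalar reference (a sketch): abcd -> cbad, one quadruple per pixel */
void abcd2cbad_double_c(const ILdouble *src, ILuint n_pixels, ILdouble *dst)
{
    ILuint i;
    for (i = 0; i < n_pixels; i++) {
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
        dst[4*i + 3] = src[4*i + 3];
    }
}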
Example #14
static void
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                  long len, int channels)
{
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;
    if(channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else if (channels == 2)
    {
        if(((long)dst) & 15)
            for(i = 0; i < len - 7; i += 8)
            {
                d0 = vec_ld(0, dst + i);
                t0 = float_to_int16_one_altivec(src[0] + i);
                d1 = vec_ld(31, dst + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                c0 = vec_mergeh(t0, t1);
                c1 = vec_mergel(t0, t1);
                d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
                align = vec_lvsr(0, dst + i);
                d0 = vec_perm(d2, c0, align);
                d1 = vec_perm(c0, c1, align);
                vec_st(d0,  0, dst + i);
                d0 = vec_perm(c1, d2, align);
                vec_st(d1, 15, dst + i);
                vec_st(d0, 31, dst + i);
                dst += 8;
            }
        else
            for(i = 0; i < len - 7; i += 8)
            {
                t0 = float_to_int16_one_altivec(src[0] + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                d0 = vec_mergeh(t0, t1);
                d1 = vec_mergel(t0, t1);
                vec_st(d0,  0, dst + i);
                vec_st(d1, 16, dst + i);
                dst += 8;
            }
    }
    else
    {
        DECLARE_ALIGNED(16, int16_t, tmp)[len];
        int c, j;
        for (c = 0; c < channels; c++)
        {
            float_to_int16_altivec(tmp, src[c], len);
            for (i = 0, j = c; i < len; i++, j += channels)
            {
                dst[j] = tmp[i];
            }
        }
    }
}
Example #15
void
jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
                               JDIMENSION v_samp_factor,
                               JDIMENSION width_blocks,
                               JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  int outrow, outcol;
  JDIMENSION output_cols = width_blocks * DCTSIZE;
  JSAMPROW inptr, outptr;

  __vector unsigned char this0, next0, out;
  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;

  /* Constants */
  __vector unsigned short pw_bias = { __4X2(0, 1) },
    pw_one = { __8X(1) };
  __vector unsigned char even_odd_index =
    {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
    pb_zero = { __16X(0) };

  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    outptr = output_data[outrow];
    inptr = input_data[outrow];

    for (outcol = output_cols; outcol > 0;
         outcol -= 16, inptr += 32, outptr += 16) {

      this0 = vec_ld(0, inptr);
      this0 = vec_perm(this0, this0, even_odd_index);
      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
      outl = vec_add(this0e, this0o);
      outl = vec_add(outl, pw_bias);
      outl = vec_sr(outl, pw_one);

      if (outcol > 8) {
        next0 = vec_ld(16, inptr);
        next0 = vec_perm(next0, next0, even_odd_index);
        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
        outh = vec_add(next0e, next0o);
        outh = vec_add(outh, pw_bias);
        outh = vec_sr(outh, pw_one);
      } else
        outh = vec_splat_u16(0);

      out = vec_pack(outl, outh);
      vec_st(out, 0, outptr);
    }
  }
}
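Per output pixel the kernel averages a horizontal pair with libjpeg's alternating 0/1 bias (pw_bias), which avoids a systematic rounding drift. A scalar sketch of one row:

/* Scalar reference (a sketch) for one row */
static void h2v1_downsample_row_c(const JSAMPLE *inptr, JSAMPLE *outptr,
                                  int output_cols)
{
    int j;
    for (j = 0; j < output_cols; j++)
        outptr[j] = (inptr[2*j] + inptr[2*j + 1] + (j & 1)) >> 1;
}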
Example #16
void v_store_interleave_f32(float *ptr, vector float a, vector float b, vector float c)
{
    vector float hbc = vec_mergeh(b, c);

    static const vector unsigned char ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7};
    vec_xst(vec_perm(a, hbc, ahbc),  0, ptr);

    vector float lab = vec_mergel(a, b);
    vec_xst(vec_sld(lab, hbc, 8), 16, ptr);

    static const vector unsigned char clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};
    vec_xst(vec_perm(c, lab, clab), 32, ptr);
}
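The merge/permute sequence implements a plain element-wise interleave of the three planes; a scalar sketch of the intended result:

/* Scalar reference (a sketch) */
void v_store_interleave_f32_c(float *ptr, const float a[4],
                              const float b[4], const float c[4])
{
    int i;
    for (i = 0; i < 4; i++) {
        ptr[3*i + 0] = a[i];
        ptr[3*i + 1] = b[i];
        ptr[3*i + 2] = c[i];
    }
}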
Example #17
/* Store a float vector to a potentially unaligned address */
void store_unaligned(float *target, vector float src) {
  vector float msq, lsq, edges;
  vector unsigned char edgeAlign, align;
  msq = vec_ld(0, target);               // most significant quadword
  lsq = vec_ld(15, target);              // least significant quadword
  edgeAlign = vec_lvsl(0, target);       // permute map to extract edges
  edges = vec_perm(lsq, msq, edgeAlign); // extract the edges
  align = vec_lvsr(0, target);           // permute map to misalign data
  msq = vec_perm(edges, src, align);     // misalign the data (msq)
  lsq = vec_perm(src, edges, align);     // misalign the data (lsq)
  vec_st(lsq, 15, target);               // Store the lsq part first
  vec_st(msq, 0, target);                // Store the msq part
}
Example #18
        SIMD_INLINE void Bgr48pToBgra32(const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, 
            const v128_u8 & alpha, Storer<align> & bgra)
        {
            const v128_u8 _blue = Load<align>(blue + offset);
            const v128_u8 _green = Load<align>(green + offset);
            const v128_u8 _red = Load<align>(red + offset);

            v128_u16 bg = (v128_u16)vec_perm(_blue, _green, K8_PERM_48);
            v128_u16 ra = (v128_u16)vec_perm(_red, alpha, K8_PERM_48);

            Store<align, first>(bgra, (v128_u8)UnpackLoU16(ra, bg));
            Store<align, false>(bgra, (v128_u8)UnpackHiU16(ra, bg));
        }
Example #19
static void unaligned_store(vector signed short value, int off, int16_t *dst)
{
    register vector unsigned char align = vec_lvsr(0, dst),
                                  mask = vec_lvsl(0, dst);
    register vector signed short t0,t1, edges;

    t0 = vec_ld(0+off, dst);
    t1 = vec_ld(15+off, dst);
    edges = vec_perm(t1 ,t0, mask);
    t1 = vec_perm(value, edges, align);
    t0 = vec_perm(edges, value, align);
    vec_st(t1, 15+off, dst);
    vec_st(t0, 0+off, dst);
}
Example #20
void v_load_deinterleave_f32(float *ptr, vector float* a, vector float* b, vector float* c)
{
    vector float v1 = vec_xl( 0, ptr);
    vector float v2 = vec_xl(16, ptr);
    vector float v3 = vec_xl(32, ptr);

    static const vector unsigned char flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31};
    *a = vec_perm(v1, vec_sld(v3, v2, 8), flp);

    static const vector unsigned char flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19};
    *b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);

    *c = vec_perm(vec_sld(v2, v1, 8), v3, flp);
}
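And the matching scalar sketch for the deinterleaving load:

/* Scalar reference (a sketch) */
void v_load_deinterleave_f32_c(const float *ptr, float a[4], float b[4], float c[4])
{
    int i;
    for (i = 0; i < 4; i++) {
        a[i] = ptr[3*i + 0];
        b[i] = ptr[3*i + 1];
        c[i] = ptr[3*i + 2];
    }
}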
Example #21
/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;
    
    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    
    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }
    
    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);
    
    return s;
}
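The scalar form of the same metric, for comparison (a sketch):

/* Scalar reference (a sketch) */
static int sse8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i, j, s = 0;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}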
Example #22
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    vec_s32 v_dc32;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    v_dc32 = vec_lde(0, &dc);
    dc16 = VEC_SPLAT16((vec_s16)v_dc32, 1);

    if (size == 4)
        dc16 = VEC_SLD16(dc16, zero_s16v, 8);
    dcplus = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
#if !HAVE_BIGENDIAN
    aligner = vec_perm(aligner, zero_u8v, vcswapc());
#endif
    dcplus = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst+0*stride);
        v1 = vec_ld(0, dst+1*stride);
        v2 = vec_ld(0, dst+2*stride);
        v3 = vec_ld(0, dst+3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst+0*stride);
        vec_st(v1, 0, dst+1*stride);
        vec_st(v2, 0, dst+2*stride);
        vec_st(v3, 0, dst+3*stride);

        dst += 4*stride;
    }
}
Example #23
static inline void abcd2cbad_internal( register const vector unsigned char p, unsigned char *data, unsigned int length, unsigned char *newdata ) {
	register vector unsigned char d0,d1,d2,z;
	z = vec_splat_u8(0);
	
	length = eround16(length);
	
	if( length >= 3 ) {
		length -= 3;
		
		d2 = vec_ld(32,data);
		d1 = vec_ld(16,data);
		d0 = vec_ld(0,data);
		
		while( length >= 3 ) {
			d0 = vec_perm(d0,z,p);
			d1 = vec_perm(d1,z,p);
			d2 = vec_perm(d2,z,p);
			
			vec_st(d0,0,newdata);
			vec_st(d1,16,newdata);
			vec_st(d2,32,newdata);
			
			length -= 3;
			data += 16*3;
			newdata += 16*3;
			
			d2 = vec_ld(32,data);
			d1 = vec_ld(16,data);
			d0 = vec_ld(0,data);
		}
		d0 = vec_perm(d0,z,p);
		d1 = vec_perm(d1,z,p);
		d2 = vec_perm(d2,z,p);
		
		vec_st(d0,0,newdata);
		vec_st(d1,16,newdata);
		vec_st(d2,32,newdata);
		
		data += 16*3;
		newdata += 16*3;
	}
	
	if( length == 2 ) {
		d0 = vec_ld(0,data);
		d1 = vec_ld(16,data);
		
		d0 = vec_perm(d0,z,p);
		d1 = vec_perm(d1,z,p);
		
		vec_st(d0,0,newdata);
		vec_st(d1,16,newdata);
	} else if( length == 1 ) {
		d0 = vec_ld(0,data);
		d0 = vec_perm(d0,z,p);
		vec_st(d0,0,newdata);
	}
}
Example #24
        SIMD_INLINE void BgrToGray(const Loader<align> & bgr, Storer<align> & gray)
        {
            v128_u8 _bgr[3];
            _bgr[0] = Load<align, first>(bgr);
            _bgr[1] = Load<align, false>(bgr);
            _bgr[2] = Load<align, false>(bgr);

            const v128_u16 lo = vec_packsu(
                BgraToGray32(vec_perm(_bgr[0], _bgr[1], K8_PERM_0)), 
                BgraToGray32(vec_perm(_bgr[0], _bgr[1], K8_PERM_1)));
            const v128_u16 hi = vec_packsu(
                BgraToGray32(vec_perm(_bgr[1], _bgr[2], K8_PERM_2)), 
                BgraToGray32(vec_perm(_bgr[1], _bgr[2], K8_PERM_3)));
            Store<align, first>(gray, vec_packsu(lo, hi));
        }
Example #25
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
Example #26
static int32_t scalarproduct_int16_altivec(const int16_t * v1, const int16_t * v2, int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    shifts = zero_u32v;
    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t = vec_sr(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
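A scalar sketch of the contract; note the vector code shifts partial sums of pairs of products rather than each product, so the low bits can differ:

/* Scalar reference (a sketch) */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order, const int shift)
{
    int32_t res = 0;
    while (order--)
        res += (*v1++ * *v2++) >> shift;
    return res;
}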
Example #27
static void _twin_fbdev_vec_put_span (twin_coord_t    left,
				      twin_coord_t    top,
				      twin_coord_t    right,
				      twin_argb32_t   *pixels,
				      void     	      *closure)
{
	twin_fbdev_t    	*tf = closure;
	twin_coord_t    	width = right - left;
	unsigned int		*dest;
	vector unsigned char 	edgeperm;
	vector unsigned char	src0v, src1v, srcv;

	if (!tf->active || tf->fb_base == MAP_FAILED)
		return;

	dest = (unsigned int *)(tf->fb_ptr + top * tf->fb_fix.line_length);
	dest += left;

	while((((unsigned long)dest) & 0xf) && width--)
		*(dest++) = *(pixels++);

	edgeperm = vec_lvsl (0, pixels);
	src0v = vec_ld (0, pixels);
	while(width >= 4) {
		src1v = vec_ld (16, pixels);
		srcv = vec_perm (src0v, src1v, edgeperm);
		vec_st ((vector unsigned int)srcv, 0, dest);
		src0v = src1v;
		dest += 4;
		pixels += 4;
		width -= 4;
	}
	while(width--)
		*(dest++) = *(pixels++);
}
Example #28
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;
    
    sv = (vector unsigned int)vec_splat_u32(0);
    
    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
Example #29
void x264_sub8x8_dct_dc_altivec( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
{
    vec_s16_t diff[2];
    vec_s32_t sum[2];
    vec_s32_t zero32 = vec_splat_s32(0);
    vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };

    pix_diff( &pix1[0], &pix2[0], diff, 0 );
    pix_diff( &pix1[4*FENC_STRIDE], &pix2[4*FDEC_STRIDE], diff, 1 );

    sum[0] = vec_sum4s( diff[0], zero32 );
    sum[1] = vec_sum4s( diff[1], zero32 );
    diff[0] = vec_packs( sum[0], sum[1] );
    sum[0] = vec_sum4s( diff[0], zero32 );
    diff[0] = vec_packs( sum[0], zero32 );

    diff[1] = vec_vsx_ld( 0, dct );
    diff[0] = vec_perm( diff[0], diff[1], mask );

    vec_vsx_st( diff[0], 0, dct );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    dct[0] = d0 + d1;
    dct[1] = d0 - d1;
    dct[2] = d2 + d3;
    dct[3] = d2 - d3;
}
Example #30
static int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
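And the scalar equivalent, for reference (a sketch):

/* Scalar reference (a sketch): sum of a 16x16 block */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int i, j, s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j];
        pix += line_size;
    }
    return s;
}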