Ejemplo n.º 1
0
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;
    
    sv = (vector unsigned int)vec_splat_u32(0);
    
    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
Ejemplo n.º 2
0
static int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Ejemplo n.º 3
0
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
Ejemplo n.º 4
0
void rgbaint_t::blend(const rgbaint_t& other, UINT8 factor)
{
	const VECU32 shift = vec_splat_u32(-16);
	const VECS32 scale1 = { factor, factor, factor, factor };
	const VECS32 scale2 = { 0x100 - factor, 0x100 - factor, 0x100 - factor, 0x100 - factor, };

	VECU32 temp = vec_msum((VECU16)m_value, (VECU16)vec_rl(scale1, shift), vec_splat_u32(0));
	temp = vec_msum((VECU16)other.m_value, (VECU16)vec_rl(scale2, shift), temp);

	m_value = vec_msum((VECU16)m_value, (VECU16)scale1, vec_mulo((VECU16)other.m_value, (VECU16)scale2));
	m_value = vec_add(vec_sl(temp, shift), (VECU32)m_value);
	sra(8);
}
Ejemplo n.º 5
0
/**
 * Sum of Squared Errors for a 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;
    
    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    
    for(i=0;i<8;i++) {
	/* Read potentially unaligned pixels into t1 and t2
	   Since we're reading 16 pixels, and actually only want 8,
	   mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */
        
	/* Calculate abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);
        
        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);
        
        pix1 += line_size;
        pix2 += line_size;
    }
    
    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);
    
    return s;
}
Ejemplo n.º 6
0
static inline
vec_uint4 GENBX(vec_uint4 a, vec_uint4 b, vec_uint4 c)
{
  return vec_and(vec_or(vec_cmpgt(a, b),
			vec_and(vec_cmpeq(a, b), c)),
		 vec_splat_u32(1));
}
Ejemplo n.º 7
0
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell M(i,k) */
  int     r     = (k-1) / Q;
  vector float *tp = om->tfv + 7*q;    	/* *tp now at start of transitions to cur cell M(i,k) */
  vector float  xBv;
  vector float  zerov;
  vector float  mpv, dpv, ipv;
  union { vector float v; float p[4]; } u;
  float   path[4];
  int     state[4] = { p7T_B, p7T_M, p7T_I, p7T_D };
  
  xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
    dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
    ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
  } else {
    mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
    dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
    ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
  }	  
  
  u.v = vec_madd(xBv, *tp, zerov); tp++;  path[0] = u.p[r];
  u.v = vec_madd(mpv, *tp, zerov); tp++;  path[1] = u.p[r];
  u.v = vec_madd(ipv, *tp, zerov); tp++;  path[2] = u.p[r];
  u.v = vec_madd(dpv, *tp, zerov);        path[3] = u.p[r];
  esl_vec_FNorm(path, 4);
  return state[esl_rnd_FChoose(rng, path, 4)];
}
Ejemplo n.º 8
0
/* Using FChoose() here would mean allocating tmp space for 2M-1 paths;
 * instead we use the fact that E(i) is itself the necessary normalization
 * factor, and implement FChoose's algorithm here for an on-the-fly 
 * calculation.
 */
static inline int
select_e(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int *ret_k)
{
  int          Q     = p7O_NQF(ox->M);
  double       sum   = 0.0;
  double       roll  = esl_random(rng);
  double       norm  = 1.0 / ox->xmx[i*p7X_NXCELLS+p7X_E];   /* all M, D already scaled exactly the same */
  vector float xEv   = esl_vmx_set_float(norm);
  vector float zerov = (vector float) vec_splat_u32(0);
  union { vector float v; float p[4]; } u;
  int    q,r;

  while (1) {
    for (q = 0; q < Q; q++)
      {
	u.v = vec_madd(ox->dpf[i][q*3 + p7X_M], xEv, zerov);
	for (r = 0; r < 4; r++) {
	  sum += u.p[r];
	  if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_M;}
	}

	u.v = vec_madd(ox->dpf[i][q*3 + p7X_D], xEv, zerov);
	for (r = 0; r < 4; r++) {
	  sum += u.p[r];
	  if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_D;}
	}
      }
    ESL_DASSERT1(sum > 0.99);
  }
  /*UNREACHED*/
  ESL_EXCEPTION(-1, "unreached code was reached. universe collapses.");
} 
Ejemplo n.º 9
0
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell M(i,k) */
  int     r     = (k-1) / Q;
  vector float *tp    = om->tfv + 7*q;       	/* *tp now at start of transitions to cur cell M(i,k) */
  vector float  xBv;
  vector float  zerov;
  vector float  mpv, dpv, ipv;
  union { vector float v; float p[4]; } u, tv;
  float   path[4];
  int     state[4] = { p7T_M, p7T_I, p7T_D, p7T_B };
  
  xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
    dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
    ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
  } else {
    mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
    dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
    ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
  }	  

  /* paths are numbered so that most desirable choice in case of tie is first. */
  u.v = xBv;  tv.v = *tp;  path[3] = ((tv.p[r] == 0.0) ?  -eslINFINITY : u.p[r]);  tp++;
  u.v = mpv;  tv.v = *tp;  path[0] = ((tv.p[r] == 0.0) ?  -eslINFINITY : u.p[r]);  tp++;
  u.v = ipv;  tv.v = *tp;  path[1] = ((tv.p[r] == 0.0) ?  -eslINFINITY : u.p[r]);  tp++;
  u.v = dpv;  tv.v = *tp;  path[2] = ((tv.p[r] == 0.0) ?  -eslINFINITY : u.p[r]);  
  return state[esl_vec_FArgMax(path, 4)];
}
Ejemplo n.º 10
0
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len)
{
    vector float zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i,j;

    dst += len;
    win += len;
    src0+= len;

    zero = (vector float)vec_splat_u32(0);

    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, zero);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, zero);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}
Ejemplo n.º 11
0
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell D(i,k) */
  int     r     = (k-1) / Q;
  vector float zerov;
  union { vector float v; float p[4]; } mpv, dpv, tmdv, tddv;
  float   path[2];

  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv.v  = ox->dpf[i][(q-1)*3 + p7X_M];
    dpv.v  = ox->dpf[i][(q-1)*3 + p7X_D];
    tmdv.v = om->tfv[7*(q-1) + p7O_MD];
    tddv.v = om->tfv[7*Q + (q-1)];
  } else {
    mpv.v  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
    dpv.v  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
    tmdv.v = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD],   12);
    tddv.v = vec_sld(zerov, om->tfv[8*Q-1],              12);
  }	  

  path[0] = ((tmdv.p[r] == 0.0) ? -eslINFINITY : mpv.p[r]);
  path[1] = ((tddv.p[r] == 0.0) ? -eslINFINITY : dpv.p[r]);
  return  ((path[0] >= path[1]) ? p7T_M : p7T_D);
}
Ejemplo n.º 12
0
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell D(i,k) */
  int     r     = (k-1) / Q;
  vector float  zerov;
  vector float  mpv, dpv;
  vector float  tmdv, tddv;
  union { vector float v; float p[4]; } u;
  float   path[2];
  int     state[2] = { p7T_M, p7T_D };

  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv  = ox->dpf[i][(q-1)*3 + p7X_M];
    dpv  = ox->dpf[i][(q-1)*3 + p7X_D];
    tmdv = om->tfv[7*(q-1) + p7O_MD];
    tddv = om->tfv[7*Q + (q-1)];
  } else {
    mpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
    dpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
    tmdv = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD],   12);
    tddv = vec_sld(zerov, om->tfv[8*Q-1],              12);
  }	  

  u.v = vec_madd(mpv, tmdv, zerov); path[0] = u.p[r];
  u.v = vec_madd(dpv, tddv, zerov); path[1] = u.p[r];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Ejemplo n.º 13
0
static void vector_fmul_reverse_altivec(float *dst, const float *src0,
                                        const float *src1, int len)
{
    int i;
    vector float d, s0, s1, h0, l0,
                 s2, s3, zero = (vector float)vec_splat_u32(0);
    src1 += len-4;
    for(i=0; i<len-7; i+=8) {
        s1 = vec_ld(0, src1-i);              // [a,b,c,d]
        s0 = vec_ld(0, src0+i);
        l0 = vec_mergel(s1, s1);             // [c,c,d,d]
        s3 = vec_ld(-16, src1-i);
        h0 = vec_mergeh(s1, s1);             // [a,a,b,b]
        s2 = vec_ld(16, src0+i);
        s1 = vec_mergeh(vec_mergel(l0,h0),   // [d,b,d,b]
                        vec_mergeh(l0,h0));  // [c,a,c,a]
                                             // [d,c,b,a]
        l0 = vec_mergel(s3, s3);
        d = vec_madd(s0, s1, zero);
        h0 = vec_mergeh(s3, s3);
        vec_st(d, 0, dst+i);
        s3 = vec_mergeh(vec_mergel(l0,h0),
                        vec_mergeh(l0,h0));
        d = vec_madd(s2, s3, zero);
        vec_st(d, 16, dst+i);
    }
}
Ejemplo n.º 14
0
static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
                           int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 3;
    const int add = (1 << (shift - 1));
    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
    const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
    const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
    vector uint32_t v, v2;
    vector uint16_t vd;
    int i;

    yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);

    for (i = dst_u; i < dstW - 7; i += 8) {
        v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
        v = vec_add(v, vadd);
        v = vec_sr(v, vshift);

        v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
        v2 = vec_add(v2, vadd);
        v2 = vec_sr(v2, vshift);

        vd = vec_packsu(v, v2);
        vd = vec_rl(vd, vswap);

        vec_st(vd, 0, &dest[i]);
    }

    yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
}
Ejemplo n.º 15
0
uint32_t
sad8_altivec_c(const uint8_t * cur,
	   const uint8_t *ref,
	   const uint32_t stride)
{
	uint32_t result = 0;
	
	register vector unsigned int sad;
	register vector unsigned char c;
	register vector unsigned char r;
	
	/* initialize */
	sad = vec_splat_u32(0);
	
	/* Perform sad operations */
	SAD8();
	SAD8();
	SAD8();
	SAD8();
	
	SAD8();
	SAD8();
	SAD8();
	SAD8();
	
	/* finish addition, add the first 2 together */
	sad = vec_and(sad, (vector unsigned int)vec_pack(vec_splat_u16(-1),vec_splat_u16(0)));
	sad = (vector unsigned int)vec_sums((vector signed int)sad, vec_splat_s32(0));
	sad = vec_splat(sad,3);
	vec_ste(sad, 0, &result);
		
	return result;
}
Ejemplo n.º 16
0
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
    
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        
        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
        
    }
    
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;    
}
Ejemplo n.º 17
0
void rgbaint_t::blend(const rgbaint_t& other, UINT8 factor)
{
	const VECU32 shift = vec_splat_u32(-16);
	const VECS32 scale1 = { factor, factor, factor, factor };
	const VECS32 scale2 = { 0x100 - factor, 0x100 - factor, 0x100 - factor, 0x100 - factor, };

	VECU32 temp = vec_msum(VECU16(m_value), VECU16(vec_rl(scale1, shift)), vec_splat_u32(0));
	temp = vec_msum(VECU16(other.m_value), VECU16(vec_rl(scale2, shift)), temp);

#if defined __LITTLE_ENDIAN__
	m_value = VECS32(vec_msum(VECU16(m_value), VECU16(scale1), vec_mule(VECU16(other.m_value), VECU16(scale2))));
#else
	m_value = VECS32(vec_msum(VECU16(m_value), VECU16(scale1), vec_mulo(VECU16(other.m_value), VECU16(scale2))));
#endif
	m_value = VECS32(vec_add(vec_sl(temp, shift), VECU32(m_value)));
	sra_imm(8);
}
Ejemplo n.º 18
0
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<8;i++) {
	/* Read potentially unaligned pixels into t1 and t2
	   Since we're reading 16 pixels, and actually only want 8,
	   mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

	/* Calculate a sum of abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

	/* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Ejemplo n.º 19
0
uint32_t
sad16_altivec_c(vector unsigned char *cur,
			  vector unsigned char *ref,
			  uint32_t stride,
			  const uint32_t best_sad)
{
	vector unsigned char perm;
	vector unsigned char t1, t2;
	vector unsigned int sad;
	vector unsigned int sumdiffs;
	vector unsigned int best_vec;
	uint32_t result;

        
#ifdef DEBUG
        /* print alignment errors if DEBUG is on */
	if (((unsigned long) cur) & 0xf)
		fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (long)cur);
	if (stride & 0xf)
		fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", stride);
#endif
	/* initialization */
	sad = vec_splat_u32(0);
	sumdiffs = sad;
	stride >>= 4;
	perm = vec_lvsl(0, (unsigned char *) ref);
	*((uint32_t*)&best_vec) = best_sad;
	best_vec = vec_splat(best_vec, 0);

	/* perform sum of differences between current and previous */
	SAD16();
	SAD16();
	SAD16();
	SAD16();

	SAD16();
	SAD16();
	SAD16();
	SAD16();

	SAD16();
	SAD16();
	SAD16();
	SAD16();
        
	SAD16();
	SAD16();
	SAD16();
	SAD16();

  bail:
	/* copy vector sum into unaligned result */
	sumdiffs = vec_splat(sumdiffs, 3);
	vec_ste(sumdiffs, 0, (uint32_t*) &result);
	return result;
}
Ejemplo n.º 20
0
uint32_t
dequant_mpeg_intra_altivec_c(int16_t * data,
					 const int16_t * coeff,
					 const uint32_t quant,
					 const uint32_t dcscalar,
					 const uint16_t * mpeg_quant_matrices)
{
	register const uint16_t *intra_matrix = get_intra_matrix(mpeg_quant_matrices);
	register const int16_t *coeff_ptr = coeff;
	register int16_t *data_ptr = data;
	
	register vec_sint16_t ox00;
	register vec_sint16_t level;
	register vec_sint16_t vec_2048;
	register vec_uint16_t vintra;
	register vec_uint32_t swap;
	register vec_uint32_t even,odd;
	register vec_uint32_t et,ot,t;
	
	vec_uint32_t vquant;
	vector bool short zero_less;
	vector bool short overflow;
	
#ifdef DEBUG
	if((long)data & 0xf)
		fprintf(stderr, "xvidcore: error in dequant_mpeg_intra_altivec_c, incorrect align: %x\n", data);
#endif

	/* Initialize */
	ox00 = vec_splat_s16(0);
	*((uint32_t*)&vquant) = quant;
	vquant = vec_splat(vquant,0);
	
	swap = vec_rl(vquant, vec_splat_u32(-16));
	vec_2048 = (vec_sint16_t)vec_rl(vec_splat_u16(8),vec_splat_u16(8));
	
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	DEQUANT_MPEG_INTRA();
	
	/* Process the first */
	data[0] = coeff[0] * dcscalar;
	if (data[0] < -2048) {
		data[0] = -2048;
	} else if (data[0] > 2047) {
		data[0] = 2047;
	}
		
	return 0;
}
Ejemplo n.º 21
0
void foo (void) 
{
  vector bool int boolVec1 = (vector bool int) vec_splat_u32(3);
  vector bool short boolVec2 = (vector bool short) vec_splat_u16(3);
  vector bool char boolVec3 = (vector bool char) vec_splat_u8(3);

  boolVec1 = vec_sld( boolVec1, boolVec1, 4 );
  boolVec2 = vec_sld( boolVec2, boolVec2, 2 );
  boolVec3 = vec_sld( boolVec3, boolVec3, 1 );
}
Ejemplo n.º 22
0
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
{
    register int i, j;
    DECLARE_ALIGNED(16, int, val)[16];
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        vector signed short l1, vLumFilter = vec_ld(j << 1, filter);
        vector unsigned char perm, perm0 = vec_lvsl(j << 1, filter);
        vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
        vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter

        perm = vec_lvsl(x << 1, src[j]);
        l1   = vec_ld(x << 1, src[j]);

        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
    }

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    vec_st(vf, 0, dest);
}
Ejemplo n.º 23
0
void
evas_common_cpu_altivec_test(void)
{
#ifdef __POWERPC__
#ifdef __VEC__
    vector unsigned int zero;

    zero = vec_splat_u32(0);
#endif /* __VEC__ */
#endif /* __POWERPC__ */
}
Ejemplo n.º 24
0
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
{
    register int i, j;
    LOCAL_ALIGNED(16, int, val, [16]);
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        unsigned int joffset=j<<1;
        unsigned int xoffset=x<<1;
        vector unsigned char perm;
        vector signed short l1,vLumFilter;
        LOAD_FILTER(vLumFilter,filter);
        vLumFilter = vec_splat(vLumFilter, 0);
        LOAD_L1(l1,src[j],perm);
        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
    }

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    VEC_ST(vf, 0, dest);
}
Ejemplo n.º 25
0
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    
    sad = (vector unsigned int)vec_splat_u32(0);


    for(i=0;i<16;i++) {
	/* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);
       
	/* Calculate a sum of abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);
	
	/* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    
    return s;
}
Ejemplo n.º 26
0
static inline void
altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW)
{
    register int i;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));
    if ((uintptr_t)dest % 16) {
        /* badly aligned store, we force store alignment */
        /* and will handle load misalignment on val w/ vec_perm */
        vector unsigned char perm1;
        vector signed int v1;
        for (i = 0 ; (i < dstW) &&
            (((uintptr_t)dest + i) % 16) ; i++) {
                int t = val[i] >> 19;
                dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
        }
        perm1 = vec_lvsl(i << 2, val);
        v1 = vec_ld(i << 2, val);
        for ( ; i < (dstW - 15); i+=16) {
            int offset = i << 2;
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_ld(offset + 64, val);
            vector signed int v12 = vec_perm(v1, v2, perm1);
            vector signed int v23 = vec_perm(v2, v3, perm1);
            vector signed int v34 = vec_perm(v3, v4, perm1);
            vector signed int v45 = vec_perm(v4, v5, perm1);

            vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
            vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
            vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
            vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
            vector unsigned short vs1 = vec_packsu(vA, vB);
            vector unsigned short vs2 = vec_packsu(vC, vD);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
            v1 = v5;
        }
    } else { // dest is properly aligned, great
Ejemplo n.º 27
0
static int32_t scalarproduct_int16_altivec(const int16_t * v1, const int16_t * v2, int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    shifts = zero_u32v;
    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t = vec_sr(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
Ejemplo n.º 28
0
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);


    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Ejemplo n.º 29
0
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            intptr_t blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}
Ejemplo n.º 30
0
static void vector_fmul_altivec(float *dst, const float *src, int len)
{
    int i;
    vector float d0, d1, s, zero = (vector float)vec_splat_u32(0);
    for(i=0; i<len-7; i+=8) {
        d0 = vec_ld(0, dst+i);
        s = vec_ld(0, src+i);
        d1 = vec_ld(16, dst+i);
        d0 = vec_madd(d0, s, zero);
        d1 = vec_madd(d1, vec_ld(16,src+i), zero);
        vec_st(d0, 0, dst+i);
        vec_st(d1, 16, dst+i);
    }
}